Files
crawler/feed.go
2026-02-04 21:00:23 -05:00

299 lines
7.7 KiB
Go

package main
import (
	"errors"
	"strings"
	"time"

	"github.com/jackc/pgx/v5"
)
// isSpam reports whether a feed should be marked IGNORE based on
// heuristic spam patterns in its host, declared language, and feed type.
func isSpam(host, language, feedType string) bool {
	// Never flag our own domain or any of its subdomains.
	if host == "1440.news" || strings.HasSuffix(host, ".1440.news") {
		return false
	}
	// A bare TLD (no dot anywhere) is never a legitimate feed host.
	if !strings.Contains(host, ".") {
		return true
	}
	// Hosts beginning with a digit match a common spam-domain pattern.
	if len(host) > 0 && host[0] >= '0' && host[0] <= '9' {
		return true
	}
	// Single ASCII letter followed by a dash (e.g. "a-example.com").
	if len(host) > 1 && ((host[0] >= 'a' && host[0] <= 'z') || (host[0] >= 'A' && host[0] <= 'Z')) && host[1] == '-' {
		return true
	}
	// Require an English language tag. BCP 47 tags are case-insensitive,
	// so accept "EN", "en-US", "En-GB", etc. An empty tag is rejected.
	lang := strings.ToLower(language)
	if lang != "en" && !strings.HasPrefix(lang, "en-") {
		return true
	}
	// Only recognized feed types pass; anything else is treated as spam.
	switch feedType {
	case "rss", "atom", "json":
		return false
	default:
		return true
	}
}
// Feed represents a discovered RSS/Atom feed with metadata.
type Feed struct {
	URL         string `json:"url"`
	Type        string `json:"type"` // "rss", "atom", or "unknown"
	Title       string `json:"title,omitempty"`
	Description string `json:"description,omitempty"`
	Language    string `json:"language,omitempty"` // language tag from the feed, e.g. "en" / "en-US"
	// Timing
	LastCheckedAt time.Time `json:"last_checked_at,omitempty"` // when the feed was last checked
	// Cache headers stored so the next fetch can make a conditional
	// request (If-None-Match / If-Modified-Since).
	ETag         string `json:"etag,omitempty"`
	LastModified string `json:"last_modified,omitempty"`
	// Status is one of PUBLISH, STANDBY, IGNORE; empty values are
	// normalized to STANDBY when rows are loaded from the database.
	Status    string `json:"status"`
	LastError string `json:"last_error,omitempty"` // last check error, if any
	// MissCount counts consecutive checks with no new items (or
	// failures); it lengthens the re-check backoff in GetFeedsDueForCheck.
	MissCount int `json:"miss_count"`
}
// saveFeed stores a feed in PostgreSQL as an upsert keyed on url:
// a new URL inserts a row, an existing URL updates its metadata.
//
// NOTE(review): the ON CONFLICT clause omits the status column, so
// re-saving a feed never overwrites a status set via SetStatus —
// presumably intentional; confirm.
// NOTE(review): feed.URL is stored as-is, while the read paths
// (getFeed, feedExists, SetStatus) look up via normalizeURL —
// presumably callers save already-normalized URLs; verify.
func (c *Crawler) saveFeed(feed *Feed) error {
	_, err := c.db.Exec(`
INSERT INTO feeds (
url, type, title, description, language,
last_checked_at,
etag, last_modified,
status, last_error,
miss_count
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
ON CONFLICT(url) DO UPDATE SET
type = EXCLUDED.type,
title = EXCLUDED.title,
description = EXCLUDED.description,
language = EXCLUDED.language,
last_checked_at = EXCLUDED.last_checked_at,
etag = EXCLUDED.etag,
last_modified = EXCLUDED.last_modified,
last_error = EXCLUDED.last_error,
miss_count = EXCLUDED.miss_count
`,
		// Empty optional strings / zero times become SQL NULL via the
		// Nullable* helpers.
		feed.URL, feed.Type, NullableString(feed.Title), NullableString(feed.Description),
		NullableString(feed.Language),
		NullableTime(feed.LastCheckedAt),
		NullableString(feed.ETag), NullableString(feed.LastModified),
		feed.Status, NullableString(feed.LastError),
		feed.MissCount,
	)
	return err
}
// getFeed retrieves a single feed by URL from PostgreSQL. The lookup
// key is normalized with normalizeURL, matching feedExists and
// SetStatus. Returns (nil, nil) when no matching row exists.
func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
	feed := &Feed{}
	// Every column that may be NULL is scanned into a pointer and
	// converted below — the same handling scanFeeds uses. The type
	// column is included here too, consistent with scanFeeds (scanning
	// a NULL type straight into a string would make Scan fail).
	var feedType, title, description, language *string
	var lastCheckedAt *time.Time
	var etag, lastModified, lastError *string
	var status *string
	var missCount *int
	err := c.db.QueryRow(`
SELECT url, type, title, description, language,
last_checked_at,
etag, last_modified,
status, last_error,
miss_count
FROM feeds WHERE url = $1
`, normalizeURL(feedURL)).Scan(
		&feed.URL, &feedType, &title, &description, &language,
		&lastCheckedAt,
		&etag, &lastModified,
		&status, &lastError,
		&missCount,
	)
	// errors.Is also matches a wrapped no-rows error, not just the
	// bare sentinel.
	if errors.Is(err, pgx.ErrNoRows) {
		return nil, nil
	}
	if err != nil {
		return nil, err
	}
	// Convert nullable columns to zero-value defaults.
	feed.Type = StringValue(feedType)
	feed.Title = StringValue(title)
	feed.Description = StringValue(description)
	feed.Language = StringValue(language)
	feed.LastCheckedAt = TimeValue(lastCheckedAt)
	feed.ETag = StringValue(etag)
	feed.LastModified = StringValue(lastModified)
	feed.Status = StringValue(status)
	if feed.Status == "" {
		// Rows with no status default to STANDBY, as in scanFeeds.
		feed.Status = "STANDBY"
	}
	feed.LastError = StringValue(lastError)
	if missCount != nil {
		feed.MissCount = *missCount
	}
	return feed, nil
}
// feedExists reports whether the (normalized) feed URL is already
// present in the feeds table. Query errors are treated as "not found".
func (c *Crawler) feedExists(feedURL string) bool {
	var found bool
	if err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM feeds WHERE url = $1)", normalizeURL(feedURL)).Scan(&found); err != nil {
		return false
	}
	return found
}
// GetFeedCount returns the total number of feeds in the database.
func (c *Crawler) GetFeedCount() (int, error) {
	var total int
	if err := c.db.QueryRow("SELECT COUNT(*) FROM feeds").Scan(&total); err != nil {
		return 0, err
	}
	return total, nil
}
// GetFeedsDueForCheck returns up to limit feeds that are due for a
// re-check, never-checked feeds first, then oldest-checked first.
//
// A feed is due when it has never been checked, or when its linear
// backoff window has elapsed:
//
//	NOW() > last_checked_at + 60s * (1 + miss_count)
//
// i.e. a 60s base interval plus 60s for each consecutive miss.
// Feeds with status IGNORE are never returned.
func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
	rows, err := c.db.Query(`
SELECT url, type, title, description, language,
last_checked_at,
etag, last_modified,
status, last_error,
miss_count
FROM feeds
WHERE status IN ('PUBLISH', 'STANDBY')
AND (last_checked_at IS NULL
OR NOW() > last_checked_at + interval '1 second' * (60 + 60 * miss_count))
ORDER BY last_checked_at ASC NULLS FIRST
LIMIT $1
`, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	return scanFeeds(rows)
}
// GetFeedsByHost returns all feeds whose URL contains the given host
// followed by a "/" (case-insensitive substring match on the URL).
//
// NOTE(review): the pattern also matches URLs whose host merely ends
// with the given host (e.g. "evil-example.com" for "example.com");
// tighten the match if exact-host filtering is required.
func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
	// Escape LIKE metacharacters so a host containing '%', '_' or '\'
	// cannot widen the pattern (backslash is Postgres' default LIKE
	// escape character).
	escaped := strings.NewReplacer(`\`, `\\`, `%`, `\%`, `_`, `\_`).Replace(host)
	pattern := "%" + escaped + "/%"
	rows, err := c.db.Query(`
SELECT url, type, title, description, language,
last_checked_at,
etag, last_modified,
status, last_error,
miss_count
FROM feeds WHERE LOWER(url) LIKE LOWER($1)
`, pattern)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	return scanFeeds(rows)
}
// SearchFeeds performs a full-text search over the feeds table's
// search_vector column, returning matches ordered by relevance
// (ts_rank, best first). ToSearchQuery converts the raw user query
// into to_tsquery syntax before it is passed as a parameter.
//
// NOTE(review): the result set is unbounded — consider adding a LIMIT
// if the table grows large.
func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) {
	tsquery := ToSearchQuery(query)
	rows, err := c.db.Query(`
SELECT url, type, title, description, language,
last_checked_at,
etag, last_modified,
status, last_error,
miss_count
FROM feeds
WHERE search_vector @@ to_tsquery('english', $1)
ORDER BY ts_rank(search_vector, to_tsquery('english', $1)) DESC
`, tsquery)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	return scanFeeds(rows)
}
// scanFeeds collects Feed values from a multi-row result set,
// converting nullable columns to zero-value defaults and normalizing
// an empty status to "STANDBY".
func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
	var result []*Feed
	for rows.Next() {
		var (
			typ, ttl, desc, lang       *string
			checked                    *time.Time
			tag, modified, lastErr, st *string
			misses                     *int
		)
		f := &Feed{}
		err := rows.Scan(
			&f.URL, &typ, &ttl, &desc, &lang,
			&checked,
			&tag, &modified,
			&st, &lastErr,
			&misses,
		)
		if err != nil {
			return nil, err
		}
		// Nullable columns become their zero values when NULL.
		f.Type = StringValue(typ)
		f.Title = StringValue(ttl)
		f.Description = StringValue(desc)
		f.Language = StringValue(lang)
		f.LastCheckedAt = TimeValue(checked)
		f.ETag = StringValue(tag)
		f.LastModified = StringValue(modified)
		f.Status = StringValue(st)
		if f.Status == "" {
			// Missing status defaults to STANDBY.
			f.Status = "STANDBY"
		}
		f.LastError = StringValue(lastErr)
		if misses != nil {
			f.MissCount = *misses
		}
		result = append(result, f)
	}
	// Surface any iteration error from the driver.
	return result, rows.Err()
}
// SetStatus sets the status for a feed ('PUBLISH', 'STANDBY', 'IGNORE').
// The feed URL is normalized before the update, matching getFeed.
func (c *Crawler) SetStatus(feedURL, status string) error {
	if _, err := c.db.Exec(`UPDATE feeds SET status = $1 WHERE url = $2`, status, normalizeURL(feedURL)); err != nil {
		return err
	}
	return nil
}
// GetFeedsByStatus returns up to limit feeds having the given status,
// ordered by URL.
func (c *Crawler) GetFeedsByStatus(status string, limit int) ([]*Feed, error) {
	const query = `
SELECT url, type, title, description, language,
last_checked_at,
etag, last_modified,
status, last_error,
miss_count
FROM feeds
WHERE status = $1
ORDER BY url
LIMIT $2
`
	rows, err := c.db.Query(query, status, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	return scanFeeds(rows)
}