299 lines
7.7 KiB
Go
299 lines
7.7 KiB
Go
package main
|
|
|
|
import (
	"errors"
	"strings"
	"time"

	"github.com/jackc/pgx/v5"
)
|
|
|
|
// isSpam reports whether a feed should be marked IGNORE based on
// simple host / language / feed-type spam heuristics.
//
// A feed is considered spam when any of these hold:
//   - the host is a bare TLD (contains no dot),
//   - the host starts with a digit,
//   - the host starts with a single letter followed by a dash
//     (e.g. "a-example.com"),
//   - the language is missing or not English,
//   - the feed type is not one of rss/atom/json.
//
// Feeds on our own domain (1440.news and its subdomains) are never spam.
//
// Language tags are compared case-insensitively: BCP 47 tags such as
// "EN-US" are equivalent to "en-us", so case must not affect the verdict.
func isSpam(host, language, feedType string) bool {
	// Never spam our own domain.
	if host == "1440.news" || strings.HasSuffix(host, ".1440.news") {
		return false
	}

	// Bare TLD (no dot).
	if !strings.Contains(host, ".") {
		return true
	}

	// Domain starts with a digit.
	if host != "" && host[0] >= '0' && host[0] <= '9' {
		return true
	}

	// Domain starts with letter-dash (e.g., "a-example.com").
	if len(host) > 1 && isASCIILetter(host[0]) && host[1] == '-' {
		return true
	}

	// No language, or non-English. Fold to lower case first because
	// BCP 47 language tags are case-insensitive ("EN-US" == "en-us").
	lang := strings.ToLower(language)
	if lang == "" || (lang != "en" && !strings.HasPrefix(lang, "en-")) {
		return true
	}

	// Unknown feed type.
	switch feedType {
	case "rss", "atom", "json":
		return false
	}
	return true
}

// isASCIILetter reports whether b is an ASCII letter (a-z or A-Z).
func isASCIILetter(b byte) bool {
	return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z')
}
|
|
|
|
// Feed represents a discovered RSS/Atom feed with metadata.
//
// String/time fields left at their zero value are stored as SQL NULL by
// saveFeed (via the Nullable* helpers) and read back as zero values by
// getFeed/scanFeeds (via StringValue/TimeValue).
type Feed struct {
	URL         string `json:"url"`
	Type        string `json:"type"` // "rss", "atom", or "unknown"
	Title       string `json:"title,omitempty"`
	Description string `json:"description,omitempty"`
	Language    string `json:"language,omitempty"`

	// Timing
	LastCheckedAt time.Time `json:"last_checked_at,omitempty"` // when last checked

	// Cache headers for conditional requests (If-None-Match /
	// If-Modified-Since on the next poll).
	ETag         string `json:"etag,omitempty"`
	LastModified string `json:"last_modified,omitempty"`

	// Status: PUBLISH, STANDBY, IGNORE. Rows read back with a NULL
	// status default to STANDBY.
	Status    string `json:"status"`
	LastError string `json:"last_error,omitempty"`

	// Consecutive checks with no new items (or failures); drives the
	// linear backoff in GetFeedsDueForCheck.
	MissCount int `json:"miss_count"`
}
|
|
|
|
// saveFeed stores a feed in PostgreSQL, inserting a new row or updating
// the existing row with the same URL (upsert on the url primary key).
//
// Empty strings and zero times are written as SQL NULL via the
// Nullable* helpers so absent metadata is distinguishable from data.
//
// NOTE(review): the ON CONFLICT clause does NOT update status — new
// rows get feed.Status, but an existing row keeps its stored status.
// Presumably intentional, so a manually set PUBLISH/IGNORE survives
// re-discovery of the feed; confirm before changing.
func (c *Crawler) saveFeed(feed *Feed) error {
	_, err := c.db.Exec(`
		INSERT INTO feeds (
			url, type, title, description, language,
			last_checked_at,
			etag, last_modified,
			status, last_error,
			miss_count
		) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
		ON CONFLICT(url) DO UPDATE SET
			type = EXCLUDED.type,
			title = EXCLUDED.title,
			description = EXCLUDED.description,
			language = EXCLUDED.language,
			last_checked_at = EXCLUDED.last_checked_at,
			etag = EXCLUDED.etag,
			last_modified = EXCLUDED.last_modified,
			last_error = EXCLUDED.last_error,
			miss_count = EXCLUDED.miss_count
	`,
		feed.URL, feed.Type, NullableString(feed.Title), NullableString(feed.Description),
		NullableString(feed.Language),
		NullableTime(feed.LastCheckedAt),
		NullableString(feed.ETag), NullableString(feed.LastModified),
		feed.Status, NullableString(feed.LastError),
		feed.MissCount,
	)
	return err
}
|
|
|
|
// getFeed retrieves a feed from PostgreSQL
|
|
func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
|
|
feed := &Feed{}
|
|
var title, description, language *string
|
|
var lastCheckedAt *time.Time
|
|
var etag, lastModified, lastError *string
|
|
var status *string
|
|
var missCount *int
|
|
|
|
err := c.db.QueryRow(`
|
|
SELECT url, type, title, description, language,
|
|
last_checked_at,
|
|
etag, last_modified,
|
|
status, last_error,
|
|
miss_count
|
|
FROM feeds WHERE url = $1
|
|
`, normalizeURL(feedURL)).Scan(
|
|
&feed.URL, &feed.Type, &title, &description, &language,
|
|
&lastCheckedAt,
|
|
&etag, &lastModified,
|
|
&status, &lastError,
|
|
&missCount,
|
|
)
|
|
|
|
if err == pgx.ErrNoRows {
|
|
return nil, nil
|
|
}
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Handle nullable fields
|
|
feed.Title = StringValue(title)
|
|
feed.Description = StringValue(description)
|
|
feed.Language = StringValue(language)
|
|
feed.LastCheckedAt = TimeValue(lastCheckedAt)
|
|
feed.ETag = StringValue(etag)
|
|
feed.LastModified = StringValue(lastModified)
|
|
feed.Status = StringValue(status)
|
|
if feed.Status == "" {
|
|
feed.Status = "STANDBY"
|
|
}
|
|
feed.LastError = StringValue(lastError)
|
|
if missCount != nil {
|
|
feed.MissCount = *missCount
|
|
}
|
|
|
|
return feed, nil
|
|
}
|
|
|
|
// feedExists checks if a feed URL already exists in the database
|
|
func (c *Crawler) feedExists(feedURL string) bool {
|
|
var exists bool
|
|
err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM feeds WHERE url = $1)", normalizeURL(feedURL)).Scan(&exists)
|
|
return err == nil && exists
|
|
}
|
|
|
|
// GetFeedCount returns the total number of feeds in the database
|
|
func (c *Crawler) GetFeedCount() (int, error) {
|
|
var count int
|
|
err := c.db.QueryRow("SELECT COUNT(*) FROM feeds").Scan(&count)
|
|
return count, err
|
|
}
|
|
|
|
// GetFeedsDueForCheck returns up to limit feeds that are due for a poll.
//
// A feed is due when it has never been checked, or when its linear
// backoff window has elapsed:
//
//	NOW() > last_checked_at + 60s + (60s * miss_count)
//
// so each consecutive miss pushes the next check out by another minute.
// IGNOREd feeds are excluded; never-checked feeds sort first, then the
// longest-unchecked.
func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
	rows, err := c.db.Query(`
		SELECT url, type, title, description, language,
			last_checked_at,
			etag, last_modified,
			status, last_error,
			miss_count
		FROM feeds
		WHERE status IN ('PUBLISH', 'STANDBY')
		AND (last_checked_at IS NULL
			OR NOW() > last_checked_at + interval '1 second' * (60 + 60 * miss_count))
		ORDER BY last_checked_at ASC NULLS FIRST
		LIMIT $1
	`, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanFeeds(rows)
}
|
|
|
|
// GetFeedsByHost returns all feeds from a specific host (searches URL)
|
|
func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
|
|
pattern := "%" + host + "/%"
|
|
rows, err := c.db.Query(`
|
|
SELECT url, type, title, description, language,
|
|
last_checked_at,
|
|
etag, last_modified,
|
|
status, last_error,
|
|
miss_count
|
|
FROM feeds WHERE LOWER(url) LIKE LOWER($1)
|
|
`, pattern)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
return scanFeeds(rows)
|
|
}
|
|
|
|
// SearchFeeds performs a full-text search on feeds using the
// precomputed search_vector column, returning results ranked by
// relevance (best match first).
//
// The raw query is converted to tsquery syntax by ToSearchQuery;
// presumably that helper sanitizes/joins terms — verify it rejects
// malformed tsquery input, since to_tsquery errors on bad syntax.
func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) {
	tsquery := ToSearchQuery(query)
	rows, err := c.db.Query(`
		SELECT url, type, title, description, language,
			last_checked_at,
			etag, last_modified,
			status, last_error,
			miss_count
		FROM feeds
		WHERE search_vector @@ to_tsquery('english', $1)
		ORDER BY ts_rank(search_vector, to_tsquery('english', $1)) DESC
	`, tsquery)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanFeeds(rows)
}
|
|
|
|
// scanFeeds is a helper to scan multiple feed rows
|
|
func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
|
|
var feeds []*Feed
|
|
|
|
for rows.Next() {
|
|
feed := &Feed{}
|
|
var feedType, title, description, language *string
|
|
var lastCheckedAt *time.Time
|
|
var etag, lastModified, lastError *string
|
|
var missCount *int
|
|
var status *string
|
|
|
|
if err := rows.Scan(
|
|
&feed.URL, &feedType, &title, &description, &language,
|
|
&lastCheckedAt,
|
|
&etag, &lastModified,
|
|
&status, &lastError,
|
|
&missCount,
|
|
); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Handle nullable fields
|
|
feed.Type = StringValue(feedType)
|
|
feed.Title = StringValue(title)
|
|
feed.Description = StringValue(description)
|
|
feed.Language = StringValue(language)
|
|
feed.LastCheckedAt = TimeValue(lastCheckedAt)
|
|
feed.ETag = StringValue(etag)
|
|
feed.LastModified = StringValue(lastModified)
|
|
feed.Status = StringValue(status)
|
|
if feed.Status == "" {
|
|
feed.Status = "STANDBY"
|
|
}
|
|
feed.LastError = StringValue(lastError)
|
|
if missCount != nil {
|
|
feed.MissCount = *missCount
|
|
}
|
|
|
|
feeds = append(feeds, feed)
|
|
}
|
|
|
|
return feeds, rows.Err()
|
|
}
|
|
|
|
// SetStatus sets the status for a feed ('PUBLISH', 'STANDBY', 'IGNORE')
|
|
func (c *Crawler) SetStatus(feedURL, status string) error {
|
|
feedURL = normalizeURL(feedURL)
|
|
_, err := c.db.Exec(`UPDATE feeds SET status = $1 WHERE url = $2`, status, feedURL)
|
|
return err
|
|
}
|
|
|
|
// GetFeedsByStatus returns up to limit feeds with the given status
// ('PUBLISH', 'STANDBY', or 'IGNORE'), ordered by URL for stable,
// deterministic paging.
func (c *Crawler) GetFeedsByStatus(status string, limit int) ([]*Feed, error) {
	rows, err := c.db.Query(`
		SELECT url, type, title, description, language,
			last_checked_at,
			etag, last_modified,
			status, last_error,
			miss_count
		FROM feeds
		WHERE status = $1
		ORDER BY url
		LIMIT $2
	`, status, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanFeeds(rows)
}
|