515 lines
16 KiB
Go
515 lines
16 KiB
Go
package main
|
|
|
|
import (
	"errors"
	"net/url"
	"regexp"
	"strings"
	"time"

	"github.com/jackc/pgx/v5"
)
|
|
|
|
// feedDatePathRe matches a year/month pair such as "2024/01" at the start of a
// slash-trimmed path or after any "/" inside it. It is compiled once at package
// scope so classifyFeed does not recompile it on every call.
var feedDatePathRe = regexp.MustCompile(`(?:^|/)\d{4}/\d{2}`)

// classifyFeed determines the category of a feed based on URL patterns.
//
// It returns one of "main", "comments", "category", "author", "article" or
// "podcast". The comment and podcast checks run against the whole lower-cased
// URL (host included); the remaining checks run against the lower-cased URL
// path with any trailing slash removed. A URL that cannot be parsed, or that
// matches no rule, is classified as "main".
//
// Note: podcast detection is also done in parseRSSMetadata based on content.
func classifyFeed(feedURL string) string {
	lower := strings.ToLower(feedURL)

	// Comment feeds.
	if strings.Contains(lower, "/comment") {
		return "comments"
	}

	// Podcast URL patterns.
	podcastPatterns := []string{"/podcast", "/podcasts", "/episode", "/episodes", "/show/", "/shows/"}
	for _, pattern := range podcastPatterns {
		if strings.Contains(lower, pattern) {
			return "podcast"
		}
	}

	u, err := url.Parse(feedURL)
	if err != nil {
		return "main"
	}

	path := strings.ToLower(strings.TrimSuffix(u.Path, "/"))

	// Author feeds.
	if strings.Contains(path, "/author/") {
		return "author"
	}

	// Category/tag feeds.
	categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/"}
	for _, pattern := range categoryPatterns {
		if strings.Contains(path, pattern) {
			return "category"
		}
	}

	// Everything below only applies to paths ending in /feed.
	if !strings.HasSuffix(path, "/feed") {
		return "main"
	}

	basePath := strings.Trim(strings.TrimSuffix(path, "/feed"), "/")
	if basePath == "" {
		return "main" // Just /feed - main feed.
	}

	segments := strings.Split(basePath, "/")
	switch {
	case feedDatePathRe.MatchString(basePath):
		// Date-based permalink (e.g. 2024/01/post-name/feed). Such paths always
		// have several segments, so the next case would catch them as well; the
		// explicit rule is kept to document the intent.
		return "article"
	case len(segments) >= 2:
		// Multiple segments: nested content.
		return "article"
	case strings.Contains(segments[0], "-") && len(segments[0]) > 20:
		// A single long hyphenated segment looks like an article slug.
		return "article"
	}

	return "main"
}
|
|
|
|
// classifyFeedByTitle refines a feed's category using its title (called after
// parsing).
//
// Only feeds currently classified as "main" are reconsidered; any other
// category was already decided from the URL and is returned unchanged. A
// "main" feed whose title starts with "Comments on:" or "Comments for:"
// (case-insensitive, ignoring surrounding whitespace) becomes "comments".
func classifyFeedByTitle(title string, currentCategory string) string {
	if currentCategory != "main" {
		return currentCategory // Already classified by URL.
	}

	// Titles extracted from XML may carry leading whitespace or newlines, which
	// would otherwise defeat the prefix checks below.
	normalized := strings.ToLower(strings.TrimSpace(title))
	for _, prefix := range []string{"comments on:", "comments for:"} {
		if strings.HasPrefix(normalized, prefix) {
			return "comments"
		}
	}
	return currentCategory
}
|
|
|
|
// Feed is one discovered RSS/Atom feed together with the crawl, health and
// publishing metadata that is stored alongside it.
type Feed struct {
	// Identity and descriptive metadata.
	URL         string `json:"url"`
	Type        string `json:"type"`     // "rss", "atom", or "unknown"
	Category    string `json:"category"` // "main", "comments", "category", "author", "article", "podcast"
	Title       string `json:"title,omitempty"`
	Description string `json:"description,omitempty"`
	Language    string `json:"language,omitempty"`
	SiteURL     string `json:"site_url,omitempty"` // The website the feed belongs to

	// Crawl timing.
	DiscoveredAt  time.Time `json:"discovered_at"`
	LastCrawledAt time.Time `json:"last_crawled_at,omitempty"`
	NextCrawlAt   time.Time `json:"next_crawl_at,omitempty"`
	LastBuildDate time.Time `json:"last_build_date,omitempty"` // From feed's lastBuildDate/updated

	// HTTP cache validators used for conditional requests.
	ETag         string `json:"etag,omitempty"`
	LastModified string `json:"last_modified,omitempty"`

	// Crawl health.
	Status      string    `json:"status"` // "pass", "hold", "skip"
	LastError   string    `json:"last_error,omitempty"`
	LastErrorAt time.Time `json:"last_error_at,omitempty"`

	// Where the feed was discovered.
	SourceURL  string `json:"source_url,omitempty"`
	SourceHost string `json:"source_host,omitempty"`
	TLD        string `json:"tld,omitempty"`

	// Statistics about the feed's items.
	ItemCount      int       `json:"item_count,omitempty"` // Number of items in last crawl
	OldestItemDate time.Time `json:"oldest_item_date,omitempty"`
	NewestItemDate time.Time `json:"newest_item_date,omitempty"`

	// Input to the adaptive check interval.
	NoUpdate int `json:"no_update"` // Consecutive checks with no change

	// Publishing to PDS.
	PublishStatus  string `json:"publish_status"`            // "hold", "pass", "skip"
	PublishAccount string `json:"publish_account,omitempty"` // e.g., "news.ycombinator.com.1440.news"
}
|
|
|
|
// saveFeed stores a feed in PostgreSQL
|
|
func (c *Crawler) saveFeed(feed *Feed) error {
|
|
// Default publishStatus to "hold" if not set
|
|
// Auto-skip feeds with no language or non-English language
|
|
// Auto-pass feeds from our own domain
|
|
publishStatus := feed.PublishStatus
|
|
if publishStatus == "" {
|
|
if strings.HasSuffix(feed.SourceHost, "1440.news") || feed.SourceHost == "1440.news" {
|
|
publishStatus = "pass"
|
|
} else if feed.Language == "" || (feed.Language != "en" && !strings.HasPrefix(feed.Language, "en-")) {
|
|
publishStatus = "skip"
|
|
} else if feed.Type != "rss" && feed.Type != "atom" && feed.Type != "json" {
|
|
publishStatus = "skip"
|
|
} else {
|
|
publishStatus = "hold"
|
|
}
|
|
}
|
|
|
|
_, err := c.db.Exec(`
|
|
INSERT INTO feeds (
|
|
url, type, category, title, description, language, site_url,
|
|
discovered_at, last_crawled_at, next_crawl_at, last_build_date,
|
|
etag, last_modified,
|
|
status, last_error, last_error_at,
|
|
source_url, source_host, tld,
|
|
item_count, oldest_item_date, newest_item_date,
|
|
no_update,
|
|
publish_status, publish_account
|
|
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25)
|
|
ON CONFLICT(url) DO UPDATE SET
|
|
type = EXCLUDED.type,
|
|
category = EXCLUDED.category,
|
|
title = EXCLUDED.title,
|
|
description = EXCLUDED.description,
|
|
language = EXCLUDED.language,
|
|
site_url = EXCLUDED.site_url,
|
|
last_crawled_at = EXCLUDED.last_crawled_at,
|
|
next_crawl_at = EXCLUDED.next_crawl_at,
|
|
last_build_date = EXCLUDED.last_build_date,
|
|
etag = EXCLUDED.etag,
|
|
last_modified = EXCLUDED.last_modified,
|
|
status = EXCLUDED.status,
|
|
last_error = EXCLUDED.last_error,
|
|
last_error_at = EXCLUDED.last_error_at,
|
|
item_count = EXCLUDED.item_count,
|
|
oldest_item_date = EXCLUDED.oldest_item_date,
|
|
newest_item_date = EXCLUDED.newest_item_date,
|
|
no_update = EXCLUDED.no_update,
|
|
publish_status = EXCLUDED.publish_status,
|
|
publish_account = EXCLUDED.publish_account
|
|
`,
|
|
feed.URL, feed.Type, feed.Category, NullableString(feed.Title), NullableString(feed.Description),
|
|
NullableString(feed.Language), NullableString(feed.SiteURL),
|
|
feed.DiscoveredAt, NullableTime(feed.LastCrawledAt), NullableTime(feed.NextCrawlAt), NullableTime(feed.LastBuildDate),
|
|
NullableString(feed.ETag), NullableString(feed.LastModified),
|
|
feed.Status, NullableString(feed.LastError), NullableTime(feed.LastErrorAt),
|
|
NullableString(feed.SourceURL), NullableString(feed.SourceHost), NullableString(feed.TLD),
|
|
feed.ItemCount, NullableTime(feed.OldestItemDate), NullableTime(feed.NewestItemDate),
|
|
feed.NoUpdate,
|
|
publishStatus, NullableString(feed.PublishAccount),
|
|
)
|
|
return err
|
|
}
|
|
|
|
// getFeed retrieves a feed from PostgreSQL
|
|
func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
|
|
feed := &Feed{}
|
|
var category, title, description, language, siteURL *string
|
|
var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time
|
|
var etag, lastModified, lastError, sourceURL, sourceHost, tld *string
|
|
var publishStatus, publishAccount *string
|
|
var itemCount, noUpdate *int
|
|
|
|
err := c.db.QueryRow(`
|
|
SELECT url, type, category, title, description, language, site_url,
|
|
discovered_at, last_crawled_at, next_crawl_at, last_build_date,
|
|
etag, last_modified,
|
|
status, last_error, last_error_at,
|
|
source_url, source_host, tld,
|
|
item_count, oldest_item_date, newest_item_date,
|
|
no_update,
|
|
publish_status, publish_account
|
|
FROM feeds WHERE url = $1
|
|
`, normalizeURL(feedURL)).Scan(
|
|
&feed.URL, &feed.Type, &category, &title, &description, &language, &siteURL,
|
|
&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
|
|
&etag, &lastModified,
|
|
&feed.Status, &lastError, &lastErrorAt,
|
|
&sourceURL, &sourceHost, &tld,
|
|
&itemCount, &oldestItemDate, &newestItemDate,
|
|
&noUpdate,
|
|
&publishStatus, &publishAccount,
|
|
)
|
|
|
|
if err == pgx.ErrNoRows {
|
|
return nil, nil
|
|
}
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Handle nullable fields
|
|
if category != nil {
|
|
feed.Category = *category
|
|
} else {
|
|
feed.Category = "main"
|
|
}
|
|
feed.Title = StringValue(title)
|
|
feed.Description = StringValue(description)
|
|
feed.Language = StringValue(language)
|
|
feed.SiteURL = StringValue(siteURL)
|
|
feed.LastCrawledAt = TimeValue(lastCrawledAt)
|
|
feed.NextCrawlAt = TimeValue(nextCrawlAt)
|
|
feed.LastBuildDate = TimeValue(lastBuildDate)
|
|
feed.ETag = StringValue(etag)
|
|
feed.LastModified = StringValue(lastModified)
|
|
feed.LastError = StringValue(lastError)
|
|
feed.LastErrorAt = TimeValue(lastErrorAt)
|
|
feed.SourceURL = StringValue(sourceURL)
|
|
feed.SourceHost = StringValue(sourceHost)
|
|
feed.TLD = StringValue(tld)
|
|
if itemCount != nil {
|
|
feed.ItemCount = *itemCount
|
|
}
|
|
feed.OldestItemDate = TimeValue(oldestItemDate)
|
|
feed.NewestItemDate = TimeValue(newestItemDate)
|
|
if noUpdate != nil {
|
|
feed.NoUpdate = *noUpdate
|
|
}
|
|
if publishStatus != nil {
|
|
feed.PublishStatus = *publishStatus
|
|
} else {
|
|
feed.PublishStatus = "hold"
|
|
}
|
|
feed.PublishAccount = StringValue(publishAccount)
|
|
|
|
return feed, nil
|
|
}
|
|
|
|
// feedExists checks if a feed URL already exists in the database
|
|
func (c *Crawler) feedExists(feedURL string) bool {
|
|
var exists bool
|
|
err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM feeds WHERE url = $1)", normalizeURL(feedURL)).Scan(&exists)
|
|
return err == nil && exists
|
|
}
|
|
|
|
// GetAllFeeds returns all feeds from the database
|
|
func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
|
|
rows, err := c.db.Query(`
|
|
SELECT url, type, category, title, description, language, site_url,
|
|
discovered_at, last_crawled_at, next_crawl_at, last_build_date,
|
|
etag, last_modified,
|
|
status, last_error, last_error_at,
|
|
source_url, source_host, tld,
|
|
item_count, oldest_item_date, newest_item_date,
|
|
no_update,
|
|
publish_status, publish_account
|
|
FROM feeds
|
|
`)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
return scanFeeds(rows)
|
|
}
|
|
|
|
// GetFeedCount returns the total number of feeds in the database
|
|
func (c *Crawler) GetFeedCount() (int, error) {
|
|
var count int
|
|
err := c.db.QueryRow("SELECT COUNT(*) FROM feeds").Scan(&count)
|
|
return count, err
|
|
}
|
|
|
|
// GetFeedCountByHost returns the number of feeds for a specific host
|
|
func (c *Crawler) GetFeedCountByHost(host string) (int, error) {
|
|
var count int
|
|
err := c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE source_host = $1", host).Scan(&count)
|
|
return count, err
|
|
}
|
|
|
|
// GetFeedsDueForCheck returns feeds where next_crawl_at <= now, ordered by no_update desc (prioritize infrequent feeds)
|
|
func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
|
|
rows, err := c.db.Query(`
|
|
SELECT url, type, category, title, description, language, site_url,
|
|
discovered_at, last_crawled_at, next_crawl_at, last_build_date,
|
|
etag, last_modified,
|
|
status, last_error, last_error_at,
|
|
source_url, source_host, tld,
|
|
item_count, oldest_item_date, newest_item_date,
|
|
no_update,
|
|
publish_status, publish_account
|
|
FROM feeds
|
|
WHERE next_crawl_at <= NOW() AND status = 'pass'
|
|
ORDER BY no_update DESC
|
|
LIMIT $1
|
|
`, limit)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
return scanFeeds(rows)
|
|
}
|
|
|
|
// GetFeedsByHost returns all feeds from a specific host
|
|
func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
|
|
rows, err := c.db.Query(`
|
|
SELECT url, type, category, title, description, language, site_url,
|
|
discovered_at, last_crawled_at, next_crawl_at, last_build_date,
|
|
etag, last_modified,
|
|
status, last_error, last_error_at,
|
|
source_url, source_host, tld,
|
|
item_count, oldest_item_date, newest_item_date,
|
|
no_update,
|
|
publish_status, publish_account
|
|
FROM feeds WHERE source_host = $1
|
|
`, host)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
return scanFeeds(rows)
|
|
}
|
|
|
|
// SearchFeeds performs a full-text search on feeds
|
|
func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) {
|
|
tsquery := ToSearchQuery(query)
|
|
rows, err := c.db.Query(`
|
|
SELECT url, type, category, title, description, language, site_url,
|
|
discovered_at, last_crawled_at, next_crawl_at, last_build_date,
|
|
etag, last_modified,
|
|
status, last_error, last_error_at,
|
|
source_url, source_host, tld,
|
|
item_count, oldest_item_date, newest_item_date,
|
|
no_update,
|
|
publish_status, publish_account
|
|
FROM feeds
|
|
WHERE search_vector @@ to_tsquery('english', $1)
|
|
ORDER BY ts_rank(search_vector, to_tsquery('english', $1)) DESC
|
|
`, tsquery)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
return scanFeeds(rows)
|
|
}
|
|
|
|
// scanFeeds is a helper to scan multiple feed rows
|
|
func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
|
|
var feeds []*Feed
|
|
|
|
for rows.Next() {
|
|
feed := &Feed{}
|
|
var feedType, category, title, description, language, siteURL *string
|
|
var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time
|
|
var etag, lastModified, lastError, sourceURL, sourceHost, tld *string
|
|
var itemCount, noUpdate *int
|
|
var status *string
|
|
var publishStatus, publishAccount *string
|
|
|
|
if err := rows.Scan(
|
|
&feed.URL, &feedType, &category, &title, &description, &language, &siteURL,
|
|
&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
|
|
&etag, &lastModified,
|
|
&status, &lastError, &lastErrorAt,
|
|
&sourceURL, &sourceHost, &tld,
|
|
&itemCount, &oldestItemDate, &newestItemDate,
|
|
&noUpdate,
|
|
&publishStatus, &publishAccount,
|
|
); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Handle nullable fields
|
|
feed.Type = StringValue(feedType)
|
|
if category != nil && *category != "" {
|
|
feed.Category = *category
|
|
} else {
|
|
feed.Category = "main"
|
|
}
|
|
feed.Title = StringValue(title)
|
|
feed.Description = StringValue(description)
|
|
feed.Language = StringValue(language)
|
|
feed.SiteURL = StringValue(siteURL)
|
|
feed.LastCrawledAt = TimeValue(lastCrawledAt)
|
|
feed.NextCrawlAt = TimeValue(nextCrawlAt)
|
|
feed.LastBuildDate = TimeValue(lastBuildDate)
|
|
feed.ETag = StringValue(etag)
|
|
feed.LastModified = StringValue(lastModified)
|
|
feed.Status = StringValue(status)
|
|
feed.LastError = StringValue(lastError)
|
|
feed.LastErrorAt = TimeValue(lastErrorAt)
|
|
feed.SourceURL = StringValue(sourceURL)
|
|
feed.SourceHost = StringValue(sourceHost)
|
|
feed.TLD = StringValue(tld)
|
|
if itemCount != nil {
|
|
feed.ItemCount = *itemCount
|
|
}
|
|
feed.OldestItemDate = TimeValue(oldestItemDate)
|
|
feed.NewestItemDate = TimeValue(newestItemDate)
|
|
if noUpdate != nil {
|
|
feed.NoUpdate = *noUpdate
|
|
}
|
|
if publishStatus != nil {
|
|
feed.PublishStatus = *publishStatus
|
|
} else {
|
|
feed.PublishStatus = "hold"
|
|
}
|
|
feed.PublishAccount = StringValue(publishAccount)
|
|
|
|
feeds = append(feeds, feed)
|
|
}
|
|
|
|
return feeds, rows.Err()
|
|
}
|
|
|
|
// SetPublishStatus sets the publish status for a feed ('hold', 'pass', 'skip')
|
|
// If status is 'pass', the account handle is also set (auto-derived if empty)
|
|
func (c *Crawler) SetPublishStatus(feedURL, status, account string) error {
|
|
feedURL = normalizeURL(feedURL)
|
|
|
|
// Auto-derive account if passing and not provided
|
|
if status == "pass" && account == "" {
|
|
account = DeriveHandleFromFeed(feedURL)
|
|
}
|
|
|
|
_, err := c.db.Exec(`
|
|
UPDATE feeds SET publish_status = $1, publish_account = $2 WHERE url = $3
|
|
`, status, NullableString(account), feedURL)
|
|
return err
|
|
}
|
|
|
|
// GetFeedsByPublishStatus returns all feeds with a specific publish status
|
|
func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) {
|
|
rows, err := c.db.Query(`
|
|
SELECT url, type, category, title, description, language, site_url,
|
|
discovered_at, last_crawled_at, next_crawl_at, last_build_date,
|
|
etag, last_modified,
|
|
status, last_error, last_error_at,
|
|
source_url, source_host, tld,
|
|
item_count, oldest_item_date, newest_item_date,
|
|
no_update,
|
|
publish_status, publish_account
|
|
FROM feeds
|
|
WHERE publish_status = $1
|
|
`, status)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
return scanFeeds(rows)
|
|
}
|
|
|
|
// GetPublishCandidates returns feeds that are hold for review and have items
|
|
func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
|
|
rows, err := c.db.Query(`
|
|
SELECT url, type, category, title, description, language, site_url,
|
|
discovered_at, last_crawled_at, next_crawl_at, last_build_date,
|
|
etag, last_modified,
|
|
status, last_error, last_error_at,
|
|
source_url, source_host, tld,
|
|
item_count, oldest_item_date, newest_item_date,
|
|
no_update,
|
|
publish_status, publish_account
|
|
FROM feeds
|
|
WHERE publish_status = 'hold' AND item_count > 0 AND status = 'pass'
|
|
ORDER BY item_count DESC
|
|
LIMIT $1
|
|
`, limit)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
return scanFeeds(rows)
|
|
}
|