package main import ( "net/url" "regexp" "strings" "time" "github.com/jackc/pgx/v5" ) // classifyFeed determines the category of a feed based on URL patterns // Returns: "main", "comments", "category", "author", "article", "podcast" // Note: podcast detection is also done in parseRSSMetadata based on content func classifyFeed(feedURL string) string { lower := strings.ToLower(feedURL) // Comment feeds if strings.Contains(lower, "/comment") { return "comments" } // Podcast URL patterns podcastPatterns := []string{"/podcast", "/podcasts", "/episode", "/episodes", "/show/", "/shows/"} for _, pattern := range podcastPatterns { if strings.Contains(lower, pattern) { return "podcast" } } u, err := url.Parse(feedURL) if err != nil { return "main" } path := strings.ToLower(strings.TrimSuffix(u.Path, "/")) // Author feeds if strings.Contains(path, "/author/") { return "author" } // Category/tag feeds categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/"} for _, pattern := range categoryPatterns { if strings.Contains(path, pattern) { return "category" } } // Check for article feeds (path ending in /feed with content before it) if strings.HasSuffix(path, "/feed") { basePath := strings.TrimSuffix(path, "/feed") basePath = strings.Trim(basePath, "/") if basePath == "" { return "main" // Just /feed - main feed } // Article if path contains date patterns if matched, _ := regexp.MatchString(`/\d{4}/\d{2}`, basePath); matched { return "article" } // Article if path has multiple segments (nested content) segments := strings.Split(basePath, "/") if len(segments) >= 2 { return "article" } // Article if single segment looks like an article slug if len(segments) == 1 && strings.Contains(segments[0], "-") && len(segments[0]) > 20 { return "article" } } return "main" } // classifyFeedByTitle refines category based on feed title (called after parsing) func classifyFeedByTitle(title string, currentCategory string) string { if currentCategory != "main" { return currentCategory // Already classified by URL } lower := strings.ToLower(title) if strings.HasPrefix(lower, "comments on:") || strings.HasPrefix(lower, "comments for:") { return "comments" } return currentCategory } // Feed represents a discovered RSS/Atom feed with metadata type Feed struct { URL string `json:"url"` Type string `json:"type"` // "rss", "atom", or "unknown" Category string `json:"category"` // "main", "comments", "category", "author", "article", "podcast" Title string `json:"title,omitempty"` Description string `json:"description,omitempty"` Language string `json:"language,omitempty"` SiteURL string `json:"site_url,omitempty"` // The website the feed belongs to // Timing DiscoveredAt time.Time `json:"discovered_at"` LastCheckedAt time.Time `json:"last_checked_at,omitempty"` // feed_check: when last checked NextCheckAt time.Time `json:"next_check_at,omitempty"` // feed_check: when to next check LastBuildDate time.Time `json:"last_build_date,omitempty"` // From feed's lastBuildDate/updated // Cache headers for conditional requests ETag string `json:"etag,omitempty"` LastModified string `json:"last_modified,omitempty"` // Health tracking Status string `json:"status"` // "pass", "hold", "skip" LastError string `json:"last_error,omitempty"` LastErrorAt time.Time `json:"last_error_at,omitempty"` // Discovery source SourceURL string `json:"source_url,omitempty"` DomainHost string `json:"domain_host,omitempty"` DomainTLD string `json:"domain_tld,omitempty"` // Content stats ItemCount int `json:"item_count,omitempty"` // Number of items in last feed_check OldestItemDate time.Time `json:"oldest_item_date,omitempty"` NewestItemDate time.Time `json:"newest_item_date,omitempty"` // Adaptive check interval NoUpdate int `json:"no_update"` // Consecutive checks with no change // Publishing to PDS PublishStatus string `json:"publish_status"` // "hold", "pass", "skip" PublishAccount string `json:"publish_account,omitempty"` // e.g., "news.ycombinator.com.1440.news" } // saveFeed stores a feed in PostgreSQL func (c *Crawler) saveFeed(feed *Feed) error { // Default publishStatus to "hold" if not set // Auto-skip feeds with no language or non-English language // Auto-pass feeds from our own domain publishStatus := feed.PublishStatus if publishStatus == "" { if strings.HasSuffix(feed.DomainHost, "1440.news") || feed.DomainHost == "1440.news" { publishStatus = "pass" } else if feed.Language == "" || (feed.Language != "en" && !strings.HasPrefix(feed.Language, "en-")) { publishStatus = "skip" } else if feed.Type != "rss" && feed.Type != "atom" && feed.Type != "json" { publishStatus = "skip" } else { publishStatus = "hold" } } _, err := c.db.Exec(` INSERT INTO feeds ( url, type, category, title, description, language, site_url, discovered_at, last_checked_at, next_check_at, last_build_date, etag, last_modified, status, last_error, last_error_at, source_url, domain_host, domain_tld, item_count, oldest_item_date, newest_item_date, no_update, publish_status, publish_account ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25) ON CONFLICT(url) DO UPDATE SET type = EXCLUDED.type, category = EXCLUDED.category, title = EXCLUDED.title, description = EXCLUDED.description, language = EXCLUDED.language, site_url = EXCLUDED.site_url, last_checked_at = EXCLUDED.last_checked_at, next_check_at = EXCLUDED.next_check_at, last_build_date = EXCLUDED.last_build_date, etag = EXCLUDED.etag, last_modified = EXCLUDED.last_modified, status = EXCLUDED.status, last_error = EXCLUDED.last_error, last_error_at = EXCLUDED.last_error_at, item_count = EXCLUDED.item_count, oldest_item_date = EXCLUDED.oldest_item_date, newest_item_date = EXCLUDED.newest_item_date, no_update = EXCLUDED.no_update, publish_status = EXCLUDED.publish_status, publish_account = EXCLUDED.publish_account `, feed.URL, feed.Type, feed.Category, NullableString(feed.Title), NullableString(feed.Description), NullableString(feed.Language), NullableString(feed.SiteURL), feed.DiscoveredAt, NullableTime(feed.LastCheckedAt), NullableTime(feed.NextCheckAt), NullableTime(feed.LastBuildDate), NullableString(feed.ETag), NullableString(feed.LastModified), feed.Status, NullableString(feed.LastError), NullableTime(feed.LastErrorAt), NullableString(feed.SourceURL), NullableString(feed.DomainHost), NullableString(feed.DomainTLD), feed.ItemCount, NullableTime(feed.OldestItemDate), NullableTime(feed.NewestItemDate), feed.NoUpdate, publishStatus, NullableString(feed.PublishAccount), ) return err } // getFeed retrieves a feed from PostgreSQL func (c *Crawler) getFeed(feedURL string) (*Feed, error) { feed := &Feed{} var category, title, description, language, siteURL *string var lastCheckedAt, nextCheckAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time var etag, lastModified, lastError, sourceURL, domainHost, domainTLD *string var publishStatus, publishAccount *string var itemCount, noUpdate *int err := c.db.QueryRow(` SELECT url, type, category, title, description, language, site_url, discovered_at, last_checked_at, next_check_at, last_build_date, etag, last_modified, status, last_error, last_error_at, source_url, domain_host, domain_tld, item_count, oldest_item_date, newest_item_date, no_update, publish_status, publish_account FROM feeds WHERE url = $1 `, normalizeURL(feedURL)).Scan( &feed.URL, &feed.Type, &category, &title, &description, &language, &siteURL, &feed.DiscoveredAt, &lastCheckedAt, &nextCheckAt, &lastBuildDate, &etag, &lastModified, &feed.Status, &lastError, &lastErrorAt, &sourceURL, &domainHost, &domainTLD, &itemCount, &oldestItemDate, &newestItemDate, &noUpdate, &publishStatus, &publishAccount, ) if err == pgx.ErrNoRows { return nil, nil } if err != nil { return nil, err } // Handle nullable fields if category != nil { feed.Category = *category } else { feed.Category = "main" } feed.Title = StringValue(title) feed.Description = StringValue(description) feed.Language = StringValue(language) feed.SiteURL = StringValue(siteURL) feed.LastCheckedAt = TimeValue(lastCheckedAt) feed.NextCheckAt = TimeValue(nextCheckAt) feed.LastBuildDate = TimeValue(lastBuildDate) feed.ETag = StringValue(etag) feed.LastModified = StringValue(lastModified) feed.LastError = StringValue(lastError) feed.LastErrorAt = TimeValue(lastErrorAt) feed.SourceURL = StringValue(sourceURL) feed.DomainHost = StringValue(domainHost) feed.DomainTLD = StringValue(domainTLD) if itemCount != nil { feed.ItemCount = *itemCount } feed.OldestItemDate = TimeValue(oldestItemDate) feed.NewestItemDate = TimeValue(newestItemDate) if noUpdate != nil { feed.NoUpdate = *noUpdate } if publishStatus != nil { feed.PublishStatus = *publishStatus } else { feed.PublishStatus = "hold" } feed.PublishAccount = StringValue(publishAccount) return feed, nil } // feedExists checks if a feed URL already exists in the database func (c *Crawler) feedExists(feedURL string) bool { var exists bool err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM feeds WHERE url = $1)", normalizeURL(feedURL)).Scan(&exists) return err == nil && exists } // GetAllFeeds returns all feeds from the database func (c *Crawler) GetAllFeeds() ([]*Feed, error) { rows, err := c.db.Query(` SELECT url, type, category, title, description, language, site_url, discovered_at, last_checked_at, next_check_at, last_build_date, etag, last_modified, status, last_error, last_error_at, source_url, domain_host, domain_tld, item_count, oldest_item_date, newest_item_date, no_update, publish_status, publish_account FROM feeds `) if err != nil { return nil, err } defer rows.Close() return scanFeeds(rows) } // GetFeedCount returns the total number of feeds in the database func (c *Crawler) GetFeedCount() (int, error) { var count int err := c.db.QueryRow("SELECT COUNT(*) FROM feeds").Scan(&count) return count, err } // GetFeedCountByHost returns the number of feeds for a specific host func (c *Crawler) GetFeedCountByHost(host string) (int, error) { var count int err := c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE domain_host = $1", host).Scan(&count) return count, err } // GetFeedsDueForCheck returns feeds for feed_check, ordered by last_checked_at ASC (oldest first) func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) { rows, err := c.db.Query(` SELECT url, type, category, title, description, language, site_url, discovered_at, last_checked_at, next_check_at, last_build_date, etag, last_modified, status, last_error, last_error_at, source_url, domain_host, domain_tld, item_count, oldest_item_date, newest_item_date, no_update, publish_status, publish_account FROM feeds WHERE last_checked_at > '0001-01-01 00:00:00' AND status = 'pass' ORDER BY last_checked_at ASC LIMIT $1 `, limit) if err != nil { return nil, err } defer rows.Close() return scanFeeds(rows) } // GetFeedsByHost returns all feeds from a specific host func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) { rows, err := c.db.Query(` SELECT url, type, category, title, description, language, site_url, discovered_at, last_checked_at, next_check_at, last_build_date, etag, last_modified, status, last_error, last_error_at, source_url, domain_host, domain_tld, item_count, oldest_item_date, newest_item_date, no_update, publish_status, publish_account FROM feeds WHERE domain_host = $1 `, host) if err != nil { return nil, err } defer rows.Close() return scanFeeds(rows) } // SearchFeeds performs a full-text search on feeds func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) { tsquery := ToSearchQuery(query) rows, err := c.db.Query(` SELECT url, type, category, title, description, language, site_url, discovered_at, last_checked_at, next_check_at, last_build_date, etag, last_modified, status, last_error, last_error_at, source_url, domain_host, domain_tld, item_count, oldest_item_date, newest_item_date, no_update, publish_status, publish_account FROM feeds WHERE search_vector @@ to_tsquery('english', $1) ORDER BY ts_rank(search_vector, to_tsquery('english', $1)) DESC `, tsquery) if err != nil { return nil, err } defer rows.Close() return scanFeeds(rows) } // scanFeeds is a helper to scan multiple feed rows func scanFeeds(rows pgx.Rows) ([]*Feed, error) { var feeds []*Feed for rows.Next() { feed := &Feed{} var feedType, category, title, description, language, siteURL *string var lastCheckedAt, nextCheckAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time var etag, lastModified, lastError, sourceURL, domainHost, domainTLD *string var itemCount, noUpdate *int var status *string var publishStatus, publishAccount *string if err := rows.Scan( &feed.URL, &feedType, &category, &title, &description, &language, &siteURL, &feed.DiscoveredAt, &lastCheckedAt, &nextCheckAt, &lastBuildDate, &etag, &lastModified, &status, &lastError, &lastErrorAt, &sourceURL, &domainHost, &domainTLD, &itemCount, &oldestItemDate, &newestItemDate, &noUpdate, &publishStatus, &publishAccount, ); err != nil { return nil, err } // Handle nullable fields feed.Type = StringValue(feedType) if category != nil && *category != "" { feed.Category = *category } else { feed.Category = "main" } feed.Title = StringValue(title) feed.Description = StringValue(description) feed.Language = StringValue(language) feed.SiteURL = StringValue(siteURL) feed.LastCheckedAt = TimeValue(lastCheckedAt) feed.NextCheckAt = TimeValue(nextCheckAt) feed.LastBuildDate = TimeValue(lastBuildDate) feed.ETag = StringValue(etag) feed.LastModified = StringValue(lastModified) feed.Status = StringValue(status) feed.LastError = StringValue(lastError) feed.LastErrorAt = TimeValue(lastErrorAt) feed.SourceURL = StringValue(sourceURL) feed.DomainHost = StringValue(domainHost) feed.DomainTLD = StringValue(domainTLD) if itemCount != nil { feed.ItemCount = *itemCount } feed.OldestItemDate = TimeValue(oldestItemDate) feed.NewestItemDate = TimeValue(newestItemDate) if noUpdate != nil { feed.NoUpdate = *noUpdate } if publishStatus != nil { feed.PublishStatus = *publishStatus } else { feed.PublishStatus = "hold" } feed.PublishAccount = StringValue(publishAccount) feeds = append(feeds, feed) } return feeds, rows.Err() } // SetPublishStatus sets the publish status for a feed ('hold', 'pass', 'skip') func (c *Crawler) SetPublishStatus(feedURL, status, account string) error { feedURL = normalizeURL(feedURL) _, err := c.db.Exec(` UPDATE feeds SET publish_status = $1, publish_account = $2 WHERE url = $3 `, status, NullableString(account), feedURL) return err } // GetFeedsByPublishStatus returns all feeds with a specific publish status func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) { rows, err := c.db.Query(` SELECT url, type, category, title, description, language, site_url, discovered_at, last_checked_at, next_check_at, last_build_date, etag, last_modified, status, last_error, last_error_at, source_url, domain_host, domain_tld, item_count, oldest_item_date, newest_item_date, no_update, publish_status, publish_account FROM feeds WHERE publish_status = $1 `, status) if err != nil { return nil, err } defer rows.Close() return scanFeeds(rows) } // GetPublishCandidates returns feeds that are hold for review and have items func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) { rows, err := c.db.Query(` SELECT url, type, category, title, description, language, site_url, discovered_at, last_checked_at, next_check_at, last_build_date, etag, last_modified, status, last_error, last_error_at, source_url, domain_host, domain_tld, item_count, oldest_item_date, newest_item_date, no_update, publish_status, publish_account FROM feeds WHERE publish_status = 'hold' AND item_count > 0 AND status = 'pass' ORDER BY item_count DESC LIMIT $1 `, limit) if err != nil { return nil, err } defer rows.Close() return scanFeeds(rows) }