crawler/feed.go
primal 6eaa39f9db Remove publishing code - now handled by publish service
Publishing functionality has been moved to the standalone publish service.
Removed:
- publisher.go, pds_auth.go, pds_records.go, image.go, handle.go
- StartPublishLoop and related functions from crawler.go
- Publish loop invocation from main.go

Updated CLAUDE.md to reflect the new architecture.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-02 15:40:49 -05:00

package main

import (
	"net/url"
	"regexp"
	"strings"
	"time"

	"github.com/jackc/pgx/v5"
)
// datePathRe matches /YYYY/MM date segments commonly found in article permalinks.
var datePathRe = regexp.MustCompile(`/\d{4}/\d{2}`)

// classifyFeed determines the category of a feed based on URL patterns.
// Returns one of: "main", "comments", "category", "author", "article", "podcast".
// Note: podcast detection is also done in parseRSSMetadata based on content.
func classifyFeed(feedURL string) string {
	lower := strings.ToLower(feedURL)

	// Comment feeds
	if strings.Contains(lower, "/comment") {
		return "comments"
	}

	// Podcast URL patterns
	podcastPatterns := []string{"/podcast", "/podcasts", "/episode", "/episodes", "/show/", "/shows/"}
	for _, pattern := range podcastPatterns {
		if strings.Contains(lower, pattern) {
			return "podcast"
		}
	}

	u, err := url.Parse(feedURL)
	if err != nil {
		return "main"
	}
	path := strings.ToLower(strings.TrimSuffix(u.Path, "/"))

	// Author feeds
	if strings.Contains(path, "/author/") {
		return "author"
	}

	// Category/tag feeds
	categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/"}
	for _, pattern := range categoryPatterns {
		if strings.Contains(path, pattern) {
			return "category"
		}
	}

	// Check for article feeds (path ending in /feed with content before it)
	if strings.HasSuffix(path, "/feed") {
		basePath := strings.TrimSuffix(path, "/feed")
		basePath = strings.Trim(basePath, "/")
		if basePath == "" {
			return "main" // Just /feed - main feed
		}
		// Article if path contains date patterns (e.g. /2024/05/...)
		if datePathRe.MatchString(basePath) {
			return "article"
		}
		// Article if path has multiple segments (nested content)
		segments := strings.Split(basePath, "/")
		if len(segments) >= 2 {
			return "article"
		}
		// Article if single segment looks like an article slug
		if len(segments) == 1 && strings.Contains(segments[0], "-") && len(segments[0]) > 20 {
			return "article"
		}
	}
	return "main"
}
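
// Illustrative classifications following the rules above (example.com is a
// placeholder, not a crawled feed):
//
//	classifyFeed("https://example.com/feed")                    // "main"
//	classifyFeed("https://example.com/comments/feed")           // "comments"
//	classifyFeed("https://example.com/category/tech/feed")      // "category"
//	classifyFeed("https://example.com/author/jane/feed")        // "author"
//	classifyFeed("https://example.com/2024/05/some-post/feed")  // "article"
//	classifyFeed("https://example.com/podcasts/episode-1/feed") // "podcast"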

// classifyFeedByTitle refines the category based on the feed title (called after parsing).
func classifyFeedByTitle(title string, currentCategory string) string {
	if currentCategory != "main" {
		return currentCategory // Already classified by URL
	}
	lower := strings.ToLower(title)
	if strings.HasPrefix(lower, "comments on:") || strings.HasPrefix(lower, "comments for:") {
		return "comments"
	}
	return currentCategory
}
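
// WordPress comment feeds, for example, often carry a telltale title even
// when their URL gives nothing away:
//
//	classifyFeedByTitle("Comments on: Hello World", "main") // "comments"
//	classifyFeedByTitle("Hello World", "main")              // "main"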

// Feed represents a discovered RSS/Atom feed with metadata.
type Feed struct {
	URL         string `json:"url"`
	Type        string `json:"type"`     // "rss", "atom", "json", or "unknown"
	Category    string `json:"category"` // "main", "comments", "category", "author", "article", "podcast"
	Title       string `json:"title,omitempty"`
	Description string `json:"description,omitempty"`
	Language    string `json:"language,omitempty"`
	SiteURL     string `json:"site_url,omitempty"` // The website the feed belongs to

	// Timing
	DiscoveredAt  time.Time `json:"discovered_at"`
	LastCheckedAt time.Time `json:"last_checked_at,omitempty"` // feed_check: when last checked
	NextCheckAt   time.Time `json:"next_check_at,omitempty"`   // feed_check: when to next check
	LastBuildDate time.Time `json:"last_build_date,omitempty"` // From the feed's lastBuildDate/updated

	// Cache headers for conditional requests
	ETag         string `json:"etag,omitempty"`
	LastModified string `json:"last_modified,omitempty"`

	// Health tracking
	Status      string    `json:"status"` // "pass", "hold", "skip"
	LastError   string    `json:"last_error,omitempty"`
	LastErrorAt time.Time `json:"last_error_at,omitempty"`

	// Discovery source
	SourceURL  string `json:"source_url,omitempty"`
	DomainHost string `json:"domain_host,omitempty"`
	DomainTLD  string `json:"domain_tld,omitempty"`

	// Content stats
	ItemCount      int       `json:"item_count,omitempty"` // Number of items in the last feed_check
	OldestItemDate time.Time `json:"oldest_item_date,omitempty"`
	NewestItemDate time.Time `json:"newest_item_date,omitempty"`

	// Adaptive check interval
	NoUpdate int `json:"no_update"` // Consecutive checks with no change

	// Publishing to PDS
	PublishStatus  string `json:"publish_status"`            // "hold", "pass", "skip"
	PublishAccount string `json:"publish_account,omitempty"` // e.g. "news.ycombinator.com.1440.news"
}

// saveFeed stores a feed in PostgreSQL.
func (c *Crawler) saveFeed(feed *Feed) error {
	// Default PublishStatus when not set:
	//   - auto-pass feeds from our own domain
	//   - auto-skip feeds with no language or a non-English language
	//   - auto-skip feeds that are not a recognized feed type
	//   - otherwise hold for review
	publishStatus := feed.PublishStatus
	if publishStatus == "" {
		if feed.DomainHost == "1440.news" || strings.HasSuffix(feed.DomainHost, ".1440.news") {
			publishStatus = "pass"
		} else if feed.Language == "" || (feed.Language != "en" && !strings.HasPrefix(feed.Language, "en-")) {
			publishStatus = "skip"
		} else if feed.Type != "rss" && feed.Type != "atom" && feed.Type != "json" {
			publishStatus = "skip"
		} else {
			publishStatus = "hold"
		}
	}
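
	// For example, an English RSS feed with no explicit status defaults to
	// "hold", a French one to "skip", and any host under 1440.news to "pass".
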
	_, err := c.db.Exec(`
		INSERT INTO feeds (
			url, type, category, title, description, language, site_url,
			discovered_at, last_checked_at, next_check_at, last_build_date,
			etag, last_modified,
			status, last_error, last_error_at,
			source_url, domain_host, domain_tld,
			item_count, oldest_item_date, newest_item_date,
			no_update,
			publish_status, publish_account
		) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25)
		ON CONFLICT(url) DO UPDATE SET
			type = EXCLUDED.type,
			category = EXCLUDED.category,
			title = EXCLUDED.title,
			description = EXCLUDED.description,
			language = EXCLUDED.language,
			site_url = EXCLUDED.site_url,
			last_checked_at = EXCLUDED.last_checked_at,
			next_check_at = EXCLUDED.next_check_at,
			last_build_date = EXCLUDED.last_build_date,
			etag = EXCLUDED.etag,
			last_modified = EXCLUDED.last_modified,
			status = EXCLUDED.status,
			last_error = EXCLUDED.last_error,
			last_error_at = EXCLUDED.last_error_at,
			item_count = EXCLUDED.item_count,
			oldest_item_date = EXCLUDED.oldest_item_date,
			newest_item_date = EXCLUDED.newest_item_date,
			no_update = EXCLUDED.no_update,
			publish_status = EXCLUDED.publish_status,
			publish_account = EXCLUDED.publish_account
	`,
		feed.URL, feed.Type, feed.Category, NullableString(feed.Title), NullableString(feed.Description),
		NullableString(feed.Language), NullableString(feed.SiteURL),
		feed.DiscoveredAt, NullableTime(feed.LastCheckedAt), NullableTime(feed.NextCheckAt), NullableTime(feed.LastBuildDate),
		NullableString(feed.ETag), NullableString(feed.LastModified),
		feed.Status, NullableString(feed.LastError), NullableTime(feed.LastErrorAt),
		NullableString(feed.SourceURL), NullableString(feed.DomainHost), NullableString(feed.DomainTLD),
		feed.ItemCount, NullableTime(feed.OldestItemDate), NullableTime(feed.NewestItemDate),
		feed.NoUpdate,
		publishStatus, NullableString(feed.PublishAccount),
	)
	return err
}
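
// A minimal sketch of a saveFeed call (illustrative; example.com is a
// placeholder, not a crawled feed). URL, Type, Category, Status, and
// DiscoveredAt are written as-is; string and time fields left at their zero
// values are stored as NULL via the Nullable helpers:
//
//	_ = c.saveFeed(&Feed{
//		URL:          "https://example.com/feed",
//		Type:         "rss",
//		Category:     "main",
//		Status:       "pass",
//		DiscoveredAt: time.Now(),
//	})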

// getFeed retrieves a feed from PostgreSQL.
func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
	feed := &Feed{}
	var category, title, description, language, siteURL *string
	var lastCheckedAt, nextCheckAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time
	var etag, lastModified, lastError, sourceURL, domainHost, domainTLD *string
	var publishStatus, publishAccount *string
	var itemCount, noUpdate *int

	err := c.db.QueryRow(`
		SELECT url, type, category, title, description, language, site_url,
			discovered_at, last_checked_at, next_check_at, last_build_date,
			etag, last_modified,
			status, last_error, last_error_at,
			source_url, domain_host, domain_tld,
			item_count, oldest_item_date, newest_item_date,
			no_update,
			publish_status, publish_account
		FROM feeds WHERE url = $1
	`, normalizeURL(feedURL)).Scan(
		&feed.URL, &feed.Type, &category, &title, &description, &language, &siteURL,
		&feed.DiscoveredAt, &lastCheckedAt, &nextCheckAt, &lastBuildDate,
		&etag, &lastModified,
		&feed.Status, &lastError, &lastErrorAt,
		&sourceURL, &domainHost, &domainTLD,
		&itemCount, &oldestItemDate, &newestItemDate,
		&noUpdate,
		&publishStatus, &publishAccount,
	)
	if err == pgx.ErrNoRows {
		return nil, nil
	}
	if err != nil {
		return nil, err
	}

	// Handle nullable fields
	if category != nil {
		feed.Category = *category
	} else {
		feed.Category = "main"
	}
	feed.Title = StringValue(title)
	feed.Description = StringValue(description)
	feed.Language = StringValue(language)
	feed.SiteURL = StringValue(siteURL)
	feed.LastCheckedAt = TimeValue(lastCheckedAt)
	feed.NextCheckAt = TimeValue(nextCheckAt)
	feed.LastBuildDate = TimeValue(lastBuildDate)
	feed.ETag = StringValue(etag)
	feed.LastModified = StringValue(lastModified)
	feed.LastError = StringValue(lastError)
	feed.LastErrorAt = TimeValue(lastErrorAt)
	feed.SourceURL = StringValue(sourceURL)
	feed.DomainHost = StringValue(domainHost)
	feed.DomainTLD = StringValue(domainTLD)
	if itemCount != nil {
		feed.ItemCount = *itemCount
	}
	feed.OldestItemDate = TimeValue(oldestItemDate)
	feed.NewestItemDate = TimeValue(newestItemDate)
	if noUpdate != nil {
		feed.NoUpdate = *noUpdate
	}
	if publishStatus != nil {
		feed.PublishStatus = *publishStatus
	} else {
		feed.PublishStatus = "hold"
	}
	feed.PublishAccount = StringValue(publishAccount)
	return feed, nil
}
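
// Note the not-found convention: getFeed returns (nil, nil) rather than an
// error, so callers must check the pointer as well as the error:
//
//	feed, err := c.getFeed(rawURL)
//	if err != nil {
//		return err
//	}
//	if feed == nil {
//		// not in the database yet
//	}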

// feedExists checks if a feed URL already exists in the database.
func (c *Crawler) feedExists(feedURL string) bool {
	var exists bool
	err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM feeds WHERE url = $1)", normalizeURL(feedURL)).Scan(&exists)
	return err == nil && exists
}

// GetAllFeeds returns all feeds from the database.
func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
	rows, err := c.db.Query(`
		SELECT url, type, category, title, description, language, site_url,
			discovered_at, last_checked_at, next_check_at, last_build_date,
			etag, last_modified,
			status, last_error, last_error_at,
			source_url, domain_host, domain_tld,
			item_count, oldest_item_date, newest_item_date,
			no_update,
			publish_status, publish_account
		FROM feeds
	`)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	return scanFeeds(rows)
}

// GetFeedCount returns the total number of feeds in the database.
func (c *Crawler) GetFeedCount() (int, error) {
	var count int
	err := c.db.QueryRow("SELECT COUNT(*) FROM feeds").Scan(&count)
	return count, err
}

// GetFeedCountByHost returns the number of feeds for a specific host.
func (c *Crawler) GetFeedCountByHost(host string) (int, error) {
	var count int
	err := c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE domain_host = $1", host).Scan(&count)
	return count, err
}

// GetFeedsDueForCheck returns passing feeds that have been checked at least
// once, ordered by last_checked_at ASC (oldest first), for the feed_check loop.
func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
	rows, err := c.db.Query(`
		SELECT url, type, category, title, description, language, site_url,
			discovered_at, last_checked_at, next_check_at, last_build_date,
			etag, last_modified,
			status, last_error, last_error_at,
			source_url, domain_host, domain_tld,
			item_count, oldest_item_date, newest_item_date,
			no_update,
			publish_status, publish_account
		FROM feeds
		WHERE last_checked_at > '0001-01-01 00:00:00' AND status = 'pass'
		ORDER BY last_checked_at ASC
		LIMIT $1
	`, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	return scanFeeds(rows)
}

// GetFeedsByHost returns all feeds from a specific host.
func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
	rows, err := c.db.Query(`
		SELECT url, type, category, title, description, language, site_url,
			discovered_at, last_checked_at, next_check_at, last_build_date,
			etag, last_modified,
			status, last_error, last_error_at,
			source_url, domain_host, domain_tld,
			item_count, oldest_item_date, newest_item_date,
			no_update,
			publish_status, publish_account
		FROM feeds WHERE domain_host = $1
	`, host)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	return scanFeeds(rows)
}

// SearchFeeds performs a full-text search on feeds.
func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) {
	tsquery := ToSearchQuery(query)
	rows, err := c.db.Query(`
		SELECT url, type, category, title, description, language, site_url,
			discovered_at, last_checked_at, next_check_at, last_build_date,
			etag, last_modified,
			status, last_error, last_error_at,
			source_url, domain_host, domain_tld,
			item_count, oldest_item_date, newest_item_date,
			no_update,
			publish_status, publish_account
		FROM feeds
		WHERE search_vector @@ to_tsquery('english', $1)
		ORDER BY ts_rank(search_vector, to_tsquery('english', $1)) DESC
	`, tsquery)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	return scanFeeds(rows)
}

// scanFeeds is a helper to scan multiple feed rows.
func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
	var feeds []*Feed
	for rows.Next() {
		feed := &Feed{}
		var feedType, category, title, description, language, siteURL *string
		var lastCheckedAt, nextCheckAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time
		var etag, lastModified, lastError, sourceURL, domainHost, domainTLD *string
		var itemCount, noUpdate *int
		var status *string
		var publishStatus, publishAccount *string
		if err := rows.Scan(
			&feed.URL, &feedType, &category, &title, &description, &language, &siteURL,
			&feed.DiscoveredAt, &lastCheckedAt, &nextCheckAt, &lastBuildDate,
			&etag, &lastModified,
			&status, &lastError, &lastErrorAt,
			&sourceURL, &domainHost, &domainTLD,
			&itemCount, &oldestItemDate, &newestItemDate,
			&noUpdate,
			&publishStatus, &publishAccount,
		); err != nil {
			return nil, err
		}

		// Handle nullable fields
		feed.Type = StringValue(feedType)
		if category != nil && *category != "" {
			feed.Category = *category
		} else {
			feed.Category = "main"
		}
		feed.Title = StringValue(title)
		feed.Description = StringValue(description)
		feed.Language = StringValue(language)
		feed.SiteURL = StringValue(siteURL)
		feed.LastCheckedAt = TimeValue(lastCheckedAt)
		feed.NextCheckAt = TimeValue(nextCheckAt)
		feed.LastBuildDate = TimeValue(lastBuildDate)
		feed.ETag = StringValue(etag)
		feed.LastModified = StringValue(lastModified)
		feed.Status = StringValue(status)
		feed.LastError = StringValue(lastError)
		feed.LastErrorAt = TimeValue(lastErrorAt)
		feed.SourceURL = StringValue(sourceURL)
		feed.DomainHost = StringValue(domainHost)
		feed.DomainTLD = StringValue(domainTLD)
		if itemCount != nil {
			feed.ItemCount = *itemCount
		}
		feed.OldestItemDate = TimeValue(oldestItemDate)
		feed.NewestItemDate = TimeValue(newestItemDate)
		if noUpdate != nil {
			feed.NoUpdate = *noUpdate
		}
		if publishStatus != nil {
			feed.PublishStatus = *publishStatus
		} else {
			feed.PublishStatus = "hold"
		}
		feed.PublishAccount = StringValue(publishAccount)
		feeds = append(feeds, feed)
	}
	return feeds, rows.Err()
}

// SetPublishStatus sets the publish status for a feed ('hold', 'pass', 'skip').
func (c *Crawler) SetPublishStatus(feedURL, status, account string) error {
	feedURL = normalizeURL(feedURL)
	_, err := c.db.Exec(`
		UPDATE feeds SET publish_status = $1, publish_account = $2 WHERE url = $3
	`, status, NullableString(account), feedURL)
	return err
}

// GetFeedsByPublishStatus returns all feeds with a specific publish status.
func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) {
	rows, err := c.db.Query(`
		SELECT url, type, category, title, description, language, site_url,
			discovered_at, last_checked_at, next_check_at, last_build_date,
			etag, last_modified,
			status, last_error, last_error_at,
			source_url, domain_host, domain_tld,
			item_count, oldest_item_date, newest_item_date,
			no_update,
			publish_status, publish_account
		FROM feeds
		WHERE publish_status = $1
	`, status)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	return scanFeeds(rows)
}

// GetPublishCandidates returns feeds held for review (publish_status = 'hold')
// that have items and a passing crawl status, busiest feeds first.
func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
	rows, err := c.db.Query(`
		SELECT url, type, category, title, description, language, site_url,
			discovered_at, last_checked_at, next_check_at, last_build_date,
			etag, last_modified,
			status, last_error, last_error_at,
			source_url, domain_host, domain_tld,
			item_count, oldest_item_date, newest_item_date,
			no_update,
			publish_status, publish_account
		FROM feeds
		WHERE publish_status = 'hold' AND item_count > 0 AND status = 'pass'
		ORDER BY item_count DESC
		LIMIT $1
	`, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	return scanFeeds(rows)
}
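
// A review pass over candidates might page through GetPublishCandidates and
// record decisions with SetPublishStatus. Illustrative only; the account name
// follows the PublishAccount convention documented on the Feed struct:
//
//	feeds, err := c.GetPublishCandidates(50)
//	if err != nil {
//		return err
//	}
//	for _, f := range feeds {
//		// ... inspect f.Title, f.Language, f.ItemCount ...
//		if err := c.SetPublishStatus(f.URL, "pass", "example.com.1440.news"); err != nil {
//			return err
//		}
//	}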