Publishing:
- Add publisher.go for posting feed items to AT Protocol PDS
- Support deterministic rkeys from SHA256(guid + discoveredAt)
- Handle multiple URLs in posts with facets for each link
- Image embed support (app.bsky.embed.images) for up to 4 images
- External embed with thumbnail fallback
- Podcast/audio enclosure URLs included in post text

Media extraction:
- Parse RSS enclosures (audio, video, images)
- Extract Media RSS content and thumbnails
- Extract images from HTML content in descriptions
- Store enclosure and imageUrls in items table

SQLite stability improvements:
- Add synchronous=NORMAL and wal_autocheckpoint pragmas
- Connection pool tuning (idle conns, max lifetime)
- Periodic WAL checkpoint every 5 minutes
- Hourly integrity checks with PRAGMA quick_check
- Daily hot backup via VACUUM INTO
- Docker stop_grace_period: 30s for graceful shutdown

Dashboard:
- Feed publishing UI and API endpoints
- Account creation with invite codes

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
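A minimal sketch of the SQLite settings listed above, assuming database/sql with a SQLite driver that accepts PRAGMA statements through Exec; the crawler's actual setup code lives outside feed.go, and the numeric values here are illustrative only:

func tuneSQLite(db *sql.DB) error {
	// Durability/performance pragmas named in the changelog (WAL mode is
	// implied by the WAL checkpoint settings).
	pragmas := []string{
		"PRAGMA journal_mode=WAL;",
		"PRAGMA synchronous=NORMAL;",
		"PRAGMA wal_autocheckpoint=1000;",
	}
	for _, p := range pragmas {
		if _, err := db.Exec(p); err != nil {
			return err
		}
	}
	// Connection pool tuning (idle conns, max lifetime)
	db.SetMaxIdleConns(4)
	db.SetConnMaxLifetime(30 * time.Minute)
	// The changelog also describes a periodic "PRAGMA wal_checkpoint(TRUNCATE)",
	// an hourly "PRAGMA quick_check", and a daily hot backup via "VACUUM INTO".
	return nil
}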
package main

import (
	"database/sql"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"sync/atomic"
	"time"
)

// classifyFeed determines the category of a feed based on URL patterns.
// Returns: "main", "comments", "category", "author", "article", "podcast".
// Note: podcast detection is also done in parseRSSMetadata based on content.
func classifyFeed(feedURL string) string {
	lower := strings.ToLower(feedURL)

	// Comment feeds
	if strings.Contains(lower, "/comment") {
		return "comments"
	}

	// Podcast URL patterns
	podcastPatterns := []string{"/podcast", "/podcasts", "/episode", "/episodes", "/show/", "/shows/"}
	for _, pattern := range podcastPatterns {
		if strings.Contains(lower, pattern) {
			return "podcast"
		}
	}

	u, err := url.Parse(feedURL)
	if err != nil {
		return "main"
	}

	path := strings.ToLower(strings.TrimSuffix(u.Path, "/"))

	// Author feeds
	if strings.Contains(path, "/author/") {
		return "author"
	}

	// Category/tag feeds
	categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/"}
	for _, pattern := range categoryPatterns {
		if strings.Contains(path, pattern) {
			return "category"
		}
	}

	// Check for article feeds (path ending in /feed with content before it)
	if strings.HasSuffix(path, "/feed") {
		basePath := strings.TrimSuffix(path, "/feed")
		basePath = strings.Trim(basePath, "/")

		if basePath == "" {
			return "main" // Just /feed - main feed
		}

		// Article if path contains date patterns
		if matched, _ := regexp.MatchString(`/\d{4}/\d{2}`, basePath); matched {
			return "article"
		}

		// Article if path has multiple segments (nested content)
		segments := strings.Split(basePath, "/")
		if len(segments) >= 2 {
			return "article"
		}

		// Article if single segment looks like an article slug
		if len(segments) == 1 && strings.Contains(segments[0], "-") && len(segments[0]) > 20 {
			return "article"
		}
	}

	return "main"
}

// classifyFeedByTitle refines the category based on the feed title (called after parsing)
func classifyFeedByTitle(title string, currentCategory string) string {
	if currentCategory != "main" {
		return currentCategory // Already classified by URL
	}
	lower := strings.ToLower(title)
	if strings.HasPrefix(lower, "comments on:") || strings.HasPrefix(lower, "comments for:") {
		return "comments"
	}
	return currentCategory
}
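// exampleClassifications is a sketch (not part of the original file) showing
// expected results of the URL rules in classifyFeed for a few hypothetical
// feed URLs; useful as a quick reference for the pattern checks above.
func exampleClassifications() map[string]string {
	urls := []string{
		"https://example.com/feed",                     // "main"
		"https://example.com/comments/feed",            // "comments"
		"https://example.com/category/go/feed",         // "category"
		"https://example.com/author/jane/feed",         // "author"
		"https://example.com/2024/05/hello-world/feed", // "article"
		"https://example.com/podcasts/episode-12",      // "podcast"
	}
	out := make(map[string]string, len(urls))
	for _, u := range urls {
		out[u] = classifyFeed(u)
	}
	return out
}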
// Enclosure represents a media attachment (audio, video, image)
type Enclosure struct {
	URL    string `json:"url"`
	Type   string `json:"type"`   // MIME type (audio/mpeg, image/jpeg, etc.)
	Length int64  `json:"length"` // Size in bytes
}

// Item represents an individual entry/article from a feed
type Item struct {
	ID           int64     `json:"id,omitempty"`
	FeedURL      string    `json:"feed_url"`
	GUID         string    `json:"guid,omitempty"`
	Title        string    `json:"title,omitempty"`
	Link         string    `json:"link,omitempty"`
	Description  string    `json:"description,omitempty"`
	Content      string    `json:"content,omitempty"`
	Author       string    `json:"author,omitempty"`
	PubDate      time.Time `json:"pub_date,omitempty"`
	DiscoveredAt time.Time `json:"discovered_at"`
	UpdatedAt    time.Time `json:"updated_at,omitempty"`

	// Media attachments
	Enclosure *Enclosure `json:"enclosure,omitempty"`  // Primary enclosure (podcast audio, etc.)
	ImageURLs []string   `json:"image_urls,omitempty"` // Image URLs extracted from content

	// Publishing to PDS
	PublishedAt  time.Time `json:"published_at,omitempty"`
	PublishedUri string    `json:"published_uri,omitempty"`
}

// Feed represents a discovered RSS/Atom feed with metadata
type Feed struct {
	URL         string `json:"url"`
	Type        string `json:"type"`     // "rss", "atom", or "unknown"
	Category    string `json:"category"` // "main", "comments", "category", "author", "article", "podcast"
	Title       string `json:"title,omitempty"`
	Description string `json:"description,omitempty"`
	Language    string `json:"language,omitempty"`
	SiteURL     string `json:"site_url,omitempty"` // The website the feed belongs to

	// Timing
	DiscoveredAt  time.Time `json:"discovered_at"`
	LastCrawledAt time.Time `json:"last_crawled_at,omitempty"`
	NextCrawlAt   time.Time `json:"next_crawl_at,omitempty"`
	LastBuildDate time.Time `json:"last_build_date,omitempty"` // From feed's lastBuildDate/updated

	// Cache headers for conditional requests
	ETag         string `json:"etag,omitempty"`
	LastModified string `json:"last_modified,omitempty"`

	// Feed hints for crawl scheduling
	TTLMinutes   int    `json:"ttl_minutes,omitempty"`   // From RSS <ttl> element
	UpdatePeriod string `json:"update_period,omitempty"` // From sy:updatePeriod (hourly, daily, weekly, monthly, yearly)
	UpdateFreq   int    `json:"update_freq,omitempty"`   // From sy:updateFrequency

	// Health tracking
	Status      string    `json:"status"` // "active", "dead", "redirect", "error"
	ErrorCount  int       `json:"error_count"`
	LastError   string    `json:"last_error,omitempty"`
	LastErrorAt time.Time `json:"last_error_at,omitempty"`

	// Discovery source
	SourceURL  string `json:"source_url,omitempty"` // Where we found this feed
	SourceHost string `json:"source_host,omitempty"`
	TLD        string `json:"tld,omitempty"`

	// Content stats
	ItemCount      int       `json:"item_count,omitempty"`        // Number of items in last crawl
	AvgPostFreqHrs float64   `json:"avg_post_freq_hrs,omitempty"` // Average hours between posts
	OldestItemDate time.Time `json:"oldest_item_date,omitempty"`
	NewestItemDate time.Time `json:"newest_item_date,omitempty"`

	// Adaptive check interval
	NoUpdate int `json:"no_update"` // Consecutive checks with no change

	// Publishing to PDS
	PublishStatus  string `json:"publish_status"`            // "held", "pass", "fail"
	PublishAccount string `json:"publish_account,omitempty"` // e.g., "news.ycombinator.com.1440.news"
}
// saveFeed stores a feed in SQLite
func (c *Crawler) saveFeed(feed *Feed) error {
	// Default publishStatus to "held" if not set
	publishStatus := feed.PublishStatus
	if publishStatus == "" {
		publishStatus = "held"
	}

	_, err := c.db.Exec(`
		INSERT INTO feeds (
			url, type, category, title, description, language, siteUrl,
			discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
			etag, lastModified,
			ttlMinutes, updatePeriod, updateFreq,
			status, errorCount, lastError, lastErrorAt,
			sourceUrl, sourceHost, tld,
			itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
			noUpdate,
			publishStatus, publishAccount
		) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
		ON CONFLICT(url) DO UPDATE SET
			type = excluded.type,
			category = excluded.category,
			title = excluded.title,
			description = excluded.description,
			language = excluded.language,
			siteUrl = excluded.siteUrl,
			lastCrawledAt = excluded.lastCrawledAt,
			nextCrawlAt = excluded.nextCrawlAt,
			lastBuildDate = excluded.lastBuildDate,
			etag = excluded.etag,
			lastModified = excluded.lastModified,
			ttlMinutes = excluded.ttlMinutes,
			updatePeriod = excluded.updatePeriod,
			updateFreq = excluded.updateFreq,
			status = excluded.status,
			errorCount = excluded.errorCount,
			lastError = excluded.lastError,
			lastErrorAt = excluded.lastErrorAt,
			itemCount = excluded.itemCount,
			avgPostFreqHrs = excluded.avgPostFreqHrs,
			oldestItemDate = excluded.oldestItemDate,
			newestItemDate = excluded.newestItemDate,
			noUpdate = excluded.noUpdate,
			publishStatus = excluded.publishStatus,
			publishAccount = excluded.publishAccount
	`,
		feed.URL, feed.Type, feed.Category, nullString(feed.Title), nullString(feed.Description),
		nullString(feed.Language), nullString(feed.SiteURL),
		feed.DiscoveredAt, nullTime(feed.LastCrawledAt), nullTime(feed.NextCrawlAt), nullTime(feed.LastBuildDate),
		nullString(feed.ETag), nullString(feed.LastModified),
		feed.TTLMinutes, nullString(feed.UpdatePeriod), feed.UpdateFreq,
		feed.Status, feed.ErrorCount, nullString(feed.LastError), nullTime(feed.LastErrorAt),
		nullString(feed.SourceURL), nullString(feed.SourceHost), nullString(feed.TLD),
		feed.ItemCount, feed.AvgPostFreqHrs, nullTime(feed.OldestItemDate), nullTime(feed.NewestItemDate),
		feed.NoUpdate,
		publishStatus, nullString(feed.PublishAccount),
	)
	return err
}
// getFeed retrieves a feed from SQLite
func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
	feed := &Feed{}
	var category, title, description, language, siteURL sql.NullString
	var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
	var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
	var avgPostFreqHrs sql.NullFloat64
	var publishStatus, publishAccount sql.NullString

	err := c.db.QueryRow(`
		SELECT url, type, category, title, description, language, siteUrl,
			discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
			etag, lastModified,
			ttlMinutes, updatePeriod, updateFreq,
			status, errorCount, lastError, lastErrorAt,
			sourceUrl, sourceHost, tld,
			itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
			noUpdate,
			publishStatus, publishAccount
		FROM feeds WHERE url = ?
	`, normalizeURL(feedURL)).Scan(
		&feed.URL, &feed.Type, &category, &title, &description, &language, &siteURL,
		&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
		&etag, &lastModified,
		&feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
		&feed.Status, &feed.ErrorCount, &lastError, &lastErrorAt,
		&sourceURL, &sourceHost, &tld,
		&feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
		&feed.NoUpdate,
		&publishStatus, &publishAccount,
	)

	if err == sql.ErrNoRows {
		return nil, nil
	}
	if err != nil {
		return nil, err
	}

	// Handle nullable fields
	if category.Valid {
		feed.Category = category.String
	} else {
		feed.Category = "main" // Default
	}
	if title.Valid {
		feed.Title = title.String
	}
	if description.Valid {
		feed.Description = description.String
	}
	if language.Valid {
		feed.Language = language.String
	}
	if siteURL.Valid {
		feed.SiteURL = siteURL.String
	}
	if lastCrawledAt.Valid {
		feed.LastCrawledAt = lastCrawledAt.Time
	}
	if nextCrawlAt.Valid {
		feed.NextCrawlAt = nextCrawlAt.Time
	}
	if lastBuildDate.Valid {
		feed.LastBuildDate = lastBuildDate.Time
	}
	if etag.Valid {
		feed.ETag = etag.String
	}
	if lastModified.Valid {
		feed.LastModified = lastModified.String
	}
	if updatePeriod.Valid {
		feed.UpdatePeriod = updatePeriod.String
	}
	if lastError.Valid {
		feed.LastError = lastError.String
	}
	if lastErrorAt.Valid {
		feed.LastErrorAt = lastErrorAt.Time
	}
	if sourceURL.Valid {
		feed.SourceURL = sourceURL.String
	}
	if sourceHost.Valid {
		feed.SourceHost = sourceHost.String
	}
	if tld.Valid {
		feed.TLD = tld.String
	}
	if avgPostFreqHrs.Valid {
		feed.AvgPostFreqHrs = avgPostFreqHrs.Float64
	}
	if oldestItemDate.Valid {
		feed.OldestItemDate = oldestItemDate.Time
	}
	if newestItemDate.Valid {
		feed.NewestItemDate = newestItemDate.Time
	}
	if publishStatus.Valid {
		feed.PublishStatus = publishStatus.String
	} else {
		feed.PublishStatus = "held"
	}
	if publishAccount.Valid {
		feed.PublishAccount = publishAccount.String
	}

	return feed, nil
}
// feedExists checks if a feed URL already exists in the database
func (c *Crawler) feedExists(feedURL string) bool {
	var exists bool
	err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM feeds WHERE url = ?)", normalizeURL(feedURL)).Scan(&exists)
	return err == nil && exists
}

// GetAllFeeds returns all feeds from the database
func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
	rows, err := c.db.Query(`
		SELECT url, type, category, title, description, language, siteUrl,
			discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
			etag, lastModified,
			ttlMinutes, updatePeriod, updateFreq,
			status, errorCount, lastError, lastErrorAt,
			sourceUrl, sourceHost, tld,
			itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
			noUpdate,
			publishStatus, publishAccount
		FROM feeds
	`)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanFeeds(rows)
}

// GetFeedCount returns the total number of feeds in the database
func (c *Crawler) GetFeedCount() (int, error) {
	var count int
	err := c.db.QueryRow("SELECT COUNT(*) FROM feeds").Scan(&count)
	return count, err
}

// GetFeedCountByHost returns the number of feeds for a specific host
func (c *Crawler) GetFeedCountByHost(host string) (int, error) {
	var count int
	err := c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE sourceHost = ?", host).Scan(&count)
	return count, err
}
// GetFeedsDueForCheck returns feeds where nextCrawlAt <= now, ordered randomly, limited to n
func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
	rows, err := c.db.Query(`
		SELECT url, type, category, title, description, language, siteUrl,
			discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
			etag, lastModified,
			ttlMinutes, updatePeriod, updateFreq,
			status, errorCount, lastError, lastErrorAt,
			sourceUrl, sourceHost, tld,
			itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
			noUpdate,
			publishStatus, publishAccount
		FROM feeds
		WHERE nextCrawlAt <= datetime('now') AND status != 'dead'
		ORDER BY RANDOM()
		LIMIT ?
	`, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanFeeds(rows)
}
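// checkDueFeedsOnce is a sketch (not part of the original file) of how a
// scheduler might combine GetFeedsDueForCheck with CheckFeed in a single
// polling pass; the batch size of 50 is an arbitrary illustrative value.
func (c *Crawler) checkDueFeedsOnce() {
	feeds, err := c.GetFeedsDueForCheck(50)
	if err != nil {
		return
	}
	for _, feed := range feeds {
		// CheckFeed issues a conditional GET, updates the feed row, and
		// stores any newly parsed items, so the loop only needs to walk
		// the batch and move on when a single feed fails.
		if _, err := c.CheckFeed(feed); err != nil {
			continue
		}
	}
}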
// GetFeedsByHost returns all feeds from a specific host
func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
	rows, err := c.db.Query(`
		SELECT url, type, category, title, description, language, siteUrl,
			discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
			etag, lastModified,
			ttlMinutes, updatePeriod, updateFreq,
			status, errorCount, lastError, lastErrorAt,
			sourceUrl, sourceHost, tld,
			itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
			noUpdate,
			publishStatus, publishAccount
		FROM feeds WHERE sourceHost = ?
	`, host)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanFeeds(rows)
}
// SearchFeeds performs a full-text search on feeds
func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) {
	rows, err := c.db.Query(`
		SELECT f.url, f.type, f.category, f.title, f.description, f.language, f.siteUrl,
			f.discoveredAt, f.lastCrawledAt, f.nextCrawlAt, f.lastBuildDate,
			f.etag, f.lastModified,
			f.ttlMinutes, f.updatePeriod, f.updateFreq,
			f.status, f.errorCount, f.lastError, f.lastErrorAt,
			f.sourceUrl, f.sourceHost, f.tld,
			f.itemCount, f.avgPostFreqHrs, f.oldestItemDate, f.newestItemDate,
			f.noUpdate,
			f.publishStatus, f.publishAccount
		FROM feeds f
		JOIN feeds_fts fts ON f.rowid = fts.rowid
		WHERE feeds_fts MATCH ?
	`, query)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanFeeds(rows)
}
// scanFeeds is a helper to scan multiple feed rows
func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
	var feeds []*Feed

	for rows.Next() {
		feed := &Feed{}
		var category, title, description, language, siteURL sql.NullString
		var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
		var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
		var avgPostFreqHrs sql.NullFloat64
		var publishStatus, publishAccount sql.NullString

		if err := rows.Scan(
			&feed.URL, &feed.Type, &category, &title, &description, &language, &siteURL,
			&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
			&etag, &lastModified,
			&feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
			&feed.Status, &feed.ErrorCount, &lastError, &lastErrorAt,
			&sourceURL, &sourceHost, &tld,
			&feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
			&feed.NoUpdate,
			&publishStatus, &publishAccount,
		); err != nil {
			continue
		}

		// Handle nullable fields
		if category.Valid {
			feed.Category = category.String
		} else {
			feed.Category = "main"
		}
		if title.Valid {
			feed.Title = title.String
		}
		if description.Valid {
			feed.Description = description.String
		}
		if language.Valid {
			feed.Language = language.String
		}
		if siteURL.Valid {
			feed.SiteURL = siteURL.String
		}
		if lastCrawledAt.Valid {
			feed.LastCrawledAt = lastCrawledAt.Time
		}
		if nextCrawlAt.Valid {
			feed.NextCrawlAt = nextCrawlAt.Time
		}
		if lastBuildDate.Valid {
			feed.LastBuildDate = lastBuildDate.Time
		}
		if etag.Valid {
			feed.ETag = etag.String
		}
		if lastModified.Valid {
			feed.LastModified = lastModified.String
		}
		if updatePeriod.Valid {
			feed.UpdatePeriod = updatePeriod.String
		}
		if lastError.Valid {
			feed.LastError = lastError.String
		}
		if lastErrorAt.Valid {
			feed.LastErrorAt = lastErrorAt.Time
		}
		if sourceURL.Valid {
			feed.SourceURL = sourceURL.String
		}
		if sourceHost.Valid {
			feed.SourceHost = sourceHost.String
		}
		if tld.Valid {
			feed.TLD = tld.String
		}
		if avgPostFreqHrs.Valid {
			feed.AvgPostFreqHrs = avgPostFreqHrs.Float64
		}
		if oldestItemDate.Valid {
			feed.OldestItemDate = oldestItemDate.Time
		}
		if newestItemDate.Valid {
			feed.NewestItemDate = newestItemDate.Time
		}
		if publishStatus.Valid {
			feed.PublishStatus = publishStatus.String
		} else {
			feed.PublishStatus = "held"
		}
		if publishAccount.Valid {
			feed.PublishAccount = publishAccount.String
		}

		feeds = append(feeds, feed)
	}

	return feeds, rows.Err()
}
// saveItem stores an item in SQLite (upsert by feedUrl + guid)
func (c *Crawler) saveItem(item *Item) error {
	// Serialize enclosure fields
	var enclosureUrl, enclosureType sql.NullString
	var enclosureLength sql.NullInt64
	if item.Enclosure != nil {
		enclosureUrl = sql.NullString{String: item.Enclosure.URL, Valid: item.Enclosure.URL != ""}
		enclosureType = sql.NullString{String: item.Enclosure.Type, Valid: item.Enclosure.Type != ""}
		enclosureLength = sql.NullInt64{Int64: item.Enclosure.Length, Valid: item.Enclosure.Length > 0}
	}

	// Serialize imageUrls as JSON
	var imageUrlsJSON sql.NullString
	if len(item.ImageURLs) > 0 {
		if data, err := json.Marshal(item.ImageURLs); err == nil {
			imageUrlsJSON = sql.NullString{String: string(data), Valid: true}
		}
	}

	_, err := c.db.Exec(`
		INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
			enclosureUrl, enclosureType, enclosureLength, imageUrls)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
		ON CONFLICT(feedUrl, guid) DO UPDATE SET
			title = excluded.title,
			link = excluded.link,
			description = excluded.description,
			content = excluded.content,
			author = excluded.author,
			pubDate = excluded.pubDate,
			updatedAt = excluded.updatedAt,
			enclosureUrl = excluded.enclosureUrl,
			enclosureType = excluded.enclosureType,
			enclosureLength = excluded.enclosureLength,
			imageUrls = excluded.imageUrls
	`,
		item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
		nullString(item.Description), nullString(item.Content), nullString(item.Author),
		nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
		enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON,
	)
	return err
}
// saveItems stores multiple items efficiently
func (c *Crawler) saveItems(items []*Item) error {
	if len(items) == 0 {
		return nil
	}

	tx, err := c.db.Begin()
	if err != nil {
		return err
	}
	defer tx.Rollback()

	stmt, err := tx.Prepare(`
		INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
			enclosureUrl, enclosureType, enclosureLength, imageUrls)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
		ON CONFLICT(feedUrl, guid) DO UPDATE SET
			title = excluded.title,
			link = excluded.link,
			description = excluded.description,
			content = excluded.content,
			author = excluded.author,
			pubDate = excluded.pubDate,
			updatedAt = excluded.updatedAt,
			enclosureUrl = excluded.enclosureUrl,
			enclosureType = excluded.enclosureType,
			enclosureLength = excluded.enclosureLength,
			imageUrls = excluded.imageUrls
	`)
	if err != nil {
		return err
	}
	defer stmt.Close()

	for _, item := range items {
		if item == nil || item.GUID == "" {
			continue // Skip nil items or items without GUID
		}

		// Serialize enclosure fields
		var enclosureUrl, enclosureType sql.NullString
		var enclosureLength sql.NullInt64
		if item.Enclosure != nil {
			enclosureUrl = sql.NullString{String: item.Enclosure.URL, Valid: item.Enclosure.URL != ""}
			enclosureType = sql.NullString{String: item.Enclosure.Type, Valid: item.Enclosure.Type != ""}
			enclosureLength = sql.NullInt64{Int64: item.Enclosure.Length, Valid: item.Enclosure.Length > 0}
		}

		// Serialize imageUrls as JSON
		var imageUrlsJSON sql.NullString
		if len(item.ImageURLs) > 0 {
			if data, err := json.Marshal(item.ImageURLs); err == nil {
				imageUrlsJSON = sql.NullString{String: string(data), Valid: true}
			}
		}

		_, err := stmt.Exec(
			item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
			nullString(item.Description), nullString(item.Content), nullString(item.Author),
			nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
			enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON,
		)
		if err != nil {
			continue // Skip failed items
		}
	}

	return tx.Commit()
}
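// saveExamplePodcastItem is a sketch (not part of the original file) of
// building an Item that carries a podcast enclosure and extracted image URLs,
// as described in the commit message; all values are illustrative.
func (c *Crawler) saveExamplePodcastItem() error {
	now := time.Now()
	return c.saveItems([]*Item{{
		FeedURL:      "example.com/podcast/feed",
		GUID:         "urn:example:episode-1",
		Title:        "Episode 1",
		Link:         "https://example.com/episodes/1",
		PubDate:      now,
		DiscoveredAt: now,
		Enclosure: &Enclosure{
			URL:    "https://example.com/audio/episode-1.mp3",
			Type:   "audio/mpeg",
			Length: 23456789,
		},
		ImageURLs: []string{"https://example.com/images/episode-1.jpg"},
	}})
}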
// GetItemsByFeed returns the most recent items for a feed, newest first, up to limit
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
	rows, err := c.db.Query(`
		SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
			enclosureUrl, enclosureType, enclosureLength, imageUrls,
			publishedAt, publishedUri
		FROM items
		WHERE feedUrl = ?
		ORDER BY pubDate DESC
		LIMIT ?
	`, feedURL, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanItems(rows)
}

// SearchItems performs a full-text search on items
func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
	rows, err := c.db.Query(`
		SELECT i.id, i.feedUrl, i.guid, i.title, i.link, i.description, i.content, i.author, i.pubDate, i.discoveredAt, i.updatedAt,
			i.enclosureUrl, i.enclosureType, i.enclosureLength, i.imageUrls,
			i.publishedAt, i.publishedUri
		FROM items i
		JOIN items_fts fts ON i.id = fts.rowid
		WHERE items_fts MATCH ?
		ORDER BY i.pubDate DESC
		LIMIT ?
	`, query, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanItems(rows)
}
// scanItems is a helper to scan multiple item rows
func scanItems(rows *sql.Rows) ([]*Item, error) {
	var items []*Item
	for rows.Next() {
		item := &Item{}
		var guid, title, link, description, content, author sql.NullString
		var pubDate, updatedAt, publishedAt sql.NullTime
		var enclosureUrl, enclosureType sql.NullString
		var enclosureLength sql.NullInt64
		var imageUrlsJSON sql.NullString
		var publishedUri sql.NullString

		if err := rows.Scan(
			&item.ID, &item.FeedURL, &guid, &title, &link,
			&description, &content, &author, &pubDate,
			&item.DiscoveredAt, &updatedAt,
			&enclosureUrl, &enclosureType, &enclosureLength, &imageUrlsJSON,
			&publishedAt, &publishedUri,
		); err != nil {
			continue
		}

		if guid.Valid {
			item.GUID = guid.String
		}
		if title.Valid {
			item.Title = title.String
		}
		if link.Valid {
			item.Link = link.String
		}
		if description.Valid {
			item.Description = description.String
		}
		if content.Valid {
			item.Content = content.String
		}
		if author.Valid {
			item.Author = author.String
		}
		if pubDate.Valid {
			item.PubDate = pubDate.Time
		}
		if updatedAt.Valid {
			item.UpdatedAt = updatedAt.Time
		}

		// Parse enclosure
		if enclosureUrl.Valid && enclosureUrl.String != "" {
			item.Enclosure = &Enclosure{
				URL:  enclosureUrl.String,
				Type: enclosureType.String,
			}
			if enclosureLength.Valid {
				item.Enclosure.Length = enclosureLength.Int64
			}
		}

		// Parse imageUrls JSON
		if imageUrlsJSON.Valid && imageUrlsJSON.String != "" {
			var urls []string
			if err := json.Unmarshal([]byte(imageUrlsJSON.String), &urls); err == nil {
				item.ImageURLs = urls
			}
		}

		if publishedAt.Valid {
			item.PublishedAt = publishedAt.Time
		}
		if publishedUri.Valid {
			item.PublishedUri = publishedUri.String
		}

		items = append(items, item)
	}

	return items, rows.Err()
}
// CleanupOldItems removes items older than 12 months
func (c *Crawler) CleanupOldItems() (int64, error) {
	cutoff := time.Now().AddDate(-1, 0, 0) // 12 months ago
	result, err := c.db.Exec(`
		DELETE FROM items WHERE pubDate < ? AND pubDate IS NOT NULL
	`, cutoff)
	if err != nil {
		return 0, err
	}
	return result.RowsAffected()
}
// processFeed parses and stores a feed with full metadata
func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
	// Fast path: check without lock
	if c.feedExists(feedURL) {
		return
	}

	c.feedsMu.Lock()
	defer c.feedsMu.Unlock()

	// Double-check after acquiring lock
	if c.feedExists(feedURL) {
		return
	}

	feedType := c.detectFeedType(body)
	now := time.Now()

	feed := &Feed{
		URL:           normalizeURL(feedURL),
		Type:          feedType,
		Category:      classifyFeed(feedURL),
		DiscoveredAt:  now,
		LastCrawledAt: now,
		Status:        "active",
		SourceHost:    sourceHost,
		TLD:           getTLD(sourceHost),
		ETag:          headers.Get("ETag"),
		LastModified:  headers.Get("Last-Modified"),
	}

	// Parse feed-specific metadata and items
	var items []*Item
	switch feedType {
	case "rss":
		items = c.parseRSSMetadata(body, feed)
	case "atom":
		items = c.parseAtomMetadata(body, feed)
	}

	// Refine category based on parsed title (e.g., "Comments on:")
	feed.Category = classifyFeedByTitle(feed.Title, feed.Category)

	// Calculate next crawl time
	feed.NextCrawlAt = c.calculateNextCrawl(feed)

	if err := c.saveFeed(feed); err != nil {
		return
	}

	// Save items
	if len(items) > 0 {
		c.saveItems(items)
	}
}
// addFeed adds a discovered feed URL (not yet fetched)
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
	// Fast path: check without lock
	if c.feedExists(feedURL) {
		return
	}

	c.feedsMu.Lock()
	defer c.feedsMu.Unlock()

	// Double-check after acquiring lock
	if c.feedExists(feedURL) {
		return
	}

	now := time.Now()
	normalizedURL := normalizeURL(feedURL)
	feed := &Feed{
		URL:          normalizedURL,
		Type:         feedType,
		Category:     classifyFeed(feedURL),
		DiscoveredAt: now,
		Status:       "active",
		SourceURL:    normalizeURL(sourceURL),
		SourceHost:   sourceHost,
		TLD:          getTLD(sourceHost),
		NextCrawlAt:  now, // Should be crawled immediately
	}

	if err := c.saveFeed(feed); err != nil {
		return
	}
}
// CheckFeed performs a conditional request to check if a feed has been updated.
// Returns: changed (bool), error
func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
	atomic.AddInt32(&c.feedsChecked, 1)

	// Try different scheme/www combinations since we store URLs without scheme
	urlVariants := []string{
		"https://" + feed.URL,
		"http://" + feed.URL,
		"https://www." + feed.URL,
		"http://www." + feed.URL,
	}

	var resp *http.Response
	var err error
	var successURL string

	for _, tryURL := range urlVariants {
		req, reqErr := http.NewRequest("GET", tryURL, nil)
		if reqErr != nil {
			continue
		}

		req.Header.Set("User-Agent", c.UserAgent)

		// Add conditional headers if we have them
		if feed.ETag != "" {
			req.Header.Set("If-None-Match", feed.ETag)
		}
		if feed.LastModified != "" {
			req.Header.Set("If-Modified-Since", feed.LastModified)
		}

		resp, err = c.client.Do(req)
		if err == nil {
			successURL = tryURL
			break
		}
	}

	_ = successURL // May be used later for logging/debugging

	// If no request succeeded, resp will be nil
	if resp == nil {
		if err == nil {
			err = fmt.Errorf("all URL variants failed")
		}
		now := time.Now()
		feed.LastCrawledAt = now
		feed.ErrorCount++
		feed.NoUpdate++
		feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
		feed.LastError = err.Error()
		feed.LastErrorAt = now
		feed.Status = "error"
		c.saveFeed(feed)
		return false, err
	}
	defer resp.Body.Close()

	now := time.Now()
	feed.LastCrawledAt = now

	// 304 Not Modified - feed hasn't changed
	if resp.StatusCode == http.StatusNotModified {
		feed.NoUpdate++
		// Adaptive backoff: 100s base + 100s per consecutive no-change
		feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
		feed.ErrorCount = 0
		feed.LastError = ""
		feed.Status = "active"
		c.saveFeed(feed)
		return false, nil
	}

	// Non-200 response
	if resp.StatusCode != http.StatusOK {
		feed.ErrorCount++
		feed.NoUpdate++
		feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
		feed.LastError = resp.Status
		feed.LastErrorAt = now
		if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusGone {
			feed.Status = "dead"
		} else {
			feed.Status = "error"
		}
		c.saveFeed(feed)
		return false, nil
	}

	// 200 OK - feed has new content
	bodyBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		feed.ErrorCount++
		feed.NoUpdate++
		feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
		feed.LastError = err.Error()
		feed.LastErrorAt = now
		feed.Status = "error"
		c.saveFeed(feed)
		return false, err
	}

	body := string(bodyBytes)

	// Update cache headers
	feed.ETag = resp.Header.Get("ETag")
	feed.LastModified = resp.Header.Get("Last-Modified")

	// Re-detect type and parse metadata
	feedType := c.detectFeedType(body)
	feed.Type = feedType

	var items []*Item
	switch feedType {
	case "rss":
		items = c.parseRSSMetadata(body, feed)
	case "atom":
		items = c.parseAtomMetadata(body, feed)
	}

	// Content changed - reset backoff
	feed.NoUpdate = 0
	feed.NextCrawlAt = now.Add(100 * time.Second)
	feed.ErrorCount = 0
	feed.LastError = ""
	feed.Status = "active"
	c.saveFeed(feed)

	// Save items
	if len(items) > 0 {
		c.saveItems(items)
	}

	return true, nil
}
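// nextCheckDelaySketch is a sketch (not part of the original file) that
// mirrors the backoff arithmetic inlined in CheckFeed: 100s base plus 100s
// per consecutive unchanged check, e.g. noUpdate=1 -> 200s, 5 -> 600s,
// dropping back to a flat 100s once new content is seen.
func nextCheckDelaySketch(noUpdate int) time.Duration {
	return time.Duration(100+100*noUpdate) * time.Second
}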
// SetPublishStatus sets the publish status for a feed ('held', 'pass', 'fail').
// If status is 'pass', the account handle is also set (auto-derived if empty).
func (c *Crawler) SetPublishStatus(feedURL, status, account string) error {
	feedURL = normalizeURL(feedURL)

	// Auto-derive account if passing and not provided
	if status == "pass" && account == "" {
		account = DeriveHandleFromFeed(feedURL)
	}

	_, err := c.db.Exec(`
		UPDATE feeds SET publishStatus = ?, publishAccount = ? WHERE url = ?
	`, status, nullString(account), feedURL)
	return err
}
// GetFeedsByPublishStatus returns all feeds with a specific publish status
func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) {
	rows, err := c.db.Query(`
		SELECT url, type, category, title, description, language, siteUrl,
			discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
			etag, lastModified,
			ttlMinutes, updatePeriod, updateFreq,
			status, errorCount, lastError, lastErrorAt,
			sourceUrl, sourceHost, tld,
			itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
			noUpdate,
			publishStatus, publishAccount
		FROM feeds
		WHERE publishStatus = ?
	`, status)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanFeeds(rows)
}
// GetPublishCandidates returns feeds that are held for review and have items
func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
	rows, err := c.db.Query(`
		SELECT url, type, category, title, description, language, siteUrl,
			discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
			etag, lastModified,
			ttlMinutes, updatePeriod, updateFreq,
			status, errorCount, lastError, lastErrorAt,
			sourceUrl, sourceHost, tld,
			itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
			noUpdate,
			publishStatus, publishAccount
		FROM feeds
		WHERE publishStatus = 'held' AND itemCount > 0 AND status = 'active'
		ORDER BY itemCount DESC
		LIMIT ?
	`, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanFeeds(rows)
}
// GetUnpublishedItems returns items for a feed that haven't been published yet
func (c *Crawler) GetUnpublishedItems(feedURL string, limit int) ([]*Item, error) {
	rows, err := c.db.Query(`
		SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
			enclosureUrl, enclosureType, enclosureLength, imageUrls,
			publishedAt, publishedUri
		FROM items
		WHERE feedUrl = ? AND publishedAt IS NULL
		ORDER BY pubDate ASC
		LIMIT ?
	`, feedURL, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanItems(rows)
}
// MarkItemPublished marks an item as published with the given URI
func (c *Crawler) MarkItemPublished(itemID int64, uri string) error {
	_, err := c.db.Exec(`
		UPDATE items SET publishedAt = datetime('now'), publishedUri = ? WHERE id = ?
	`, uri, itemID)
	return err
}

// GetUnpublishedItemCount returns the count of unpublished items for a feed
func (c *Crawler) GetUnpublishedItemCount(feedURL string) (int, error) {
	var count int
	err := c.db.QueryRow(`
		SELECT COUNT(*) FROM items WHERE feedUrl = ? AND publishedAt IS NULL
	`, feedURL).Scan(&count)
	return count, err
}
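// Publishing flow (sketch, not part of this file): the commit message
// describes publisher.go deriving a deterministic rkey from
// SHA256(guid + discoveredAt), so re-publishing the same item addresses the
// same PDS record instead of creating a duplicate. A minimal sketch, assuming
// hex encoding and an RFC 3339 timestamp (the real encoding may differ):
//
//	sum := sha256.Sum256([]byte(item.GUID + item.DiscoveredAt.UTC().Format(time.RFC3339)))
//	rkey := hex.EncodeToString(sum[:])
//
// For a feed whose publishStatus is "pass", the publisher would then drain
// unpublished items and record the resulting AT URIs; uri below stands in for
// the value returned by the PDS when the record is created:
//
//	items, _ := c.GetUnpublishedItems(feed.URL, 25)
//	for _, item := range items {
//		// post item to the PDS under feed.PublishAccount ...
//		_ = c.MarkItemPublished(item.ID, uri)
//	}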