From 75835d771d89064e55d8cdbc575841aee6ad5ae5 Mon Sep 17 00:00:00 2001 From: primal Date: Wed, 28 Jan 2026 15:30:02 -0500 Subject: [PATCH] Add AT Protocol publishing, media support, and SQLite stability Publishing: - Add publisher.go for posting feed items to AT Protocol PDS - Support deterministic rkeys from SHA256(guid + discoveredAt) - Handle multiple URLs in posts with facets for each link - Image embed support (app.bsky.embed.images) for up to 4 images - External embed with thumbnail fallback - Podcast/audio enclosure URLs included in post text Media extraction: - Parse RSS enclosures (audio, video, images) - Extract Media RSS content and thumbnails - Extract images from HTML content in descriptions - Store enclosure and imageUrls in items table SQLite stability improvements: - Add synchronous=NORMAL and wal_autocheckpoint pragmas - Connection pool tuning (idle conns, max lifetime) - Periodic WAL checkpoint every 5 minutes - Hourly integrity checks with PRAGMA quick_check - Daily hot backup via VACUUM INTO - Docker stop_grace_period: 30s for graceful shutdown Dashboard: - Feed publishing UI and API endpoints - Account creation with invite codes Co-Authored-By: Claude Opus 4.5 --- crawler.go | 63 +- dashboard.go | 1635 ++++++++++++++++++++++++++++++++++++++++-- db.go | 48 +- docker-compose.yml | 1 + domain.go | 22 +- feed.go | 435 ++++++++--- main.go | 28 +- parser.go | 192 ++++- publisher.go | 909 +++++++++++++++++++++++ static/dashboard.css | 28 + static/dashboard.js | 997 +++++++++++++++----------- 11 files changed, 3723 insertions(+), 635 deletions(-) create mode 100644 publisher.go diff --git a/crawler.go b/crawler.go index 1b54c09..bd43d6e 100644 --- a/crawler.go +++ b/crawler.go @@ -61,6 +61,13 @@ func NewCrawler(dbPath string) (*Crawler, error) { func (c *Crawler) Close() error { if c.db != nil { + // Checkpoint WAL to merge it back into main database before closing + // This prevents corruption if the container is stopped mid-write + fmt.Println("Checkpointing WAL...") + if _, err := c.db.Exec("PRAGMA wal_checkpoint(TRUNCATE)"); err != nil { + fmt.Printf("WAL checkpoint warning: %v\n", err) + } + fmt.Println("Closing database...") return c.db.Close() } return nil @@ -87,6 +94,56 @@ func (c *Crawler) StartCleanupLoop() { } } +// StartMaintenanceLoop performs periodic database maintenance +// - WAL checkpoint every 5 minutes to prevent WAL bloat and reduce corruption risk +// - Quick integrity check every hour to detect issues early +// - Hot backup every 24 hours for recovery +func (c *Crawler) StartMaintenanceLoop() { + checkpointTicker := time.NewTicker(5 * time.Minute) + integrityTicker := time.NewTicker(1 * time.Hour) + backupTicker := time.NewTicker(24 * time.Hour) + defer checkpointTicker.Stop() + defer integrityTicker.Stop() + defer backupTicker.Stop() + + for { + select { + case <-checkpointTicker.C: + // Passive checkpoint - doesn't block writers + if _, err := c.db.Exec("PRAGMA wal_checkpoint(PASSIVE)"); err != nil { + fmt.Printf("WAL checkpoint error: %v\n", err) + } + + case <-integrityTicker.C: + // Quick check is faster than full integrity_check + var result string + if err := c.db.QueryRow("PRAGMA quick_check").Scan(&result); err != nil { + fmt.Printf("Integrity check error: %v\n", err) + } else if result != "ok" { + fmt.Printf("WARNING: Database integrity issue detected: %s\n", result) + } + + case <-backupTicker.C: + c.createBackup() + } + } +} + +// createBackup creates a hot backup of the database using SQLite's backup API +func (c *Crawler) createBackup() { 
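	// Sketch (not from this patch): SQLite's VACUUM INTO fails when the destination
	// file already exists and is non-empty, so a daily backup that always targets
	// "feeds/feeds.db.backup" needs the stale snapshot cleared first. Assumes the
	// "os" import and that replacing the previous day's backup is acceptable.
	if err := os.Remove("feeds/feeds.db.backup"); err != nil && !os.IsNotExist(err) {
		fmt.Printf("Backup cleanup warning: %v\n", err)
	}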
+ backupPath := "feeds/feeds.db.backup" + fmt.Println("Creating database backup...") + + // Use SQLite's online backup via VACUUM INTO (available in SQLite 3.27+) + // This creates a consistent snapshot without blocking writers + if _, err := c.db.Exec("VACUUM INTO ?", backupPath); err != nil { + fmt.Printf("Backup error: %v\n", err) + return + } + + fmt.Printf("Backup created: %s\n", backupPath) +} + // StartCrawlLoop runs the domain crawling loop independently func (c *Crawler) StartCrawlLoop() { numWorkers := runtime.NumCPU() @@ -113,9 +170,9 @@ func (c *Crawler) StartCrawlLoop() { }() } - const fetchSize = 100 + const fetchSize = 1000 for { - domains, err := c.GetUncheckedDomainsRandom(fetchSize) + domains, err := c.GetUncheckedDomains(fetchSize) if err != nil { fmt.Printf("Error fetching domains: %v\n", err) } @@ -155,7 +212,7 @@ func (c *Crawler) StartCheckLoop() { }() } - const fetchSize = 100 + const fetchSize = 1000 for { feeds, err := c.GetFeedsDueForCheck(fetchSize) if err != nil { diff --git a/dashboard.go b/dashboard.go index 99d1187..83afdea 100644 --- a/dashboard.go +++ b/dashboard.go @@ -251,6 +251,66 @@ func (c *Crawler) StartDashboard(addr string) error { http.HandleFunc("/api/search", func(w http.ResponseWriter, r *http.Request) { c.handleAPISearch(w, r) }) + http.HandleFunc("/api/tlds", func(w http.ResponseWriter, r *http.Request) { + c.handleAPITLDs(w, r) + }) + http.HandleFunc("/api/tldDomains", func(w http.ResponseWriter, r *http.Request) { + c.handleAPITLDDomains(w, r) + }) + http.HandleFunc("/api/revisitDomain", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIRevisitDomain(w, r) + }) + http.HandleFunc("/api/priorityCrawl", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIPriorityCrawl(w, r) + }) + http.HandleFunc("/api/checkFeed", func(w http.ResponseWriter, r *http.Request) { + c.handleAPICheckFeed(w, r) + }) + http.HandleFunc("/api/domainsByStatus", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIDomainsByStatus(w, r) + }) + http.HandleFunc("/api/feedsByStatus", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIFeedsByStatus(w, r) + }) + http.HandleFunc("/api/filter", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIFilter(w, r) + }) + http.HandleFunc("/api/enablePublish", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIEnablePublish(w, r) + }) + http.HandleFunc("/api/disablePublish", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIDisablePublish(w, r) + }) + http.HandleFunc("/api/publishEnabled", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIPublishEnabled(w, r) + }) + http.HandleFunc("/api/publishCandidates", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIPublishCandidates(w, r) + }) + http.HandleFunc("/api/setPublishStatus", func(w http.ResponseWriter, r *http.Request) { + c.handleAPISetPublishStatus(w, r) + }) + http.HandleFunc("/api/unpublishedItems", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIUnpublishedItems(w, r) + }) + http.HandleFunc("/api/testPublish", func(w http.ResponseWriter, r *http.Request) { + c.handleAPITestPublish(w, r) + }) + http.HandleFunc("/api/deriveHandle", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIDeriveHandle(w, r) + }) + http.HandleFunc("/api/publishFeed", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIPublishFeed(w, r) + }) + http.HandleFunc("/api/createAccount", func(w http.ResponseWriter, r *http.Request) { + c.handleAPICreateAccount(w, r) + }) + 
http.HandleFunc("/api/publishFeedFull", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIPublishFeedFull(w, r) + }) + http.HandleFunc("/api/updateProfile", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIUpdateProfile(w, r) + }) http.HandleFunc("/static/", func(w http.ResponseWriter, r *http.Request) { http.StripPrefix("/static/", http.FileServer(http.Dir("static"))).ServeHTTP(w, r) }) @@ -301,7 +361,8 @@ func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) { } rows, err := c.db.Query(` - SELECT url, title, type FROM feeds + SELECT url, title, type, status, errorCount, lastError, itemCount + FROM feeds WHERE sourceHost = ? ORDER BY url ASC LIMIT 1000 @@ -313,21 +374,38 @@ func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) { defer rows.Close() type FeedInfo struct { - URL string `json:"url"` - Title string `json:"title"` - Type string `json:"type"` + URL string `json:"url"` + Title string `json:"title"` + Type string `json:"type"` + Status string `json:"status,omitempty"` + ErrorCount int `json:"error_count,omitempty"` + LastError string `json:"last_error,omitempty"` + ItemCount int `json:"item_count,omitempty"` } var feeds []FeedInfo for rows.Next() { var f FeedInfo - var title sql.NullString - if err := rows.Scan(&f.URL, &title, &f.Type); err != nil { + var title, status, lastError sql.NullString + var errorCount, itemCount sql.NullInt64 + if err := rows.Scan(&f.URL, &title, &f.Type, &status, &errorCount, &lastError, &itemCount); err != nil { continue } if title.Valid { f.Title = title.String } + if status.Valid { + f.Status = status.String + } + if errorCount.Valid { + f.ErrorCount = int(errorCount.Int64) + } + if lastError.Valid { + f.LastError = lastError.String + } + if itemCount.Valid { + f.ItemCount = int(itemCount.Int64) + } feeds = append(feeds, f) } @@ -483,21 +561,46 @@ type SearchResult struct { } type SearchFeed struct { - URL string `json:"url"` - Title string `json:"title"` - Description string `json:"description"` - Type string `json:"type"` - SourceHost string `json:"source_host"` - Status string `json:"status"` + URL string `json:"url"` + Type string `json:"type"` + Category string `json:"category"` + Title string `json:"title"` + Description string `json:"description"` + Language string `json:"language"` + SiteURL string `json:"site_url"` + DiscoveredAt string `json:"discovered_at"` + LastCrawledAt string `json:"last_crawled_at"` + NextCrawlAt string `json:"next_crawl_at"` + LastBuildDate string `json:"last_build_date"` + TTLMinutes int `json:"ttl_minutes"` + UpdatePeriod string `json:"update_period"` + UpdateFreq int `json:"update_freq"` + Status string `json:"status"` + ErrorCount int `json:"error_count"` + LastError string `json:"last_error"` + LastErrorAt string `json:"last_error_at"` + SourceURL string `json:"source_url"` + SourceHost string `json:"source_host"` + TLD string `json:"tld"` + ItemCount int `json:"item_count"` + AvgPostFreqHrs float64 `json:"avg_post_freq_hrs"` + OldestItemDate string `json:"oldest_item_date"` + NewestItemDate string `json:"newest_item_date"` + NoUpdate bool `json:"no_update"` } type SearchItem struct { - ID int64 `json:"id"` - Title string `json:"title"` - Link string `json:"link"` - Description string `json:"description"` - Author string `json:"author"` - PubDate string `json:"pub_date"` + ID int64 `json:"id"` + FeedURL string `json:"feed_url"` + GUID string `json:"guid"` + Title string `json:"title"` + Link string `json:"link"` + Description string 
`json:"description"` + Content string `json:"content"` + Author string `json:"author"` + PubDate string `json:"pub_date"` + DiscoveredAt string `json:"discovered_at"` + UpdatedAt string `json:"updated_at"` } func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { @@ -518,9 +621,91 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { // Results map: feedURL -> SearchResult results := make(map[string]*SearchResult) - // Search feeds + // Helper to scan feed row into SearchFeed + scanFeed := func(rows *sql.Rows) (string, SearchFeed, bool) { + var url string + var feedType, category, title, description, language, siteUrl sql.NullString + var discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate sql.NullString + var ttlMinutes, updateFreq, errorCount, itemCount sql.NullInt64 + var updatePeriod, status, lastError, lastErrorAt sql.NullString + var sourceUrl, sourceHost, tld sql.NullString + var avgPostFreqHrs sql.NullFloat64 + var oldestItemDate, newestItemDate sql.NullString + var noUpdate sql.NullInt64 + + if err := rows.Scan(&url, &feedType, &category, &title, &description, &language, &siteUrl, + &discoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate, + &ttlMinutes, &updatePeriod, &updateFreq, + &status, &errorCount, &lastError, &lastErrorAt, + &sourceUrl, &sourceHost, &tld, + &itemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate, &noUpdate); err != nil { + return "", SearchFeed{}, false + } + cat := category.String + if cat == "" { + cat = "main" + } + return url, SearchFeed{ + URL: url, + Type: feedType.String, + Category: cat, + Title: title.String, + Description: description.String, + Language: language.String, + SiteURL: siteUrl.String, + DiscoveredAt: discoveredAt.String, + LastCrawledAt: lastCrawledAt.String, + NextCrawlAt: nextCrawlAt.String, + LastBuildDate: lastBuildDate.String, + TTLMinutes: int(ttlMinutes.Int64), + UpdatePeriod: updatePeriod.String, + UpdateFreq: int(updateFreq.Int64), + Status: status.String, + ErrorCount: int(errorCount.Int64), + LastError: lastError.String, + LastErrorAt: lastErrorAt.String, + SourceURL: sourceUrl.String, + SourceHost: sourceHost.String, + TLD: tld.String, + ItemCount: int(itemCount.Int64), + AvgPostFreqHrs: avgPostFreqHrs.Float64, + OldestItemDate: oldestItemDate.String, + NewestItemDate: newestItemDate.String, + NoUpdate: noUpdate.Int64 != 0, + }, true + } + + // Search feeds by sourceHost (LIKE search for domain matching) + hostRows, err := c.db.Query(` + SELECT url, type, category, title, description, language, siteUrl, + discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate, + ttlMinutes, updatePeriod, updateFreq, + status, errorCount, lastError, lastErrorAt, + sourceUrl, sourceHost, tld, + itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate, noUpdate + FROM feeds + WHERE sourceHost LIKE ? OR url LIKE ? + LIMIT ? 
+ `, "%"+query+"%", "%"+query+"%", limit) + if err == nil { + defer hostRows.Close() + for hostRows.Next() { + if url, feed, ok := scanFeed(hostRows); ok { + if _, exists := results[url]; !exists { + results[url] = &SearchResult{Feed: feed, Items: []SearchItem{}} + } + } + } + } + + // Search feeds via FTS (title, description, url content) feedRows, err := c.db.Query(` - SELECT f.url, f.title, f.description, f.type, f.sourceHost, f.status + SELECT f.url, f.type, f.category, f.title, f.description, f.language, f.siteUrl, + f.discoveredAt, f.lastCrawledAt, f.nextCrawlAt, f.lastBuildDate, + f.ttlMinutes, f.updatePeriod, f.updateFreq, + f.status, f.errorCount, f.lastError, f.lastErrorAt, + f.sourceUrl, f.sourceHost, f.tld, + f.itemCount, f.avgPostFreqHrs, f.oldestItemDate, f.newestItemDate, f.noUpdate FROM feeds f JOIN feeds_fts fts ON f.rowid = fts.rowid WHERE feeds_fts MATCH ? @@ -529,28 +714,17 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { if err == nil { defer feedRows.Close() for feedRows.Next() { - var url string - var title, description, feedType, sourceHost, status sql.NullString - if err := feedRows.Scan(&url, &title, &description, &feedType, &sourceHost, &status); err != nil { - continue - } - results[url] = &SearchResult{ - Feed: SearchFeed{ - URL: url, - Title: title.String, - Description: description.String, - Type: feedType.String, - SourceHost: sourceHost.String, - Status: status.String, - }, - Items: []SearchItem{}, + if url, feed, ok := scanFeed(feedRows); ok { + if _, exists := results[url]; !exists { + results[url] = &SearchResult{Feed: feed, Items: []SearchItem{}} + } } } } // Search items itemRows, err := c.db.Query(` - SELECT i.id, i.feedUrl, i.title, i.link, i.description, i.author, i.pubDate + SELECT i.id, i.feedUrl, i.guid, i.title, i.link, i.description, i.content, i.author, i.pubDate, i.discoveredAt, i.updatedAt FROM items i JOIN items_fts fts ON i.id = fts.rowid WHERE items_fts MATCH ? @@ -562,18 +736,23 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { for itemRows.Next() { var id int64 var feedUrl string - var title, link, description, author, pubDate sql.NullString - if err := itemRows.Scan(&id, &feedUrl, &title, &link, &description, &author, &pubDate); err != nil { + var guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt sql.NullString + if err := itemRows.Scan(&id, &feedUrl, &guid, &title, &link, &description, &content, &author, &pubDate, &discoveredAt, &updatedAt); err != nil { continue } item := SearchItem{ - ID: id, - Title: title.String, - Link: link.String, - Description: description.String, - Author: author.String, - PubDate: pubDate.String, + ID: id, + FeedURL: feedUrl, + GUID: guid.String, + Title: title.String, + Link: link.String, + Description: description.String, + Content: content.String, + Author: author.String, + PubDate: pubDate.String, + DiscoveredAt: discoveredAt.String, + UpdatedAt: updatedAt.String, } // Add to existing result or create new one @@ -581,20 +760,62 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { result.Items = append(result.Items, item) } else { // Fetch feed info for this item's feed - var fTitle, fDesc, fType, fHost, fStatus sql.NullString - c.db.QueryRow(` - SELECT title, description, type, sourceHost, status - FROM feeds WHERE url = ? 
- `, feedUrl).Scan(&fTitle, &fDesc, &fType, &fHost, &fStatus) + var fType, fCategory, fTitle, fDesc, fLang, fSiteUrl sql.NullString + var fDiscoveredAt, fLastCrawledAt, fNextCrawlAt, fLastBuildDate sql.NullString + var fTTLMinutes, fUpdateFreq, fErrorCount, fItemCount sql.NullInt64 + var fUpdatePeriod, fStatus, fLastError, fLastErrorAt sql.NullString + var fSourceUrl, fSourceHost, fTLD sql.NullString + var fAvgPostFreqHrs sql.NullFloat64 + var fOldestItemDate, fNewestItemDate sql.NullString + var fNoUpdate sql.NullInt64 + c.db.QueryRow(` + SELECT type, category, title, description, language, siteUrl, + discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate, + ttlMinutes, updatePeriod, updateFreq, + status, errorCount, lastError, lastErrorAt, + sourceUrl, sourceHost, tld, + itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate, noUpdate + FROM feeds WHERE url = ? + `, feedUrl).Scan(&fType, &fCategory, &fTitle, &fDesc, &fLang, &fSiteUrl, + &fDiscoveredAt, &fLastCrawledAt, &fNextCrawlAt, &fLastBuildDate, + &fTTLMinutes, &fUpdatePeriod, &fUpdateFreq, + &fStatus, &fErrorCount, &fLastError, &fLastErrorAt, + &fSourceUrl, &fSourceHost, &fTLD, + &fItemCount, &fAvgPostFreqHrs, &fOldestItemDate, &fNewestItemDate, &fNoUpdate) + + fCat := fCategory.String + if fCat == "" { + fCat = "main" + } results[feedUrl] = &SearchResult{ Feed: SearchFeed{ - URL: feedUrl, - Title: fTitle.String, - Description: fDesc.String, - Type: fType.String, - SourceHost: fHost.String, - Status: fStatus.String, + URL: feedUrl, + Type: fType.String, + Category: fCat, + Title: fTitle.String, + Description: fDesc.String, + Language: fLang.String, + SiteURL: fSiteUrl.String, + DiscoveredAt: fDiscoveredAt.String, + LastCrawledAt: fLastCrawledAt.String, + NextCrawlAt: fNextCrawlAt.String, + LastBuildDate: fLastBuildDate.String, + TTLMinutes: int(fTTLMinutes.Int64), + UpdatePeriod: fUpdatePeriod.String, + UpdateFreq: int(fUpdateFreq.Int64), + Status: fStatus.String, + ErrorCount: int(fErrorCount.Int64), + LastError: fLastError.String, + LastErrorAt: fLastErrorAt.String, + SourceURL: fSourceUrl.String, + SourceHost: fSourceHost.String, + TLD: fTLD.String, + ItemCount: int(fItemCount.Int64), + AvgPostFreqHrs: fAvgPostFreqHrs.Float64, + OldestItemDate: fOldestItemDate.String, + NewestItemDate: fNewestItemDate.String, + NoUpdate: fNoUpdate.Int64 != 0, }, Items: []SearchItem{item}, } @@ -612,6 +833,1279 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { json.NewEncoder(w).Encode(resultList) } +func (c *Crawler) handleAPIDomainsByStatus(w http.ResponseWriter, r *http.Request) { + status := r.URL.Query().Get("status") + if status == "" { + http.Error(w, "status parameter required", http.StatusBadRequest) + return + } + + limit := 100 + offset := 0 + if l := r.URL.Query().Get("limit"); l != "" { + fmt.Sscanf(l, "%d", &limit) + if limit > 500 { + limit = 500 + } + } + if o := r.URL.Query().Get("offset"); o != "" { + fmt.Sscanf(o, "%d", &offset) + } + + rows, err := c.db.Query(` + SELECT d.host, d.tld, d.status, d.lastError, COALESCE(f.feed_count, 0) as feed_count + FROM domains d + LEFT JOIN ( + SELECT sourceHost, COUNT(*) as feed_count + FROM feeds + GROUP BY sourceHost + ) f ON d.host = f.sourceHost + WHERE d.status = ? + ORDER BY d.host ASC + LIMIT ? OFFSET ? 
+ `, status, limit, offset) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + defer rows.Close() + + type DomainInfo struct { + Host string `json:"host"` + TLD string `json:"tld"` + Status string `json:"status"` + LastError string `json:"last_error,omitempty"` + FeedCount int `json:"feed_count"` + } + + var domains []DomainInfo + for rows.Next() { + var d DomainInfo + var tld, lastError sql.NullString + if err := rows.Scan(&d.Host, &tld, &d.Status, &lastError, &d.FeedCount); err != nil { + continue + } + if tld.Valid { + d.TLD = tld.String + } + if lastError.Valid { + d.LastError = lastError.String + } + domains = append(domains, d) + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(domains) +} + +func (c *Crawler) handleAPIFeedsByStatus(w http.ResponseWriter, r *http.Request) { + status := r.URL.Query().Get("status") + if status == "" { + http.Error(w, "status parameter required", http.StatusBadRequest) + return + } + + limit := 100 + offset := 0 + if l := r.URL.Query().Get("limit"); l != "" { + fmt.Sscanf(l, "%d", &limit) + if limit > 500 { + limit = 500 + } + } + if o := r.URL.Query().Get("offset"); o != "" { + fmt.Sscanf(o, "%d", &offset) + } + + rows, err := c.db.Query(` + SELECT url, title, type, sourceHost, tld, status, errorCount, lastError, itemCount + FROM feeds + WHERE status = ? + ORDER BY url ASC + LIMIT ? OFFSET ? + `, status, limit, offset) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + defer rows.Close() + + type FeedInfo struct { + URL string `json:"url"` + Title string `json:"title,omitempty"` + Type string `json:"type"` + SourceHost string `json:"source_host"` + TLD string `json:"tld"` + Status string `json:"status"` + ErrorCount int `json:"error_count,omitempty"` + LastError string `json:"last_error,omitempty"` + ItemCount int `json:"item_count,omitempty"` + } + + var feeds []FeedInfo + for rows.Next() { + var f FeedInfo + var title, sourceHost, tld, lastError sql.NullString + var errorCount, itemCount sql.NullInt64 + if err := rows.Scan(&f.URL, &title, &f.Type, &sourceHost, &tld, &f.Status, &errorCount, &lastError, &itemCount); err != nil { + continue + } + if title.Valid { + f.Title = title.String + } + if sourceHost.Valid { + f.SourceHost = sourceHost.String + } + if tld.Valid { + f.TLD = tld.String + } + if errorCount.Valid { + f.ErrorCount = int(errorCount.Int64) + } + if lastError.Valid { + f.LastError = lastError.String + } + if itemCount.Valid { + f.ItemCount = int(itemCount.Int64) + } + feeds = append(feeds, f) + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(feeds) +} + +func (c *Crawler) handleAPIRevisitDomain(w http.ResponseWriter, r *http.Request) { + host := r.URL.Query().Get("host") + if host == "" { + http.Error(w, "host parameter required", http.StatusBadRequest) + return + } + + _, err := c.db.Exec(` + UPDATE domains SET status = 'unchecked', lastError = NULL + WHERE host = ? 
+ `, host) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]string{"status": "queued", "host": host}) +} + +// handleAPIPriorityCrawl immediately crawls a domain (adds it if not exists) +func (c *Crawler) handleAPIPriorityCrawl(w http.ResponseWriter, r *http.Request) { + host := r.URL.Query().Get("host") + if host == "" { + http.Error(w, "host parameter required", http.StatusBadRequest) + return + } + + host = normalizeHost(host) + + // Add domain if it doesn't exist, or reset to unchecked + _, err := c.db.Exec(` + INSERT INTO domains (host, status, discoveredAt, tld) + VALUES (?, 'unchecked', datetime('now'), ?) + ON CONFLICT(host) DO UPDATE SET status = 'unchecked', lastError = NULL + `, host, getTLD(host)) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + // Crawl synchronously + fmt.Printf("Priority crawl: %s\n", host) + feedsFound, crawlErr := c.crawlHost(host) + + errStr := "" + if crawlErr != nil { + errStr = crawlErr.Error() + } + + // Mark as crawled + c.markDomainCrawled(host, feedsFound, errStr) + + // Get the feeds we found + feeds, _ := c.GetFeedsByHost(host) + + type FeedSummary struct { + URL string `json:"url"` + Title string `json:"title"` + Type string `json:"type"` + Category string `json:"category"` + Status string `json:"status"` + } + var feedSummaries []FeedSummary + for _, f := range feeds { + feedSummaries = append(feedSummaries, FeedSummary{ + URL: f.URL, + Title: f.Title, + Type: f.Type, + Category: f.Category, + Status: f.Status, + }) + } + + result := map[string]interface{}{ + "host": host, + "feeds_found": feedsFound, + "feeds": feedSummaries, + } + if crawlErr != nil { + result["error"] = crawlErr.Error() + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(result) +} + +// handleAPICheckFeed immediately checks a feed and returns items +func (c *Crawler) handleAPICheckFeed(w http.ResponseWriter, r *http.Request) { + feedURL := r.URL.Query().Get("url") + if feedURL == "" { + http.Error(w, "url parameter required", http.StatusBadRequest) + return + } + force := r.URL.Query().Get("force") == "true" + + feedURL = normalizeURL(feedURL) + + // Get the feed + feed, err := c.getFeed(feedURL) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + if feed == nil { + http.Error(w, "feed not found", http.StatusNotFound) + return + } + + // Clear cache headers if force is requested + if force { + feed.ETag = "" + feed.LastModified = "" + } + + // Force check the feed + fmt.Printf("Force check feed: %s (force=%v)\n", feedURL, force) + changed, checkErr := c.CheckFeed(feed) + + // Get updated feed info + feed, _ = c.getFeed(feedURL) + + // Get items + items, _ := c.GetItemsByFeed(feedURL, 20) + + type ItemSummary struct { + Title string `json:"title"` + Link string `json:"link"` + PubDate string `json:"pub_date,omitempty"` + Author string `json:"author,omitempty"` + } + var itemSummaries []ItemSummary + for _, item := range items { + is := ItemSummary{ + Title: item.Title, + Link: item.Link, + Author: item.Author, + } + if !item.PubDate.IsZero() { + is.PubDate = item.PubDate.Format("2006-01-02 15:04") + } + itemSummaries = append(itemSummaries, is) + } + + result := map[string]interface{}{ + "url": feedURL, + "title": feed.Title, + "type": feed.Type, + "category": feed.Category, + "status": feed.Status, + "changed": 
changed, + "itemCount": feed.ItemCount, + "items": itemSummaries, + } + if checkErr != nil { + result["error"] = checkErr.Error() + } + if feed.LastError != "" { + result["lastError"] = feed.LastError + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(result) +} + +// handleAPIFilter handles flexible filtering with stackable parameters +func (c *Crawler) handleAPIFilter(w http.ResponseWriter, r *http.Request) { + tld := r.URL.Query().Get("tld") + domain := r.URL.Query().Get("domain") + feedStatus := r.URL.Query().Get("feedStatus") + domainStatus := r.URL.Query().Get("domainStatus") + show := r.URL.Query().Get("show") // "feeds" or "domains" + + limit := 100 + offset := 0 + if l := r.URL.Query().Get("limit"); l != "" { + fmt.Sscanf(l, "%d", &limit) + if limit > 500 { + limit = 500 + } + } + if o := r.URL.Query().Get("offset"); o != "" { + fmt.Sscanf(o, "%d", &offset) + } + + // Determine what to show based on filters + if show == "" { + if feedStatus != "" || domain != "" { + show = "feeds" + } else { + show = "domains" + } + } + + if show == "feeds" { + c.filterFeeds(w, tld, domain, feedStatus, limit, offset) + } else { + c.filterDomains(w, tld, domainStatus, limit, offset) + } +} + +func (c *Crawler) filterDomains(w http.ResponseWriter, tld, status string, limit, offset int) { + var args []interface{} + query := ` + SELECT d.host, d.tld, d.status, d.lastError, COALESCE(f.feed_count, 0) as feed_count + FROM domains d + LEFT JOIN ( + SELECT sourceHost, COUNT(*) as feed_count + FROM feeds + GROUP BY sourceHost + ) f ON d.host = f.sourceHost + WHERE 1=1` + + if tld != "" { + query += " AND d.tld = ?" + args = append(args, tld) + } + if status != "" { + query += " AND d.status = ?" + args = append(args, status) + } + + query += " ORDER BY d.host ASC LIMIT ? OFFSET ?" + args = append(args, limit, offset) + + rows, err := c.db.Query(query, args...) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + defer rows.Close() + + type DomainInfo struct { + Host string `json:"host"` + TLD string `json:"tld"` + Status string `json:"status"` + LastError string `json:"last_error,omitempty"` + FeedCount int `json:"feed_count"` + } + + var domains []DomainInfo + for rows.Next() { + var d DomainInfo + var tldVal, lastError sql.NullString + if err := rows.Scan(&d.Host, &tldVal, &d.Status, &lastError, &d.FeedCount); err != nil { + continue + } + if tldVal.Valid { + d.TLD = tldVal.String + } + if lastError.Valid { + d.LastError = lastError.String + } + domains = append(domains, d) + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]interface{}{ + "type": "domains", + "data": domains, + }) +} + +func (c *Crawler) filterFeeds(w http.ResponseWriter, tld, domain, status string, limit, offset int) { + var args []interface{} + query := ` + SELECT url, title, type, category, sourceHost, tld, status, errorCount, lastError, itemCount + FROM feeds + WHERE 1=1` + + if tld != "" { + query += " AND tld = ?" + args = append(args, tld) + } + if domain != "" { + query += " AND sourceHost = ?" + args = append(args, domain) + } + if status != "" { + query += " AND status = ?" + args = append(args, status) + } + + query += " ORDER BY url ASC LIMIT ? OFFSET ?" + args = append(args, limit, offset) + + rows, err := c.db.Query(query, args...) 
+ if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + defer rows.Close() + + type FeedInfo struct { + URL string `json:"url"` + Title string `json:"title,omitempty"` + Type string `json:"type"` + Category string `json:"category"` + SourceHost string `json:"source_host"` + TLD string `json:"tld"` + Status string `json:"status"` + ErrorCount int `json:"error_count,omitempty"` + LastError string `json:"last_error,omitempty"` + ItemCount int `json:"item_count,omitempty"` + } + + var feeds []FeedInfo + for rows.Next() { + var f FeedInfo + var title, category, sourceHost, tldVal, lastError sql.NullString + var errorCount, itemCount sql.NullInt64 + if err := rows.Scan(&f.URL, &title, &f.Type, &category, &sourceHost, &tldVal, &f.Status, &errorCount, &lastError, &itemCount); err != nil { + continue + } + if title.Valid { + f.Title = title.String + } + if category.Valid { + f.Category = category.String + } else { + f.Category = "main" + } + if sourceHost.Valid { + f.SourceHost = sourceHost.String + } + if tldVal.Valid { + f.TLD = tldVal.String + } + if errorCount.Valid { + f.ErrorCount = int(errorCount.Int64) + } + if lastError.Valid { + f.LastError = lastError.String + } + if itemCount.Valid { + f.ItemCount = int(itemCount.Int64) + } + feeds = append(feeds, f) + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]interface{}{ + "type": "feeds", + "data": feeds, + }) +} + +func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) { + rows, err := c.db.Query(` + SELECT d.tld, COUNT(DISTINCT d.host) as domain_count, COALESCE(SUM(f.feed_count), 0) as feed_count + FROM domains d + LEFT JOIN ( + SELECT sourceHost, COUNT(*) as feed_count + FROM feeds + GROUP BY sourceHost + ) f ON d.host = f.sourceHost + WHERE d.tld IS NOT NULL AND d.tld != '' + GROUP BY d.tld + ORDER BY d.tld ASC + `) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + defer rows.Close() + + type TLDInfo struct { + TLD string `json:"tld"` + DomainCount int `json:"domain_count"` + FeedCount int `json:"feed_count"` + } + + var tlds []TLDInfo + for rows.Next() { + var t TLDInfo + if err := rows.Scan(&t.TLD, &t.DomainCount, &t.FeedCount); err != nil { + continue + } + tlds = append(tlds, t) + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(tlds) +} + +func (c *Crawler) handleAPITLDDomains(w http.ResponseWriter, r *http.Request) { + tld := r.URL.Query().Get("tld") + if tld == "" { + http.Error(w, "tld parameter required", http.StatusBadRequest) + return + } + + limit := 100 + offset := 0 + if l := r.URL.Query().Get("limit"); l != "" { + fmt.Sscanf(l, "%d", &limit) + if limit > 500 { + limit = 500 + } + } + if o := r.URL.Query().Get("offset"); o != "" { + fmt.Sscanf(o, "%d", &offset) + } + + rows, err := c.db.Query(` + SELECT d.host, d.status, d.lastError, COALESCE(f.feed_count, 0) as feed_count + FROM domains d + LEFT JOIN ( + SELECT sourceHost, COUNT(*) as feed_count + FROM feeds + GROUP BY sourceHost + ) f ON d.host = f.sourceHost + WHERE d.tld = ? + ORDER BY d.host ASC + LIMIT ? OFFSET ? 
+ `, tld, limit, offset) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + defer rows.Close() + + type DomainInfo struct { + Host string `json:"host"` + Status string `json:"status"` + LastError string `json:"last_error,omitempty"` + FeedCount int `json:"feed_count"` + } + + var domains []DomainInfo + for rows.Next() { + var d DomainInfo + var lastError sql.NullString + if err := rows.Scan(&d.Host, &d.Status, &lastError, &d.FeedCount); err != nil { + continue + } + if lastError.Valid { + d.LastError = lastError.String + } + domains = append(domains, d) + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(domains) +} + +// handleAPIEnablePublish sets a feed's publish status to 'pass' +// If account is not provided, it will be auto-derived from the feed URL +func (c *Crawler) handleAPIEnablePublish(w http.ResponseWriter, r *http.Request) { + feedURL := r.URL.Query().Get("url") + account := r.URL.Query().Get("account") + if feedURL == "" { + http.Error(w, "url parameter required", http.StatusBadRequest) + return + } + + feedURL = normalizeURL(feedURL) + + // Auto-derive account handle if not provided + if account == "" { + account = DeriveHandleFromFeed(feedURL) + if account == "" { + http.Error(w, "could not derive account handle from URL", http.StatusBadRequest) + return + } + } + + // Check feed exists + feed, err := c.getFeed(feedURL) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + if feed == nil { + http.Error(w, "feed not found", http.StatusNotFound) + return + } + + if err := c.SetPublishStatus(feedURL, "pass", account); err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + // Get unpublished count + count, _ := c.GetUnpublishedItemCount(feedURL) + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]interface{}{ + "status": "pass", + "url": feedURL, + "account": account, + "unpublished_items": count, + }) +} + +// handleAPIDeriveHandle shows what handle would be derived from a feed URL +func (c *Crawler) handleAPIDeriveHandle(w http.ResponseWriter, r *http.Request) { + feedURL := r.URL.Query().Get("url") + if feedURL == "" { + http.Error(w, "url parameter required", http.StatusBadRequest) + return + } + + handle := DeriveHandleFromFeed(feedURL) + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]interface{}{ + "url": feedURL, + "handle": handle, + }) +} + +// handleAPIDisablePublish sets a feed's publish status to 'fail' +func (c *Crawler) handleAPIDisablePublish(w http.ResponseWriter, r *http.Request) { + feedURL := r.URL.Query().Get("url") + if feedURL == "" { + http.Error(w, "url parameter required", http.StatusBadRequest) + return + } + + feedURL = normalizeURL(feedURL) + + if err := c.SetPublishStatus(feedURL, "fail", ""); err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]interface{}{ + "status": "fail", + "url": feedURL, + }) +} + +// handleAPIPublishEnabled returns all feeds with publish status 'pass' +func (c *Crawler) handleAPIPublishEnabled(w http.ResponseWriter, r *http.Request) { + feeds, err := c.GetFeedsByPublishStatus("pass") + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + type FeedPublishInfo struct { + URL string `json:"url"` + Title string `json:"title"` 
+ Account string `json:"account"` + UnpublishedCount int `json:"unpublished_count"` + } + + var result []FeedPublishInfo + for _, f := range feeds { + count, _ := c.GetUnpublishedItemCount(f.URL) + result = append(result, FeedPublishInfo{ + URL: f.URL, + Title: f.Title, + Account: f.PublishAccount, + UnpublishedCount: count, + }) + } + + if result == nil { + result = []FeedPublishInfo{} + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(result) +} + +// handleAPIPublishCandidates returns feeds pending review that have items +func (c *Crawler) handleAPIPublishCandidates(w http.ResponseWriter, r *http.Request) { + limit := 50 + if l := r.URL.Query().Get("limit"); l != "" { + fmt.Sscanf(l, "%d", &limit) + if limit > 200 { + limit = 200 + } + } + + feeds, err := c.GetPublishCandidates(limit) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + type CandidateInfo struct { + URL string `json:"url"` + Title string `json:"title"` + Category string `json:"category"` + SourceHost string `json:"source_host"` + ItemCount int `json:"item_count"` + DerivedHandle string `json:"derived_handle"` + } + + var result []CandidateInfo + for _, f := range feeds { + result = append(result, CandidateInfo{ + URL: f.URL, + Title: f.Title, + Category: f.Category, + SourceHost: f.SourceHost, + ItemCount: f.ItemCount, + DerivedHandle: DeriveHandleFromFeed(f.URL), + }) + } + + if result == nil { + result = []CandidateInfo{} + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(result) +} + +// handleAPISetPublishStatus sets the publish status for a feed +// status must be 'pass', 'fail', or 'pending' +func (c *Crawler) handleAPISetPublishStatus(w http.ResponseWriter, r *http.Request) { + feedURL := r.URL.Query().Get("url") + status := r.URL.Query().Get("status") + account := r.URL.Query().Get("account") + + if feedURL == "" { + http.Error(w, "url parameter required", http.StatusBadRequest) + return + } + if status != "pass" && status != "fail" && status != "held" { + http.Error(w, "status must be 'pass', 'fail', or 'held'", http.StatusBadRequest) + return + } + + feedURL = normalizeURL(feedURL) + + // Auto-derive account for 'pass' if not provided + if status == "pass" && account == "" { + account = DeriveHandleFromFeed(feedURL) + } + + if err := c.SetPublishStatus(feedURL, status, account); err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]interface{}{ + "url": feedURL, + "status": status, + "account": account, + }) +} + +// handleAPIUnpublishedItems returns unpublished items for a feed +func (c *Crawler) handleAPIUnpublishedItems(w http.ResponseWriter, r *http.Request) { + feedURL := r.URL.Query().Get("url") + if feedURL == "" { + http.Error(w, "url parameter required", http.StatusBadRequest) + return + } + + limit := 50 + if l := r.URL.Query().Get("limit"); l != "" { + fmt.Sscanf(l, "%d", &limit) + if limit > 200 { + limit = 200 + } + } + + items, err := c.GetUnpublishedItems(feedURL, limit) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + if items == nil { + items = []*Item{} + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(items) +} + +// handleAPITestPublish tests publishing a single item to PDS +// Requires: url (feed), itemId, handle, password, pds (optional, defaults to https://1440.news) 
+func (c *Crawler) handleAPITestPublish(w http.ResponseWriter, r *http.Request) { + itemIDStr := r.URL.Query().Get("itemId") + handle := r.URL.Query().Get("handle") + password := r.URL.Query().Get("password") + pdsHost := r.URL.Query().Get("pds") + + if itemIDStr == "" { + http.Error(w, "itemId parameter required", http.StatusBadRequest) + return + } + if handle == "" || password == "" { + http.Error(w, "handle and password parameters required", http.StatusBadRequest) + return + } + if pdsHost == "" { + pdsHost = "https://1440.news" + } + + var itemID int64 + fmt.Sscanf(itemIDStr, "%d", &itemID) + + // Get the item + var item Item + var guid, title, link, description, content, author sql.NullString + var pubDate, updatedAt, publishedAt sql.NullTime + var publishedUri sql.NullString + + err := c.db.QueryRow(` + SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt, publishedAt, publishedUri + FROM items WHERE id = ? + `, itemID).Scan( + &item.ID, &item.FeedURL, &guid, &title, &link, + &description, &content, &author, &pubDate, + &item.DiscoveredAt, &updatedAt, &publishedAt, &publishedUri, + ) + if err != nil { + http.Error(w, "item not found: "+err.Error(), http.StatusNotFound) + return + } + + if guid.Valid { + item.GUID = guid.String + } + if title.Valid { + item.Title = title.String + } + if link.Valid { + item.Link = link.String + } + if description.Valid { + item.Description = description.String + } + if content.Valid { + item.Content = content.String + } + if author.Valid { + item.Author = author.String + } + if pubDate.Valid { + item.PubDate = pubDate.Time + } + + // Create publisher and authenticate + publisher := NewPublisher(pdsHost) + session, err := publisher.CreateSession(handle, password) + if err != nil { + http.Error(w, "auth failed: "+err.Error(), http.StatusUnauthorized) + return + } + + // Publish the item + uri, err := publisher.PublishItem(session, &item) + if err != nil { + http.Error(w, "publish failed: "+err.Error(), http.StatusInternalServerError) + return + } + + // Mark as published + c.MarkItemPublished(item.ID, uri) + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]interface{}{ + "status": "published", + "uri": uri, + "itemId": item.ID, + "title": item.Title, + "rkey": GenerateRkey(item.GUID, item.DiscoveredAt), + }) +} + +// handleAPIPublishFeed publishes unpublished items for a feed +// Requires: url (feed), handle, password, pds (optional), limit (optional, default 10) +func (c *Crawler) handleAPIPublishFeed(w http.ResponseWriter, r *http.Request) { + feedURL := r.URL.Query().Get("url") + handle := r.URL.Query().Get("handle") + password := r.URL.Query().Get("password") + pdsHost := r.URL.Query().Get("pds") + + if feedURL == "" { + http.Error(w, "url parameter required", http.StatusBadRequest) + return + } + if handle == "" || password == "" { + http.Error(w, "handle and password parameters required", http.StatusBadRequest) + return + } + if pdsHost == "" { + pdsHost = "https://1440.news" + } + + limit := 10 + if l := r.URL.Query().Get("limit"); l != "" { + fmt.Sscanf(l, "%d", &limit) + if limit > 50 { + limit = 50 + } + } + + feedURL = normalizeURL(feedURL) + + // Get unpublished items (ordered by pubDate ASC - oldest first) + items, err := c.GetUnpublishedItems(feedURL, limit) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + if len(items) == 0 { + w.Header().Set("Content-Type", "application/json") + 
json.NewEncoder(w).Encode(map[string]interface{}{ + "status": "no_items", + "published": 0, + }) + return + } + + // Create publisher and authenticate + publisher := NewPublisher(pdsHost) + session, err := publisher.CreateSession(handle, password) + if err != nil { + http.Error(w, "auth failed: "+err.Error(), http.StatusUnauthorized) + return + } + + type PublishResult struct { + ItemID int64 `json:"item_id"` + Title string `json:"title"` + URI string `json:"uri,omitempty"` + Error string `json:"error,omitempty"` + } + + var results []PublishResult + published := 0 + failed := 0 + + for _, item := range items { + result := PublishResult{ + ItemID: item.ID, + Title: item.Title, + } + + uri, err := publisher.PublishItem(session, item) + if err != nil { + result.Error = err.Error() + failed++ + } else { + result.URI = uri + c.MarkItemPublished(item.ID, uri) + published++ + } + + results = append(results, result) + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]interface{}{ + "status": "complete", + "published": published, + "failed": failed, + "results": results, + }) +} + +// handleAPICreateAccount creates a new account on the PDS +// Requires: handle, email, password, pds (optional), inviteCode (optional) +// If pdsAdminPassword is provided, it will create an invite code first +func (c *Crawler) handleAPICreateAccount(w http.ResponseWriter, r *http.Request) { + handle := r.URL.Query().Get("handle") + email := r.URL.Query().Get("email") + password := r.URL.Query().Get("password") + pdsHost := r.URL.Query().Get("pds") + inviteCode := r.URL.Query().Get("inviteCode") + pdsAdminPassword := r.URL.Query().Get("pdsAdminPassword") + + if handle == "" || password == "" { + http.Error(w, "handle and password parameters required", http.StatusBadRequest) + return + } + if pdsHost == "" { + pdsHost = "https://pds.1440.news" + } + if email == "" { + // Generate a placeholder email from handle + email = handle + "@1440.news" + } + + publisher := NewPublisher(pdsHost) + + // If PDS admin password provided, create an invite code first + if pdsAdminPassword != "" && inviteCode == "" { + code, err := publisher.CreateInviteCode(pdsAdminPassword, 1) + if err != nil { + http.Error(w, "create invite failed: "+err.Error(), http.StatusInternalServerError) + return + } + inviteCode = code + } + + // Create the account + session, err := publisher.CreateAccount(handle, email, password, inviteCode) + if err != nil { + http.Error(w, "create account failed: "+err.Error(), http.StatusInternalServerError) + return + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]interface{}{ + "status": "created", + "handle": session.Handle, + "did": session.DID, + }) +} + +// handleAPIPublishFeedFull creates an account (if needed) and publishes items +// This is a convenience endpoint that combines account creation and publishing +// Requires: url (feed), pdsAdminPassword, pds (optional), limit (optional), feedPassword (optional) +func (c *Crawler) handleAPIPublishFeedFull(w http.ResponseWriter, r *http.Request) { + feedURL := r.URL.Query().Get("url") + pdsAdminPassword := r.URL.Query().Get("pdsAdminPassword") + pdsHost := r.URL.Query().Get("pds") + feedPassword := r.URL.Query().Get("feedPassword") // Password for new feed accounts + + if feedURL == "" { + http.Error(w, "url parameter required", http.StatusBadRequest) + return + } + if pdsAdminPassword == "" { + http.Error(w, "pdsAdminPassword parameter required", http.StatusBadRequest) + 
return + } + if pdsHost == "" { + pdsHost = "https://pds.1440.news" + } + if feedPassword == "" { + feedPassword = "feed1440!" // Default password for feed accounts + } + + limit := 10 + if l := r.URL.Query().Get("limit"); l != "" { + fmt.Sscanf(l, "%d", &limit) + if limit > 50 { + limit = 50 + } + } + + feedURL = normalizeURL(feedURL) + + // Get the feed to check its status and get the derived handle + feed, err := c.getFeed(feedURL) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + if feed == nil { + http.Error(w, "feed not found", http.StatusNotFound) + return + } + if feed.PublishStatus != "pass" { + http.Error(w, "feed is not approved for publishing (status: "+feed.PublishStatus+")", http.StatusBadRequest) + return + } + + handle := feed.PublishAccount + if handle == "" { + handle = DeriveHandleFromFeed(feedURL) + } + email := handle + "@1440.news" + + publisher := NewPublisher(pdsHost) + + // First, try to authenticate with the feed account + session, err := publisher.CreateSession(handle, feedPassword) + if err != nil { + // Account doesn't exist, create it + fmt.Printf("Account %s doesn't exist, creating...\n", handle) + + // Create invite code using PDS admin password + inviteCode, err := publisher.CreateInviteCode(pdsAdminPassword, 1) + if err != nil { + http.Error(w, "create invite failed: "+err.Error(), http.StatusInternalServerError) + return + } + + // Create the account + session, err = publisher.CreateAccount(handle, email, feedPassword, inviteCode) + if err != nil { + http.Error(w, "create account failed: "+err.Error(), http.StatusInternalServerError) + return + } + fmt.Printf("Created account: %s (%s)\n", session.Handle, session.DID) + + // Set up profile with feed title and favicon + displayName := feed.Title + if displayName == "" { + displayName = feed.SourceHost + } + description := feed.Description + + // Try to fetch favicon for avatar + var avatar *BlobRef + faviconData, mimeType, err := FetchFavicon(feed.SourceHost) + if err == nil && len(faviconData) > 0 { + avatar, err = publisher.UploadBlob(session, faviconData, mimeType) + if err != nil { + fmt.Printf("Failed to upload favicon: %v\n", err) + } + } + + if err := publisher.UpdateProfile(session, displayName, description, avatar); err != nil { + fmt.Printf("Failed to update profile: %v\n", err) + } else { + fmt.Printf("Set profile for %s: %s\n", handle, displayName) + } + } + + // Get unpublished items + items, err := c.GetUnpublishedItems(feedURL, limit) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + if len(items) == 0 { + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]interface{}{ + "status": "no_items", + "handle": handle, + "published": 0, + }) + return + } + + type PublishResult struct { + ItemID int64 `json:"item_id"` + Title string `json:"title"` + URI string `json:"uri,omitempty"` + Error string `json:"error,omitempty"` + } + + var results []PublishResult + published := 0 + failed := 0 + + for _, item := range items { + result := PublishResult{ + ItemID: item.ID, + Title: item.Title, + } + + uri, err := publisher.PublishItem(session, item) + if err != nil { + result.Error = err.Error() + failed++ + } else { + result.URI = uri + c.MarkItemPublished(item.ID, uri) + published++ + } + + results = append(results, result) + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]interface{}{ + "status": "complete", + "handle": handle, + 
"did": session.DID, + "published": published, + "failed": failed, + "results": results, + }) +} + +// handleAPIUpdateProfile updates a profile for an existing account +// Requires: handle, password, pds (optional), displayName (optional), description (optional), faviconUrl (optional) +func (c *Crawler) handleAPIUpdateProfile(w http.ResponseWriter, r *http.Request) { + handle := r.URL.Query().Get("handle") + password := r.URL.Query().Get("password") + pdsHost := r.URL.Query().Get("pds") + displayName := r.URL.Query().Get("displayName") + description := r.URL.Query().Get("description") + faviconURL := r.URL.Query().Get("faviconUrl") + + if handle == "" || password == "" { + http.Error(w, "handle and password parameters required", http.StatusBadRequest) + return + } + if pdsHost == "" { + pdsHost = "https://pds.1440.news" + } + + publisher := NewPublisher(pdsHost) + + // Authenticate + session, err := publisher.CreateSession(handle, password) + if err != nil { + http.Error(w, "auth failed: "+err.Error(), http.StatusUnauthorized) + return + } + + // Fetch favicon if URL provided + var avatar *BlobRef + if faviconURL != "" { + faviconData, mimeType, err := FetchFavicon(faviconURL) + if err != nil { + http.Error(w, "fetch favicon failed: "+err.Error(), http.StatusBadRequest) + return + } + avatar, err = publisher.UploadBlob(session, faviconData, mimeType) + if err != nil { + http.Error(w, "upload favicon failed: "+err.Error(), http.StatusInternalServerError) + return + } + } + + // Update profile + if err := publisher.UpdateProfile(session, displayName, description, avatar); err != nil { + http.Error(w, "update profile failed: "+err.Error(), http.StatusInternalServerError) + return + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]interface{}{ + "status": "updated", + "handle": handle, + "displayName": displayName, + "hasAvatar": avatar != nil, + }) +} + func (c *Crawler) handleDashboard(w http.ResponseWriter, r *http.Request) { stats, err := c.GetDashboardStats() if err != nil { @@ -672,7 +2166,7 @@ const dashboardHTML = ` 1440.news Feed Crawler - +

[dashboard.go HTML template hunks: the embedded markup was stripped during extraction. The surviving fragments show the page title "1440.news Feed Crawler", a reworked Feeds panel with additional filter controls separated by "|", a "Loading..." placeholder, and a "Last updated: {{.UpdatedAt.Format "2006-01-02 15:04:05"}}" footer.]
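The publisher.go hunks are not part of this excerpt, but the commit message describes deterministic rkeys derived from SHA256(guid + discoveredAt), and handleAPITestPublish above returns GenerateRkey(item.GUID, item.DiscoveredAt) in its response. A minimal sketch of that idea, assuming a hex encoding and a 16-byte truncation (the actual GenerateRkey may differ):

package main

import (
	"crypto/sha256"
	"encoding/hex"
	"time"
)

// exampleRkey is a hypothetical stand-in for publisher.go's GenerateRkey: a stable
// record key derived from SHA256(guid + discoveredAt). Hex output stays within the
// AT Protocol's allowed rkey character set, and a fixed input means retried publishes
// address the same record instead of creating duplicates.
func exampleRkey(guid string, discoveredAt time.Time) string {
	sum := sha256.Sum256([]byte(guid + discoveredAt.UTC().Format(time.RFC3339)))
	return hex.EncodeToString(sum[:16])
}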
diff --git a/db.go b/db.go index 0e49fd2..03d48cb 100644 --- a/db.go +++ b/db.go @@ -3,6 +3,7 @@ package main import ( "database/sql" "fmt" + "time" _ "modernc.org/sqlite" ) @@ -25,6 +26,7 @@ CREATE INDEX IF NOT EXISTS idx_domains_feedsFound ON domains(feedsFound DESC) WH CREATE TABLE IF NOT EXISTS feeds ( url TEXT PRIMARY KEY, type TEXT, + category TEXT DEFAULT 'main', title TEXT, description TEXT, language TEXT, @@ -56,14 +58,20 @@ CREATE TABLE IF NOT EXISTS feeds ( oldestItemDate DATETIME, newestItemDate DATETIME, - noUpdate INTEGER DEFAULT 0 + noUpdate INTEGER DEFAULT 0, + + -- Publishing to PDS + publishStatus TEXT DEFAULT 'held' CHECK(publishStatus IN ('held', 'pass', 'fail')), + publishAccount TEXT ); CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost ON feeds(sourceHost); +CREATE INDEX IF NOT EXISTS idx_feeds_publishStatus ON feeds(publishStatus); CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost_url ON feeds(sourceHost, url); CREATE INDEX IF NOT EXISTS idx_feeds_tld ON feeds(tld); CREATE INDEX IF NOT EXISTS idx_feeds_tld_sourceHost ON feeds(tld, sourceHost); CREATE INDEX IF NOT EXISTS idx_feeds_type ON feeds(type); +CREATE INDEX IF NOT EXISTS idx_feeds_category ON feeds(category); CREATE INDEX IF NOT EXISTS idx_feeds_status ON feeds(status); CREATE INDEX IF NOT EXISTS idx_feeds_discoveredAt ON feeds(discoveredAt); CREATE INDEX IF NOT EXISTS idx_feeds_title ON feeds(title); @@ -80,6 +88,17 @@ CREATE TABLE IF NOT EXISTS items ( pubDate DATETIME, discoveredAt DATETIME NOT NULL, updatedAt DATETIME, + + -- Media attachments + enclosureUrl TEXT, + enclosureType TEXT, + enclosureLength INTEGER, + imageUrls TEXT, -- JSON array of image URLs + + -- Publishing to PDS + publishedAt DATETIME, + publishedUri TEXT, + UNIQUE(feedUrl, guid) ); @@ -87,6 +106,7 @@ CREATE INDEX IF NOT EXISTS idx_items_feedUrl ON items(feedUrl); CREATE INDEX IF NOT EXISTS idx_items_pubDate ON items(pubDate DESC); CREATE INDEX IF NOT EXISTS idx_items_link ON items(link); CREATE INDEX IF NOT EXISTS idx_items_feedUrl_pubDate ON items(feedUrl, pubDate DESC); +CREATE INDEX IF NOT EXISTS idx_items_unpublished ON items(feedUrl, publishedAt) WHERE publishedAt IS NULL; -- Full-text search for feeds CREATE VIRTUAL TABLE IF NOT EXISTS feeds_fts USING fts5( @@ -148,15 +168,22 @@ func OpenDatabase(dbPath string) (*sql.DB, error) { fmt.Printf("Opening database: %s\n", dbPath) // Use pragmas in connection string for consistent application - connStr := dbPath + "?_pragma=busy_timeout(10000)&_pragma=journal_mode(WAL)&_pragma=foreign_keys(ON)" + // - busy_timeout: wait up to 10s for locks instead of failing immediately + // - journal_mode: WAL for better concurrency and crash recovery + // - synchronous: NORMAL is safe with WAL (fsync at checkpoint, not every commit) + // - wal_autocheckpoint: checkpoint every 1000 pages (~4MB) to prevent WAL bloat + // - foreign_keys: enforce referential integrity + connStr := dbPath + "?_pragma=busy_timeout(10000)&_pragma=journal_mode(WAL)&_pragma=synchronous(NORMAL)&_pragma=wal_autocheckpoint(1000)&_pragma=foreign_keys(ON)" db, err := sql.Open("sqlite", connStr) if err != nil { return nil, fmt.Errorf("failed to open database: %v", err) } - // Allow multiple readers (WAL mode supports concurrent reads) - // SQLite is single-writer, but reads can happen concurrently - db.SetMaxOpenConns(4) + // Connection pool settings for stability + db.SetMaxOpenConns(4) // Limit concurrent connections + db.SetMaxIdleConns(2) // Keep some connections warm + db.SetConnMaxLifetime(5 * time.Minute) // Recycle connections 
periodically + db.SetConnMaxIdleTime(1 * time.Minute) // Close idle connections // Verify connection and show journal mode var journalMode string @@ -173,6 +200,17 @@ func OpenDatabase(dbPath string) (*sql.DB, error) { } fmt.Println(" Schema OK") + // Migrations for existing databases + migrations := []string{ + "ALTER TABLE items ADD COLUMN enclosureUrl TEXT", + "ALTER TABLE items ADD COLUMN enclosureType TEXT", + "ALTER TABLE items ADD COLUMN enclosureLength INTEGER", + "ALTER TABLE items ADD COLUMN imageUrls TEXT", + } + for _, m := range migrations { + db.Exec(m) // Ignore errors (column may already exist) + } + // Run stats and ANALYZE in background to avoid blocking startup with large databases go func() { var domainCount, feedCount int diff --git a/docker-compose.yml b/docker-compose.yml index 7f53c81..c067351 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,6 +3,7 @@ services: build: . container_name: app-1440-news restart: unless-stopped + stop_grace_period: 30s env_file: - pds.env volumes: diff --git a/domain.go b/domain.go index d4197fb..86186eb 100644 --- a/domain.go +++ b/domain.go @@ -88,26 +88,12 @@ func (c *Crawler) getDomain(host string) (*Domain, error) { return domain, nil } -// GetUncheckedDomains returns all domains with status "unchecked" -func (c *Crawler) GetUncheckedDomains() ([]*Domain, error) { +// GetUncheckedDomains returns up to limit unchecked domains ordered by discoveredAt (FIFO) +func (c *Crawler) GetUncheckedDomains(limit int) ([]*Domain, error) { rows, err := c.db.Query(` SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld FROM domains WHERE status = 'unchecked' - `) - if err != nil { - return nil, err - } - defer rows.Close() - - return c.scanDomains(rows) -} - -// GetUncheckedDomainsRandom returns up to limit unchecked domains in random order -func (c *Crawler) GetUncheckedDomainsRandom(limit int) ([]*Domain, error) { - rows, err := c.db.Query(` - SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld - FROM domains WHERE status = 'unchecked' - ORDER BY RANDOM() + ORDER BY discoveredAt ASC LIMIT ? 
`, limit) if err != nil { @@ -224,7 +210,7 @@ func (c *Crawler) ImportDomainsInBackground(filename string) { buf := make([]byte, 0, 64*1024) scanner.Buffer(buf, 1024*1024) - const batchSize = 10000 + const batchSize = 1000 now := time.Now() nowStr := now.Format("2006-01-02 15:04:05") totalImported := 0 diff --git a/feed.go b/feed.go index cb83d74..697be82 100644 --- a/feed.go +++ b/feed.go @@ -2,6 +2,7 @@ package main import ( "database/sql" + "encoding/json" "fmt" "io" "net/http" @@ -12,58 +13,91 @@ import ( "time" ) -// shouldSkipFeed checks if a feed URL should be filtered out -// Returns true (and a reason) if the feed should be skipped -func shouldSkipFeed(feedURL string) (bool, string) { +// classifyFeed determines the category of a feed based on URL patterns +// Returns: "main", "comments", "category", "author", "article", "podcast" +// Note: podcast detection is also done in parseRSSMetadata based on content +func classifyFeed(feedURL string) string { lower := strings.ToLower(feedURL) - // Skip explicit comment feeds + // Comment feeds if strings.Contains(lower, "/comment") { - return true, "comment feed" + return "comments" + } + + // Podcast URL patterns + podcastPatterns := []string{"/podcast", "/podcasts", "/episode", "/episodes", "/show/", "/shows/"} + for _, pattern := range podcastPatterns { + if strings.Contains(lower, pattern) { + return "podcast" + } } u, err := url.Parse(feedURL) if err != nil { - return false, "" + return "main" } path := strings.ToLower(strings.TrimSuffix(u.Path, "/")) - // Skip category/tag feeds - categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/", "/author/"} + // Author feeds + if strings.Contains(path, "/author/") { + return "author" + } + + // Category/tag feeds + categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/"} for _, pattern := range categoryPatterns { if strings.Contains(path, pattern) { - return true, "category/tag feed" + return "category" } } - // Check for article comment feeds (path ending in /feed with content before it) + // Check for article feeds (path ending in /feed with content before it) if strings.HasSuffix(path, "/feed") { basePath := strings.TrimSuffix(path, "/feed") basePath = strings.Trim(basePath, "/") if basePath == "" { - return false, "" // Just /feed - legitimate main feed + return "main" // Just /feed - main feed } - // Skip if path contains date patterns (likely article) + // Article if path contains date patterns if matched, _ := regexp.MatchString(`/\d{4}/\d{2}`, basePath); matched { - return true, "article feed (date pattern)" + return "article" } - // Skip if path has multiple segments (likely article or nested content) + // Article if path has multiple segments (nested content) segments := strings.Split(basePath, "/") if len(segments) >= 2 { - return true, "article feed (nested path)" + return "article" } - // Skip if single segment looks like an article slug (contains hyphens, is long) - if len(segments) == 1 && (strings.Contains(segments[0], "-") && len(segments[0]) > 20) { - return true, "article feed (slug pattern)" + // Article if single segment looks like an article slug + if len(segments) == 1 && strings.Contains(segments[0], "-") && len(segments[0]) > 20 { + return "article" } } - return false, "" + return "main" +} + +// classifyFeedByTitle refines category based on feed title (called after parsing) +func classifyFeedByTitle(title string, currentCategory string) string { + if currentCategory != "main" { + 
return currentCategory // Already classified by URL + } + lower := strings.ToLower(title) + if strings.HasPrefix(lower, "comments on:") || strings.HasPrefix(lower, "comments for:") { + return "comments" + } + return currentCategory +} + +// Enclosure represents a media attachment (audio, video, image) +type Enclosure struct { + URL string `json:"url"` + Type string `json:"type"` // MIME type (audio/mpeg, image/jpeg, etc.) + Length int64 `json:"length"` // Size in bytes } // Item represents an individual entry/article from a feed @@ -79,12 +113,21 @@ type Item struct { PubDate time.Time `json:"pub_date,omitempty"` DiscoveredAt time.Time `json:"discovered_at"` UpdatedAt time.Time `json:"updated_at,omitempty"` + + // Media attachments + Enclosure *Enclosure `json:"enclosure,omitempty"` // Primary enclosure (podcast audio, etc.) + ImageURLs []string `json:"image_urls,omitempty"` // Image URLs extracted from content + + // Publishing to PDS + PublishedAt time.Time `json:"published_at,omitempty"` + PublishedUri string `json:"published_uri,omitempty"` } // Feed represents a discovered RSS/Atom feed with metadata type Feed struct { URL string `json:"url"` - Type string `json:"type"` // "rss", "atom", or "unknown" + Type string `json:"type"` // "rss", "atom", or "unknown" + Category string `json:"category"` // "main", "comments", "category", "author", "article", "podcast" Title string `json:"title,omitempty"` Description string `json:"description,omitempty"` Language string `json:"language,omitempty"` @@ -124,23 +167,35 @@ type Feed struct { // Adaptive check interval NoUpdate int `json:"no_update"` // Consecutive checks with no change + + // Publishing to PDS + PublishStatus string `json:"publish_status"` // "held", "pass", "fail" + PublishAccount string `json:"publish_account,omitempty"` // e.g., "news.ycombinator.com.1440.news" } // saveFeed stores a feed in SQLite func (c *Crawler) saveFeed(feed *Feed) error { + // Default publishStatus to "held" if not set + publishStatus := feed.PublishStatus + if publishStatus == "" { + publishStatus = "held" + } + _, err := c.db.Exec(` INSERT INTO feeds ( - url, type, title, description, language, siteUrl, + url, type, category, title, description, language, siteUrl, discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate, etag, lastModified, ttlMinutes, updatePeriod, updateFreq, status, errorCount, lastError, lastErrorAt, sourceUrl, sourceHost, tld, itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate, - noUpdate - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + noUpdate, + publishStatus, publishAccount + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
ON CONFLICT(url) DO UPDATE SET type = excluded.type, + category = excluded.category, title = excluded.title, description = excluded.description, language = excluded.language, @@ -161,9 +216,11 @@ func (c *Crawler) saveFeed(feed *Feed) error { avgPostFreqHrs = excluded.avgPostFreqHrs, oldestItemDate = excluded.oldestItemDate, newestItemDate = excluded.newestItemDate, - noUpdate = excluded.noUpdate + noUpdate = excluded.noUpdate, + publishStatus = excluded.publishStatus, + publishAccount = excluded.publishAccount `, - feed.URL, feed.Type, nullString(feed.Title), nullString(feed.Description), + feed.URL, feed.Type, feed.Category, nullString(feed.Title), nullString(feed.Description), nullString(feed.Language), nullString(feed.SiteURL), feed.DiscoveredAt, nullTime(feed.LastCrawledAt), nullTime(feed.NextCrawlAt), nullTime(feed.LastBuildDate), nullString(feed.ETag), nullString(feed.LastModified), @@ -172,6 +229,7 @@ func (c *Crawler) saveFeed(feed *Feed) error { nullString(feed.SourceURL), nullString(feed.SourceHost), nullString(feed.TLD), feed.ItemCount, feed.AvgPostFreqHrs, nullTime(feed.OldestItemDate), nullTime(feed.NewestItemDate), feed.NoUpdate, + publishStatus, nullString(feed.PublishAccount), ) return err } @@ -179,23 +237,25 @@ func (c *Crawler) saveFeed(feed *Feed) error { // getFeed retrieves a feed from SQLite func (c *Crawler) getFeed(feedURL string) (*Feed, error) { feed := &Feed{} - var title, description, language, siteURL sql.NullString + var category, title, description, language, siteURL sql.NullString var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString var avgPostFreqHrs sql.NullFloat64 + var publishStatus, publishAccount sql.NullString err := c.db.QueryRow(` - SELECT url, type, title, description, language, siteUrl, + SELECT url, type, category, title, description, language, siteUrl, discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate, etag, lastModified, ttlMinutes, updatePeriod, updateFreq, status, errorCount, lastError, lastErrorAt, sourceUrl, sourceHost, tld, itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate, - noUpdate + noUpdate, + publishStatus, publishAccount FROM feeds WHERE url = ? 
`, normalizeURL(feedURL)).Scan( - &feed.URL, &feed.Type, &title, &description, &language, &siteURL, + &feed.URL, &feed.Type, &category, &title, &description, &language, &siteURL, &feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate, &etag, &lastModified, &feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq, @@ -203,6 +263,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) { &sourceURL, &sourceHost, &tld, &feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate, &feed.NoUpdate, + &publishStatus, &publishAccount, ) if err == sql.ErrNoRows { @@ -213,6 +274,11 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) { } // Handle nullable fields + if category.Valid { + feed.Category = category.String + } else { + feed.Category = "main" // Default + } if title.Valid { feed.Title = title.String } @@ -267,6 +333,14 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) { if newestItemDate.Valid { feed.NewestItemDate = newestItemDate.Time } + if publishStatus.Valid { + feed.PublishStatus = publishStatus.String + } else { + feed.PublishStatus = "held" + } + if publishAccount.Valid { + feed.PublishAccount = publishAccount.String + } return feed, nil } @@ -281,14 +355,15 @@ func (c *Crawler) feedExists(feedURL string) bool { // GetAllFeeds returns all feeds from the database func (c *Crawler) GetAllFeeds() ([]*Feed, error) { rows, err := c.db.Query(` - SELECT url, type, title, description, language, siteUrl, + SELECT url, type, category, title, description, language, siteUrl, discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate, etag, lastModified, ttlMinutes, updatePeriod, updateFreq, status, errorCount, lastError, lastErrorAt, sourceUrl, sourceHost, tld, itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate, - noUpdate + noUpdate, + publishStatus, publishAccount FROM feeds `) if err != nil { @@ -316,14 +391,15 @@ func (c *Crawler) GetFeedCountByHost(host string) (int, error) { // GetFeedsDueForCheck returns feeds where nextCrawlAt <= now, ordered randomly, limited to n func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) { rows, err := c.db.Query(` - SELECT url, type, title, description, language, siteUrl, + SELECT url, type, category, title, description, language, siteUrl, discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate, etag, lastModified, ttlMinutes, updatePeriod, updateFreq, status, errorCount, lastError, lastErrorAt, sourceUrl, sourceHost, tld, itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate, - noUpdate + noUpdate, + publishStatus, publishAccount FROM feeds WHERE nextCrawlAt <= datetime('now') AND status != 'dead' ORDER BY RANDOM() @@ -340,14 +416,15 @@ func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) { // GetFeedsByHost returns all feeds from a specific host func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) { rows, err := c.db.Query(` - SELECT url, type, title, description, language, siteUrl, + SELECT url, type, category, title, description, language, siteUrl, discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate, etag, lastModified, ttlMinutes, updatePeriod, updateFreq, status, errorCount, lastError, lastErrorAt, sourceUrl, sourceHost, tld, itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate, - noUpdate + noUpdate, + publishStatus, publishAccount FROM feeds WHERE sourceHost = ? 
`, host) if err != nil { @@ -361,14 +438,15 @@ func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) { // SearchFeeds performs a full-text search on feeds func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) { rows, err := c.db.Query(` - SELECT f.url, f.type, f.title, f.description, f.language, f.siteUrl, + SELECT f.url, f.type, f.category, f.title, f.description, f.language, f.siteUrl, f.discoveredAt, f.lastCrawledAt, f.nextCrawlAt, f.lastBuildDate, f.etag, f.lastModified, f.ttlMinutes, f.updatePeriod, f.updateFreq, f.status, f.errorCount, f.lastError, f.lastErrorAt, f.sourceUrl, f.sourceHost, f.tld, f.itemCount, f.avgPostFreqHrs, f.oldestItemDate, f.newestItemDate, - f.noUpdate + f.noUpdate, + f.publishStatus, f.publishAccount FROM feeds f JOIN feeds_fts fts ON f.rowid = fts.rowid WHERE feeds_fts MATCH ? @@ -387,13 +465,14 @@ func scanFeeds(rows *sql.Rows) ([]*Feed, error) { for rows.Next() { feed := &Feed{} - var title, description, language, siteURL sql.NullString + var category, title, description, language, siteURL sql.NullString var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString var avgPostFreqHrs sql.NullFloat64 + var publishStatus, publishAccount sql.NullString if err := rows.Scan( - &feed.URL, &feed.Type, &title, &description, &language, &siteURL, + &feed.URL, &feed.Type, &category, &title, &description, &language, &siteURL, &feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate, &etag, &lastModified, &feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq, @@ -401,11 +480,17 @@ func scanFeeds(rows *sql.Rows) ([]*Feed, error) { &sourceURL, &sourceHost, &tld, &feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate, &feed.NoUpdate, + &publishStatus, &publishAccount, ); err != nil { continue } // Handle nullable fields + if category.Valid { + feed.Category = category.String + } else { + feed.Category = "main" + } if title.Valid { feed.Title = title.String } @@ -460,6 +545,14 @@ func scanFeeds(rows *sql.Rows) ([]*Feed, error) { if newestItemDate.Valid { feed.NewestItemDate = newestItemDate.Time } + if publishStatus.Valid { + feed.PublishStatus = publishStatus.String + } else { + feed.PublishStatus = "held" + } + if publishAccount.Valid { + feed.PublishAccount = publishAccount.String + } feeds = append(feeds, feed) } @@ -469,9 +562,27 @@ // saveItem stores an item in SQLite (upsert by feedUrl + guid) func (c *Crawler) saveItem(item *Item) error { + // Serialize enclosure fields + var enclosureUrl, enclosureType sql.NullString + var enclosureLength sql.NullInt64 + if item.Enclosure != nil { + enclosureUrl = sql.NullString{String: item.Enclosure.URL, Valid: item.Enclosure.URL != ""} + enclosureType = sql.NullString{String: item.Enclosure.Type, Valid: item.Enclosure.Type != ""} + enclosureLength = sql.NullInt64{Int64: item.Enclosure.Length, Valid: item.Enclosure.Length > 0} + } + + // Serialize imageUrls as JSON + var imageUrlsJSON sql.NullString + if len(item.ImageURLs) > 0 { + if data, err := json.Marshal(item.ImageURLs); err == nil { + imageUrlsJSON = sql.NullString{String: string(data), Valid: true} + } + } + _, err := c.db.Exec(` - INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt, + enclosureUrl, enclosureType, enclosureLength, imageUrls) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(feedUrl, guid) DO UPDATE SET title = excluded.title, link = excluded.link, @@ -479,11 +590,16 @@ func (c *Crawler) saveItem(item *Item) error { content = excluded.content, author = excluded.author, pubDate = excluded.pubDate, - updatedAt = excluded.updatedAt + updatedAt = excluded.updatedAt, + enclosureUrl = excluded.enclosureUrl, + enclosureType = excluded.enclosureType, + enclosureLength = excluded.enclosureLength, + imageUrls = excluded.imageUrls `, item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link), nullString(item.Description), nullString(item.Content), nullString(item.Author), nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt), + enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON, ) return err } @@ -501,8 +617,9 @@ func (c *Crawler) saveItems(items []*Item) error { defer tx.Rollback() stmt, err := tx.Prepare(` - INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt, + enclosureUrl, enclosureType, enclosureLength, imageUrls) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(feedUrl, guid) DO UPDATE SET title = excluded.title, link = excluded.link, @@ -510,7 +627,11 @@ func (c *Crawler) saveItems(items []*Item) error { content = excluded.content, author = excluded.author, pubDate = excluded.pubDate, - updatedAt = excluded.updatedAt + updatedAt = excluded.updatedAt, + enclosureUrl = excluded.enclosureUrl, + enclosureType = excluded.enclosureType, + enclosureLength = excluded.enclosureLength, + imageUrls = excluded.imageUrls `) if err != nil { return err @@ -521,10 +642,29 @@ func (c *Crawler) saveItems(items []*Item) error { if item == nil || item.GUID == "" { continue // Skip nil items or items without GUID } + + // Serialize enclosure fields + var enclosureUrl, enclosureType sql.NullString + var enclosureLength sql.NullInt64 + if item.Enclosure != nil { + enclosureUrl = sql.NullString{String: item.Enclosure.URL, Valid: item.Enclosure.URL != ""} + enclosureType = sql.NullString{String: item.Enclosure.Type, Valid: item.Enclosure.Type != ""} + enclosureLength = sql.NullInt64{Int64: item.Enclosure.Length, Valid: item.Enclosure.Length > 0} + } + + // Serialize imageUrls as JSON + var imageUrlsJSON sql.NullString + if len(item.ImageURLs) > 0 { + if data, err := json.Marshal(item.ImageURLs); err == nil { + imageUrlsJSON = sql.NullString{String: string(data), Valid: true} + } + } + _, err := stmt.Exec( item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link), nullString(item.Description), nullString(item.Content), nullString(item.Author), nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt), + enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON, ) if err != nil { continue // Skip failed items @@ -537,7 +677,9 @@ func (c *Crawler) saveItems(items []*Item) error { // GetItemsByFeed returns all items for a specific feed func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) { rows, err := c.db.Query(` - SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt + SELECT id, feedUrl, guid, title, 
link, description, content, author, pubDate, discoveredAt, updatedAt, + enclosureUrl, enclosureType, enclosureLength, imageUrls, + publishedAt, publishedUri FROM items WHERE feedUrl = ? ORDER BY pubDate DESC @@ -548,55 +690,15 @@ func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) { } defer rows.Close() - var items []*Item - for rows.Next() { - item := &Item{} - var guid, title, link, description, content, author sql.NullString - var pubDate, updatedAt sql.NullTime - - if err := rows.Scan( - &item.ID, &item.FeedURL, &guid, &title, &link, - &description, &content, &author, &pubDate, - &item.DiscoveredAt, &updatedAt, - ); err != nil { - continue - } - - if guid.Valid { - item.GUID = guid.String - } - if title.Valid { - item.Title = title.String - } - if link.Valid { - item.Link = link.String - } - if description.Valid { - item.Description = description.String - } - if content.Valid { - item.Content = content.String - } - if author.Valid { - item.Author = author.String - } - if pubDate.Valid { - item.PubDate = pubDate.Time - } - if updatedAt.Valid { - item.UpdatedAt = updatedAt.Time - } - - items = append(items, item) - } - - return items, rows.Err() + return scanItems(rows) } // SearchItems performs a full-text search on items func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) { rows, err := c.db.Query(` - SELECT i.id, i.feedUrl, i.guid, i.title, i.link, i.description, i.content, i.author, i.pubDate, i.discoveredAt, i.updatedAt + SELECT i.id, i.feedUrl, i.guid, i.title, i.link, i.description, i.content, i.author, i.pubDate, i.discoveredAt, i.updatedAt, + i.enclosureUrl, i.enclosureType, i.enclosureLength, i.imageUrls, + i.publishedAt, i.publishedUri FROM items i JOIN items_fts fts ON i.id = fts.rowid WHERE items_fts MATCH ? 
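Both item queries above feed the scanItems helper (next hunk), which decodes imageUrls from a single TEXT column. A self-contained sketch of that round trip, mirroring what saveItem and scanItems do; the encodeImageURLs/decodeImageURLs names are illustrative, not part of the patch:

package main

import (
	"database/sql"
	"encoding/json"
	"fmt"
)

// encodeImageURLs stores a slice as a JSON array in a nullable TEXT column,
// leaving the column NULL when there are no images (as saveItem does).
func encodeImageURLs(urls []string) sql.NullString {
	if len(urls) == 0 {
		return sql.NullString{}
	}
	data, err := json.Marshal(urls)
	if err != nil {
		return sql.NullString{}
	}
	return sql.NullString{String: string(data), Valid: true}
}

// decodeImageURLs tolerates NULL, empty, or malformed JSON by returning nil
// (the same silent-skip behavior scanItems uses).
func decodeImageURLs(col sql.NullString) []string {
	if !col.Valid || col.String == "" {
		return nil
	}
	var urls []string
	if err := json.Unmarshal([]byte(col.String), &urls); err != nil {
		return nil
	}
	return urls
}

func main() {
	col := encodeImageURLs([]string{"https://example.com/a.jpg", "https://example.com/b.png"})
	fmt.Println(col.String)
	fmt.Println(decodeImageURLs(col))
}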
@@ -608,16 +710,27 @@ func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) { } defer rows.Close() + return scanItems(rows) +} + +// scanItems is a helper to scan multiple item rows +func scanItems(rows *sql.Rows) ([]*Item, error) { var items []*Item for rows.Next() { item := &Item{} var guid, title, link, description, content, author sql.NullString - var pubDate, updatedAt sql.NullTime + var pubDate, updatedAt, publishedAt sql.NullTime + var enclosureUrl, enclosureType sql.NullString + var enclosureLength sql.NullInt64 + var imageUrlsJSON sql.NullString + var publishedUri sql.NullString if err := rows.Scan( &item.ID, &item.FeedURL, &guid, &title, &link, &description, &content, &author, &pubDate, &item.DiscoveredAt, &updatedAt, + &enclosureUrl, &enclosureType, &enclosureLength, &imageUrlsJSON, + &publishedAt, &publishedUri, ); err != nil { continue } @@ -647,6 +760,32 @@ func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) { item.UpdatedAt = updatedAt.Time } + // Parse enclosure + if enclosureUrl.Valid && enclosureUrl.String != "" { + item.Enclosure = &Enclosure{ + URL: enclosureUrl.String, + Type: enclosureType.String, + } + if enclosureLength.Valid { + item.Enclosure.Length = enclosureLength.Int64 + } + } + + // Parse imageUrls JSON + if imageUrlsJSON.Valid && imageUrlsJSON.String != "" { + var urls []string + if err := json.Unmarshal([]byte(imageUrlsJSON.String), &urls); err == nil { + item.ImageURLs = urls + } + } + + if publishedAt.Valid { + item.PublishedAt = publishedAt.Time + } + if publishedUri.Valid { + item.PublishedUri = publishedUri.String + } + items = append(items, item) } @@ -667,10 +806,6 @@ func (c *Crawler) CleanupOldItems() (int64, error) { // processFeed parses and stores a feed with full metadata func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) { - if strings.Contains(feedURL, "/comment") { - return - } - // Fast path: check without lock if c.feedExists(feedURL) { return @@ -690,6 +825,7 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea feed := &Feed{ URL: normalizeURL(feedURL), Type: feedType, + Category: classifyFeed(feedURL), DiscoveredAt: now, LastCrawledAt: now, Status: "active", @@ -708,6 +844,9 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea items = c.parseAtomMetadata(body, feed) } + // Refine category based on parsed title (e.g., "Comments on:") + feed.Category = classifyFeedByTitle(feed.Title, feed.Category) + // Calculate next crawl time feed.NextCrawlAt = c.calculateNextCrawl(feed) @@ -723,11 +862,6 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea // addFeed adds a discovered feed URL (not yet fetched) func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) { - // Skip comment, category, and article feeds - if skip, _ := shouldSkipFeed(feedURL); skip { - return - } - // Fast path: check without lock if c.feedExists(feedURL) { return @@ -746,6 +880,7 @@ func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) { feed := &Feed{ URL: normalizedURL, Type: feedType, + Category: classifyFeed(feedURL), DiscoveredAt: now, Status: "active", SourceURL: normalizeURL(sourceURL), @@ -896,3 +1031,103 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) { return true, nil } + +// SetPublishStatus sets the publish status for a feed ('held', 'pass', 'fail') +// If status is 'pass', the account handle is also set (auto-derived if empty) +func (c *Crawler) 
SetPublishStatus(feedURL, status, account string) error { + feedURL = normalizeURL(feedURL) + + // Auto-derive account if passing and not provided + if status == "pass" && account == "" { + account = DeriveHandleFromFeed(feedURL) + } + + _, err := c.db.Exec(` + UPDATE feeds SET publishStatus = ?, publishAccount = ? WHERE url = ? + `, status, nullString(account), feedURL) + return err +} + +// GetFeedsByPublishStatus returns all feeds with a specific publish status +func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) { + rows, err := c.db.Query(` + SELECT url, type, category, title, description, language, siteUrl, + discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate, + etag, lastModified, + ttlMinutes, updatePeriod, updateFreq, + status, errorCount, lastError, lastErrorAt, + sourceUrl, sourceHost, tld, + itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate, + noUpdate, + publishStatus, publishAccount + FROM feeds + WHERE publishStatus = ? + `, status) + if err != nil { + return nil, err + } + defer rows.Close() + + return scanFeeds(rows) +} + +// GetPublishCandidates returns feeds that are held review and have items +func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) { + rows, err := c.db.Query(` + SELECT url, type, category, title, description, language, siteUrl, + discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate, + etag, lastModified, + ttlMinutes, updatePeriod, updateFreq, + status, errorCount, lastError, lastErrorAt, + sourceUrl, sourceHost, tld, + itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate, + noUpdate, + publishStatus, publishAccount + FROM feeds + WHERE publishStatus = 'held' AND itemCount > 0 AND status = 'active' + ORDER BY itemCount DESC + LIMIT ? + `, limit) + if err != nil { + return nil, err + } + defer rows.Close() + + return scanFeeds(rows) +} + +// GetUnpublishedItems returns items for a feed that haven't been published yet +func (c *Crawler) GetUnpublishedItems(feedURL string, limit int) ([]*Item, error) { + rows, err := c.db.Query(` + SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt, + enclosureUrl, enclosureType, enclosureLength, imageUrls, + publishedAt, publishedUri + FROM items + WHERE feedUrl = ? AND publishedAt IS NULL + ORDER BY pubDate ASC + LIMIT ? + `, feedURL, limit) + if err != nil { + return nil, err + } + defer rows.Close() + + return scanItems(rows) +} + +// MarkItemPublished marks an item as published with the given URI +func (c *Crawler) MarkItemPublished(itemID int64, uri string) error { + _, err := c.db.Exec(` + UPDATE items SET publishedAt = datetime('now'), publishedUri = ? WHERE id = ? + `, uri, itemID) + return err +} + +// GetUnpublishedItemCount returns the count of unpublished items for a feed +func (c *Crawler) GetUnpublishedItemCount(feedURL string) (int, error) { + var count int + err := c.db.QueryRow(` + SELECT COUNT(*) FROM items WHERE feedUrl = ? 
AND publishedAt IS NULL + `, feedURL).Scan(&count) + return count, err +} diff --git a/main.go b/main.go index f552e5f..b65a7ad 100644 --- a/main.go +++ b/main.go @@ -3,6 +3,8 @@ package main import ( "fmt" "os" + "os/signal" + "syscall" ) func main() { @@ -17,7 +19,10 @@ func main() { fmt.Fprintf(os.Stderr, "Error initializing crawler: %v\n", err) os.Exit(1) } - defer crawler.Close() + + // Setup graceful shutdown + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) // Start dashboard in background go func() { @@ -41,9 +46,24 @@ func main() { // Stats loop (background) - updates once per minute go crawler.StartStatsLoop() - // Cleanup loop (background) - removes old items once per hour + // Cleanup loop (background) - removes old items once per week go crawler.StartCleanupLoop() - // Crawl loop (foreground - blocks forever) - crawler.StartCrawlLoop() + // Maintenance loop (background) - WAL checkpoints and integrity checks + go crawler.StartMaintenanceLoop() + + // Crawl loop (background) + go crawler.StartCrawlLoop() + + // Wait for shutdown signal + sig := <-sigChan + fmt.Printf("\nReceived %v, shutting down gracefully...\n", sig) + + // Close crawler (checkpoints WAL and closes database) + if err := crawler.Close(); err != nil { + fmt.Fprintf(os.Stderr, "Error closing crawler: %v\n", err) + os.Exit(1) + } + + fmt.Println("Shutdown complete") } diff --git a/parser.go b/parser.go index 9b91798..e77c72a 100644 --- a/parser.go +++ b/parser.go @@ -3,6 +3,7 @@ package main import ( "encoding/xml" "fmt" + "regexp" "strings" "time" ) @@ -23,17 +24,52 @@ type RSSChannel struct { UpdatePeriod string `xml:"http://purl.org/rss/1.0/modules/syndication/ updatePeriod"` UpdateFreq int `xml:"http://purl.org/rss/1.0/modules/syndication/ updateFrequency"` Items []RSSItem `xml:"item"` + // iTunes podcast namespace + ITunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"` + ITunesOwner string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner>name"` + ITunesExplicit string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd explicit"` + ITunesType string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd type"` } type RSSItem struct { - Title string `xml:"title"` - Link string `xml:"link"` - GUID string `xml:"guid"` - Description string `xml:"description"` - Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"` - Author string `xml:"author"` - Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"` - PubDate string `xml:"pubDate"` + Title string `xml:"title"` + Link string `xml:"link"` + GUID string `xml:"guid"` + Description string `xml:"description"` + Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"` + Author string `xml:"author"` + Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"` + PubDate string `xml:"pubDate"` + Enclosure *RSSEnclosure `xml:"enclosure"` + // iTunes item elements + ITunesDuration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"` + ITunesEpisode int `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd episode"` + ITunesImage string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd image>href"` + // Media RSS elements + MediaContent []MediaContent `xml:"http://search.yahoo.com/mrss/ content"` + MediaThumbnail []MediaThumbnail `xml:"http://search.yahoo.com/mrss/ thumbnail"` +} + +// MediaContent represents a media:content element +type MediaContent struct { + URL string `xml:"url,attr"` + Type string `xml:"type,attr"` + Medium string 
`xml:"medium,attr"` // image, video, audio + Width int `xml:"width,attr"` + Height int `xml:"height,attr"` +} + +// MediaThumbnail represents a media:thumbnail element +type MediaThumbnail struct { + URL string `xml:"url,attr"` + Width int `xml:"width,attr"` + Height int `xml:"height,attr"` +} + +type RSSEnclosure struct { + URL string `xml:"url,attr"` + Type string `xml:"type,attr"` + Length int64 `xml:"length,attr"` } // Atom structs for parsing @@ -70,6 +106,43 @@ type AtomLink struct { Type string `xml:"type,attr"` } +// isPodcast checks if an RSS feed is a podcast based on content +func isPodcast(ch RSSChannel) bool { + // Check for iTunes namespace elements at channel level + if ch.ITunesAuthor != "" || ch.ITunesOwner != "" || + ch.ITunesExplicit != "" || ch.ITunesType != "" { + return true + } + + // Check items for audio enclosures or iTunes elements + audioCount := 0 + for _, item := range ch.Items { + // Check for iTunes duration or episode number + if item.ITunesDuration != "" || item.ITunesEpisode > 0 { + return true + } + // Check for audio/video enclosure + if item.Enclosure != nil && item.Enclosure.URL != "" { + mimeType := strings.ToLower(item.Enclosure.Type) + if strings.HasPrefix(mimeType, "audio/") || + strings.HasPrefix(mimeType, "video/") || + strings.Contains(mimeType, "mpeg") || + strings.Contains(mimeType, "mp3") || + strings.Contains(mimeType, "mp4") || + strings.Contains(mimeType, "m4a") || + strings.Contains(mimeType, "ogg") { + audioCount++ + } + } + } + // If more than half the items have audio enclosures, it's a podcast + if len(ch.Items) > 0 && audioCount > len(ch.Items)/2 { + return true + } + + return false +} + func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item { var rss RSS if err := xml.Unmarshal([]byte(body), &rss); err != nil { @@ -77,6 +150,7 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item { } ch := rss.Channel + feed.Title = ch.Title feed.Description = ch.Description feed.Language = ch.Language @@ -86,6 +160,11 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item { feed.UpdateFreq = ch.UpdateFreq feed.ItemCount = len(ch.Items) + // Detect podcast + if isPodcast(ch) { + feed.Category = "podcast" + } + // Parse lastBuildDate if ch.LastBuildDate != "" { if t, err := parseRSSDate(ch.LastBuildDate); err == nil { @@ -130,6 +209,18 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item { } } + // Map enclosure + if rssItem.Enclosure != nil && rssItem.Enclosure.URL != "" { + item.Enclosure = &Enclosure{ + URL: rssItem.Enclosure.URL, + Type: rssItem.Enclosure.Type, + Length: rssItem.Enclosure.Length, + } + } + + // Extract images from various sources + item.ImageURLs = extractItemImages(rssItem) + items = append(items, item) } @@ -324,3 +415,88 @@ func (c *Crawler) calculateNextCrawl(feed *Feed) time.Time { // Default: crawl every 6 hours return now.Add(6 * time.Hour) } + +// extractItemImages extracts image URLs from an RSS item +// Sources: media:content, media:thumbnail, iTunes image, and tags in HTML +func extractItemImages(rssItem RSSItem) []string { + seen := make(map[string]bool) + var images []string + + addImage := func(url string) { + url = strings.TrimSpace(url) + if url == "" || seen[url] { + return + } + // Basic validation + if !strings.HasPrefix(url, "http://") && !strings.HasPrefix(url, "https://") { + return + } + seen[url] = true + images = append(images, url) + } + + // 1. 
Media RSS content (prefer larger images) + for _, mc := range rssItem.MediaContent { + if mc.URL != "" && (mc.Medium == "image" || strings.HasPrefix(mc.Type, "image/")) { + addImage(mc.URL) + } + } + + // 2. Media RSS thumbnails + for _, mt := range rssItem.MediaThumbnail { + if mt.URL != "" { + addImage(mt.URL) + } + } + + // 3. iTunes image + if rssItem.ITunesImage != "" { + addImage(rssItem.ITunesImage) + } + + // 4. Image enclosure + if rssItem.Enclosure != nil && strings.HasPrefix(rssItem.Enclosure.Type, "image/") { + addImage(rssItem.Enclosure.URL) + } + + // 5. Extract <img> tags from description and content + htmlImages := extractImgTags(rssItem.Description) + htmlImages = append(htmlImages, extractImgTags(rssItem.Content)...) + for _, img := range htmlImages { + addImage(img) + } + + return images +} + +// extractImgTags extracts src URLs from <img> tags in HTML +func extractImgTags(html string) []string { + if html == "" { + return nil + } + + var urls []string + + // Simple regex to find img src attributes + // Matches: src="..." or src='...' + imgRegex := regexp.MustCompile(`<img[^>]+src\s*=\s*["']([^"']+)["']`) + matches := imgRegex.FindAllStringSubmatch(html, -1) + + for _, match := range matches { + if len(match) > 1 { + url := strings.TrimSpace(match[1]) + // Skip data URIs, tracking pixels, and tiny images + if strings.HasPrefix(url, "data:") { + continue + } + // Skip common tracking/spacer images + if strings.Contains(url, "pixel") || strings.Contains(url, "spacer") || + strings.Contains(url, "1x1") || strings.Contains(url, "blank.gif") { + continue + } + urls = append(urls, url) + } + } + + return urls +} diff --git a/publisher.go b/publisher.go new file mode 100644 index 0000000..d62577c --- /dev/null +++ b/publisher.go @@ -0,0 +1,909 @@ +package main + +import ( + "bytes" + "crypto/sha256" + "encoding/base32" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "regexp" + "strings" + "time" +) + +// Publisher handles posting items to AT Protocol PDS +type Publisher struct { + pdsHost string + httpClient *http.Client +} + +// PDSSession holds authentication info for a PDS account +type PDSSession struct { + DID string `json:"did"` + Handle string `json:"handle"` + AccessJwt string `json:"accessJwt"` + RefreshJwt string `json:"refreshJwt"` +} + +// BskyPost represents an app.bsky.feed.post record +type BskyPost struct { + Type string `json:"$type"` + Text string `json:"text"` + CreatedAt string `json:"createdAt"` + Facets []BskyFacet `json:"facets,omitempty"` + Embed *BskyEmbed `json:"embed,omitempty"` +} + +type BskyFacet struct { + Index BskyByteSlice `json:"index"` + Features []BskyFeature `json:"features"` +} + +type BskyByteSlice struct { + ByteStart int `json:"byteStart"` + ByteEnd int `json:"byteEnd"` +} + +type BskyFeature struct { + Type string `json:"$type"` + URI string `json:"uri,omitempty"` +} + +type BskyEmbed struct { + Type string `json:"$type"` + External *BskyExternal `json:"external,omitempty"` + Images []BskyImage `json:"images,omitempty"` +} + +type BskyExternal struct { + URI string `json:"uri"` + Title string `json:"title"` + Description string `json:"description"` + Thumb *BlobRef `json:"thumb,omitempty"` +} + +type BskyImage struct { + Alt string `json:"alt"` + Image *BlobRef `json:"image"` +} + +// NewPublisher creates a new Publisher instance +func NewPublisher(pdsHost string) *Publisher { + return &Publisher{ + pdsHost: pdsHost, + httpClient: &http.Client{ + Timeout: 30 * time.Second, + }, + } +} + +// CreateSession authenticates with the PDS and returns
a session +func (p *Publisher) CreateSession(handle, password string) (*PDSSession, error) { + payload := map[string]string{ + "identifier": handle, + "password": password, + } + body, err := json.Marshal(payload) + if err != nil { + return nil, err + } + + resp, err := p.httpClient.Post( + p.pdsHost+"/xrpc/com.atproto.server.createSession", + "application/json", + bytes.NewReader(body), + ) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + respBody, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("auth failed: %s - %s", resp.Status, string(respBody)) + } + + var session PDSSession + if err := json.NewDecoder(resp.Body).Decode(&session); err != nil { + return nil, err + } + + return &session, nil +} + +// CreateAccount creates a new account on the PDS +// Requires an invite code if the PDS has invites enabled +func (p *Publisher) CreateAccount(handle, email, password, inviteCode string) (*PDSSession, error) { + payload := map[string]interface{}{ + "handle": handle, + "email": email, + "password": password, + } + if inviteCode != "" { + payload["inviteCode"] = inviteCode + } + + body, err := json.Marshal(payload) + if err != nil { + return nil, err + } + + resp, err := p.httpClient.Post( + p.pdsHost+"/xrpc/com.atproto.server.createAccount", + "application/json", + bytes.NewReader(body), + ) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + respBody, _ := io.ReadAll(resp.Body) + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("create account failed: %s - %s", resp.Status, string(respBody)) + } + + var session PDSSession + if err := json.Unmarshal(respBody, &session); err != nil { + return nil, err + } + + return &session, nil +} + +// CreateInviteCode creates an invite code using PDS admin password (Basic Auth) +func (p *Publisher) CreateInviteCode(adminPassword string, useCount int) (string, error) { + payload := map[string]interface{}{ + "useCount": useCount, + } + + body, err := json.Marshal(payload) + if err != nil { + return "", err + } + + req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.server.createInviteCode", bytes.NewReader(body)) + if err != nil { + return "", err + } + req.Header.Set("Content-Type", "application/json") + // PDS admin APIs use Basic Auth with "admin" as username + req.SetBasicAuth("admin", adminPassword) + + resp, err := p.httpClient.Do(req) + if err != nil { + return "", err + } + defer resp.Body.Close() + + respBody, _ := io.ReadAll(resp.Body) + + if resp.StatusCode != http.StatusOK { + return "", fmt.Errorf("create invite failed: %s - %s", resp.Status, string(respBody)) + } + + var result struct { + Code string `json:"code"` + } + if err := json.Unmarshal(respBody, &result); err != nil { + return "", err + } + + return result.Code, nil +} + +// GenerateRkey creates a deterministic rkey from a GUID and timestamp +// Uses a truncated base32-encoded SHA256 hash +// Including the timestamp allows regenerating a new rkey by updating discoveredAt +func GenerateRkey(guid string, timestamp time.Time) string { + if guid == "" { + return "" + } + + // Combine GUID with timestamp for the hash input + // Format timestamp to second precision for consistency + input := guid + "|" + timestamp.UTC().Format(time.RFC3339) + hash := sha256.Sum256([]byte(input)) + // Use first 10 bytes (80 bits) - plenty for uniqueness + // Base32 encode without padding, lowercase for rkey compatibility + encoded := 
base32.StdEncoding.WithPadding(base32.NoPadding).EncodeToString(hash[:10]) + return strings.ToLower(encoded) +} + +// extractURLs finds all URLs in a string +func extractURLs(text string) []string { + // Match http:// or https:// URLs + urlRegex := regexp.MustCompile(`https?://[^\s<>"'\)]+`) + matches := urlRegex.FindAllString(text, -1) + + // Clean up trailing punctuation + var urls []string + for _, u := range matches { + // Remove trailing punctuation that's likely not part of the URL + u = strings.TrimRight(u, ".,;:!?") + if u != "" { + urls = append(urls, u) + } + } + return urls +} + +// PublishItem posts a feed item to the PDS +// Returns the AT URI of the created record, or error +func (p *Publisher) PublishItem(session *PDSSession, item *Item) (string, error) { + if item.GUID == "" && item.Link == "" { + return "", fmt.Errorf("item has no GUID or link, cannot publish") + } + + // Collect all unique URLs: main link + any URLs in description + urlSet := make(map[string]bool) + var allURLs []string + + // Add main link first + if item.Link != "" { + urlSet[item.Link] = true + allURLs = append(allURLs, item.Link) + } + + // Add enclosure URL for podcasts/media (audio/video) + if item.Enclosure != nil && item.Enclosure.URL != "" { + encType := strings.ToLower(item.Enclosure.Type) + if strings.HasPrefix(encType, "audio/") || strings.HasPrefix(encType, "video/") { + if !urlSet[item.Enclosure.URL] { + urlSet[item.Enclosure.URL] = true + allURLs = append(allURLs, item.Enclosure.URL) + } + } + } + + // Extract URLs from description + descURLs := extractURLs(item.Description) + for _, u := range descURLs { + if !urlSet[u] { + urlSet[u] = true + allURLs = append(allURLs, u) + } + } + + // Extract URLs from content if available + contentURLs := extractURLs(item.Content) + for _, u := range contentURLs { + if !urlSet[u] { + urlSet[u] = true + allURLs = append(allURLs, u) + } + } + + // Build post text: title + all links + // Bluesky has 300 grapheme limit + var textBuilder strings.Builder + textBuilder.WriteString(item.Title) + + for _, u := range allURLs { + textBuilder.WriteString("\n\n") + textBuilder.WriteString(u) + } + + text := textBuilder.String() + + // Truncate title if text is too long (keep URLs intact) + const maxLen = 300 + if len(text) > maxLen { + // Calculate space needed for URLs + urlSpace := 0 + for _, u := range allURLs { + urlSpace += len(u) + 2 // +2 for \n\n + } + + maxTitleLen := maxLen - urlSpace - 3 // -3 for "..." + if maxTitleLen > 10 { + text = item.Title[:maxTitleLen] + "..." 
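+ // Note: len() here counts bytes, not graphemes, so this is a conservative
+ // approximation of the 300-grapheme limit; multi-byte titles may be trimmed
+ // more than strictly necessary, and the byte slice can land mid-rune.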
+ for _, u := range allURLs { + text += "\n\n" + u + } + } + } + + // Use item's pubDate for createdAt, fall back to now + createdAt := time.Now() + if !item.PubDate.IsZero() { + createdAt = item.PubDate + } + + post := BskyPost{ + Type: "app.bsky.feed.post", + Text: text, + CreatedAt: createdAt.Format(time.RFC3339), + } + + // Add facets for all URLs + for _, u := range allURLs { + linkStart := strings.Index(text, u) + if linkStart >= 0 { + // Use byte positions (for UTF-8 this matters) + byteStart := len(text[:linkStart]) + byteEnd := byteStart + len(u) + + post.Facets = append(post.Facets, BskyFacet{ + Index: BskyByteSlice{ + ByteStart: byteStart, + ByteEnd: byteEnd, + }, + Features: []BskyFeature{ + { + Type: "app.bsky.richtext.facet#link", + URI: u, + }, + }, + }) + } + } + + // Decide embed type based on content + // Priority: images > external link card + if len(item.ImageURLs) > 0 { + // Try to upload images (up to 4) + uploadedImages := p.uploadImages(session, item.ImageURLs, item.Title) + if len(uploadedImages) > 0 { + post.Embed = &BskyEmbed{ + Type: "app.bsky.embed.images", + Images: uploadedImages, + } + } + } + + // Fall back to external embed if no images were uploaded + if post.Embed == nil && len(allURLs) > 0 { + external := &BskyExternal{ + URI: allURLs[0], + Title: item.Title, + Description: truncate(stripHTML(item.Description), 300), + } + + // Try to add thumbnail from first image + if len(item.ImageURLs) > 0 { + if thumb := p.fetchAndUploadImage(session, item.ImageURLs[0]); thumb != nil { + external.Thumb = thumb + } + } + + post.Embed = &BskyEmbed{ + Type: "app.bsky.embed.external", + External: external, + } + } + + // Use GUID + discoveredAt for deterministic rkey + // This allows regenerating a new rkey by updating discoveredAt if needed + guidForRkey := item.GUID + if guidForRkey == "" { + guidForRkey = item.Link + } + rkey := GenerateRkey(guidForRkey, item.DiscoveredAt) + + // Create the record with deterministic rkey + payload := map[string]interface{}{ + "repo": session.DID, + "collection": "app.bsky.feed.post", + "rkey": rkey, + "record": post, + } + + body, err := json.Marshal(payload) + if err != nil { + return "", err + } + + req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.createRecord", bytes.NewReader(body)) + if err != nil { + return "", err + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer "+session.AccessJwt) + + resp, err := p.httpClient.Do(req) + if err != nil { + return "", err + } + defer resp.Body.Close() + + respBody, _ := io.ReadAll(resp.Body) + + if resp.StatusCode != http.StatusOK { + return "", fmt.Errorf("create record failed: %s - %s", resp.Status, string(respBody)) + } + + var result struct { + URI string `json:"uri"` + CID string `json:"cid"` + } + if err := json.Unmarshal(respBody, &result); err != nil { + return "", err + } + + return result.URI, nil +} + +// uploadImages fetches and uploads up to 4 images, returning BskyImage structs +func (p *Publisher) uploadImages(session *PDSSession, imageURLs []string, altText string) []BskyImage { + var images []BskyImage + maxImages := 4 + if len(imageURLs) < maxImages { + maxImages = len(imageURLs) + } + + for i := 0; i < maxImages; i++ { + blob := p.fetchAndUploadImage(session, imageURLs[i]) + if blob != nil { + images = append(images, BskyImage{ + Alt: altText, + Image: blob, + }) + } + } + + return images +} + +// fetchAndUploadImage downloads an image and uploads it to the PDS +func (p *Publisher) 
fetchAndUploadImage(session *PDSSession, imageURL string) *BlobRef { + // Fetch the image + resp, err := p.httpClient.Get(imageURL) + if err != nil { + return nil + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil + } + + // Check content type + contentType := resp.Header.Get("Content-Type") + if contentType == "" { + // Try to guess from URL + if strings.HasSuffix(strings.ToLower(imageURL), ".png") { + contentType = "image/png" + } else if strings.HasSuffix(strings.ToLower(imageURL), ".gif") { + contentType = "image/gif" + } else if strings.HasSuffix(strings.ToLower(imageURL), ".webp") { + contentType = "image/webp" + } else { + contentType = "image/jpeg" // Default + } + } + + // Only accept image types + if !strings.HasPrefix(contentType, "image/") { + return nil + } + + // Read image data (limit to 1MB to avoid issues) + data, err := io.ReadAll(io.LimitReader(resp.Body, 1024*1024)) + if err != nil || len(data) == 0 { + return nil + } + + // Upload to PDS + blob, err := p.UploadBlob(session, data, contentType) + if err != nil { + return nil + } + + return blob +} + +func truncate(s string, maxLen int) string { + if len(s) <= maxLen { + return s + } + return s[:maxLen-3] + "..." +} + +// stripHTML removes HTML tags from a string +func stripHTML(s string) string { + // Remove HTML tags + tagRegex := regexp.MustCompile(`<[^>]*>`) + s = tagRegex.ReplaceAllString(s, "") + + // Decode common HTML entities + s = strings.ReplaceAll(s, "&amp;", "&") + s = strings.ReplaceAll(s, "&lt;", "<") + s = strings.ReplaceAll(s, "&gt;", ">") + s = strings.ReplaceAll(s, "&quot;", "\"") + s = strings.ReplaceAll(s, "&#39;", "'") + s = strings.ReplaceAll(s, "&nbsp;", " ") + + // Collapse whitespace + spaceRegex := regexp.MustCompile(`\s+`) + s = spaceRegex.ReplaceAllString(s, " ") + + return strings.TrimSpace(s) +} + +// DeriveHandleFromFeed generates an AT Protocol handle from a feed URL +// Format: {combined-path-and-hostname}.1440.news +// The PDS limits subdomains to 18 characters, so we prioritize meaningful parts +// Example: news.ycombinator.com/showrss → show-ycombinator.1440.news +func DeriveHandleFromFeed(feedURL string) string { + const maxSubdomainLen = 18 + + // Ensure we have a scheme for parsing + if !strings.Contains(feedURL, "://") { + feedURL = "https://" + feedURL + } + + u, err := url.Parse(feedURL) + if err != nil { + return "" + } + + hostname := strings.ToLower(u.Hostname()) + path := strings.ToLower(u.Path) + + // Remove common feed suffixes/extensions + suffixesToRemove := []string{".xml", ".rss", ".atom", ".json", "/rss", "/feed", "/atom", "/index"} + for _, suffix := range suffixesToRemove { + path = strings.TrimSuffix(path, suffix) + } + + // Split path into segments + segments := strings.Split(strings.Trim(path, "/"), "/") + + // Filter out common feed-related words + skipWords := map[string]bool{ + "rss": true, "feed": true, "feeds": true, "atom": true, + "xml": true, "default": true, "index": true, "services": true, + "nyt": true, // NYTimes uses /services/xml/rss/nyt/ + } + + var pathParts []string + for _, seg := range segments { + seg = cleanHandleSegment(seg) + if seg != "" && !skipWords[seg] { + pathParts = append(pathParts, seg) + } + } + + // Split hostname into parts, drop common TLDs to save space + hostParts := strings.Split(hostname, ".") + commonTLDs := map[string]bool{ + "com": true, "org": true, "net": true, "io": true, "co": true, + "edu": true, "gov": true, "uk": true, "de": true, "fr": true, + } + + // Remove TLD if it's common (to save characters) + if
len(hostParts) > 1 && commonTLDs[hostParts[len(hostParts)-1]] { + hostParts = hostParts[:len(hostParts)-1] + } + + // Build subdomain: path parts first (they differentiate feeds), then host parts + // Priority order for fitting in 18 chars: + // 1. Main hostname part (e.g., "ycombinator") + // 2. Path prefix (e.g., "show") + // 3. Hostname subdomain (e.g., "news") + + var subdomain string + + // Start with the main hostname (usually the second-to-last part, or first if only one) + mainHost := hostParts[len(hostParts)-1] + if len(hostParts) > 1 { + mainHost = hostParts[len(hostParts)-1] // e.g., "ycombinator" from "news.ycombinator" + } + + // If path parts exist, prepend them + if len(pathParts) > 0 { + subdomain = pathParts[0] + "-" + mainHost + } else if len(hostParts) > 1 { + // No path, use subdomain-hostname (e.g., "news-ycombinator") + subdomain = hostParts[0] + "-" + mainHost + } else { + subdomain = mainHost + } + + // If still too long, just use main hostname + if len(subdomain) > maxSubdomainLen { + subdomain = mainHost + } + + // Final safety: truncate if still too long + if len(subdomain) > maxSubdomainLen { + subdomain = subdomain[:maxSubdomainLen] + } + + subdomain = strings.Trim(subdomain, "-") + + // Collapse multiple hyphens + for strings.Contains(subdomain, "--") { + subdomain = strings.ReplaceAll(subdomain, "--", "-") + } + + return subdomain + ".1440.news" +} + +// cleanHandleSegment sanitizes a string for use in an AT Protocol handle segment +// Handle segments must be alphanumeric with hyphens, no leading/trailing hyphens +func cleanHandleSegment(s string) string { + // Remove file extensions + if idx := strings.LastIndex(s, "."); idx > 0 { + s = s[:idx] + } + + // Convert to lowercase + s = strings.ToLower(s) + + // Strip common feed prefixes/suffixes from the segment itself + // e.g., "showrss" → "show", "rssworld" → "world" + feedAffixes := []string{"rss", "feed", "atom", "xml"} + for _, affix := range feedAffixes { + // Strip suffix (e.g., "showrss" → "show") + if strings.HasSuffix(s, affix) && len(s) > len(affix) { + s = strings.TrimSuffix(s, affix) + break + } + // Strip prefix (e.g., "rssworld" → "world") + if strings.HasPrefix(s, affix) && len(s) > len(affix) { + s = strings.TrimPrefix(s, affix) + break + } + } + + // Replace underscores and other separators with hyphens + s = strings.ReplaceAll(s, "_", "-") + s = strings.ReplaceAll(s, " ", "-") + + // Remove any characters that aren't alphanumeric or hyphens + reg := regexp.MustCompile(`[^a-z0-9-]`) + s = reg.ReplaceAllString(s, "") + + // Collapse multiple hyphens + for strings.Contains(s, "--") { + s = strings.ReplaceAll(s, "--", "-") + } + + // Trim leading/trailing hyphens + s = strings.Trim(s, "-") + + return s +} + +// SplitHandle extracts the path prefix and hostname from a derived handle +// Example: show.news.ycombinator.com.1440.news → ("show", "news.ycombinator.com") +func SplitHandle(handle string) (prefix string, hostname string) { + // Remove .1440.news suffix + handle = strings.TrimSuffix(handle, ".1440.news") + + parts := strings.Split(handle, ".") + + // Try to find where hostname starts by looking for valid hostname patterns + if len(parts) >= 2 { + for i := 0; i < len(parts)-1; i++ { + remaining := strings.Join(parts[i:], ".") + if looksLikeHostname(remaining) { + if i > 0 { + prefix = strings.Join(parts[:i], ".") + } + hostname = remaining + return + } + } + } + + // Fallback: no prefix, entire thing is hostname + hostname = handle + return "", hostname +} + +func isLikelyTLDPart(s string) 
bool { + tlds := map[string]bool{ + "com": true, "org": true, "net": true, "edu": true, "gov": true, + "io": true, "co": true, "uk": true, "de": true, "fr": true, + "jp": true, "au": true, "ca": true, "nl": true, "se": true, + "news": true, "blog": true, "tech": true, "dev": true, + } + return tlds[s] +} + +func isTwoPartTLD(first, second string) bool { + twoPartTLDs := map[string]bool{ + "co.uk": true, "com.au": true, "co.jp": true, "co.nz": true, + "org.uk": true, "net.au": true, "com.br": true, + } + return twoPartTLDs[first+"."+second] +} + +func looksLikeHostname(s string) bool { + // A hostname typically has at least one dot and ends with a TLD-like part + parts := strings.Split(s, ".") + if len(parts) < 2 { + return false + } + lastPart := parts[len(parts)-1] + return isLikelyTLDPart(lastPart) +} + +// BlobRef represents a blob reference for profile images +type BlobRef struct { + Type string `json:"$type"` + Ref Link `json:"ref"` + MimeType string `json:"mimeType"` + Size int64 `json:"size"` +} + +type Link struct { + Link string `json:"$link"` +} + +// UploadBlob uploads an image to the PDS and returns a blob reference +func (p *Publisher) UploadBlob(session *PDSSession, data []byte, mimeType string) (*BlobRef, error) { + req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.uploadBlob", bytes.NewReader(data)) + if err != nil { + return nil, err + } + req.Header.Set("Content-Type", mimeType) + req.Header.Set("Authorization", "Bearer "+session.AccessJwt) + + resp, err := p.httpClient.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + respBody, _ := io.ReadAll(resp.Body) + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("upload blob failed: %s - %s", resp.Status, string(respBody)) + } + + var result struct { + Blob BlobRef `json:"blob"` + } + if err := json.Unmarshal(respBody, &result); err != nil { + return nil, err + } + + return &result.Blob, nil +} + +// UpdateProfile updates the profile for an account +func (p *Publisher) UpdateProfile(session *PDSSession, displayName, description string, avatar *BlobRef) error { + // First, get the current profile to preserve any existing fields + getReq, err := http.NewRequest("GET", + p.pdsHost+"/xrpc/com.atproto.repo.getRecord?repo="+session.DID+"&collection=app.bsky.actor.profile&rkey=self", + nil) + if err != nil { + return err + } + getReq.Header.Set("Authorization", "Bearer "+session.AccessJwt) + + getResp, err := p.httpClient.Do(getReq) + + var existingCID string + profile := map[string]interface{}{ + "$type": "app.bsky.actor.profile", + } + + if err == nil && getResp.StatusCode == http.StatusOK { + defer getResp.Body.Close() + var existing struct { + CID string `json:"cid"` + Value map[string]interface{} `json:"value"` + } + if json.NewDecoder(getResp.Body).Decode(&existing) == nil { + existingCID = existing.CID + profile = existing.Value + } + } else if getResp != nil { + getResp.Body.Close() + } + + // Update fields + if displayName != "" { + profile["displayName"] = displayName + } + if description != "" { + profile["description"] = description + } + if avatar != nil { + profile["avatar"] = avatar + } + + // Put the record + payload := map[string]interface{}{ + "repo": session.DID, + "collection": "app.bsky.actor.profile", + "rkey": "self", + "record": profile, + } + if existingCID != "" { + payload["swapRecord"] = existingCID + } + + body, err := json.Marshal(payload) + if err != nil { + return err + } + + req, err := http.NewRequest("POST", 
p.pdsHost+"/xrpc/com.atproto.repo.putRecord", bytes.NewReader(body)) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer "+session.AccessJwt) + + resp, err := p.httpClient.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + + respBody, _ := io.ReadAll(resp.Body) + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("update profile failed: %s - %s", resp.Status, string(respBody)) + } + + return nil +} + +// FetchFavicon downloads a favicon/icon from a URL +func FetchFavicon(siteURL string) ([]byte, string, error) { + // Try common favicon locations + if !strings.HasPrefix(siteURL, "http") { + siteURL = "https://" + siteURL + } + + u, err := url.Parse(siteURL) + if err != nil { + return nil, "", err + } + + baseURL := u.Scheme + "://" + u.Host + + // Try apple-touch-icon first (usually higher quality) + iconURLs := []string{ + baseURL + "/apple-touch-icon.png", + baseURL + "/apple-touch-icon-precomposed.png", + baseURL + "/favicon.png", + baseURL + "/favicon.ico", + } + + client := &http.Client{Timeout: 10 * time.Second} + + for _, iconURL := range iconURLs { + resp, err := client.Get(iconURL) + if err != nil { + continue + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + continue + } + + data, err := io.ReadAll(resp.Body) + if err != nil { + continue + } + + // Determine mime type + contentType := resp.Header.Get("Content-Type") + if contentType == "" { + if strings.HasSuffix(iconURL, ".png") { + contentType = "image/png" + } else if strings.HasSuffix(iconURL, ".ico") { + contentType = "image/x-icon" + } else { + contentType = "image/png" // default + } + } + + return data, contentType, nil + } + + return nil, "", fmt.Errorf("no favicon found for %s", siteURL) +} diff --git a/static/dashboard.css b/static/dashboard.css index c5668cc..f6be04f 100644 --- a/static/dashboard.css +++ b/static/dashboard.css @@ -53,3 +53,31 @@ td { font-size: 13px; color: #ffffff; } #searchInput::placeholder { color: #555; } .search-host { margin-bottom: 10px; } .search-feed:hover { background: #1a1a1a; } + +/* Command buttons */ +.cmd-btn { + background: #1a1a1a; + border: 1px solid #333; + border-radius: 4px; + color: #0af; + padding: 6px 12px; + margin-right: 8px; + margin-bottom: 4px; + font-size: 13px; + font-family: monospace; + cursor: pointer; + transition: background 0.2s, border-color 0.2s; +} +.cmd-btn:hover { + background: #252525; + border-color: #0af; +} +.cmd-btn:active { + background: #0af; + color: #000; +} + +/* Visit link */ +.visit-link:hover { + color: #0af !important; +} diff --git a/static/dashboard.js b/static/dashboard.js index 796d452..37348d6 100644 --- a/static/dashboard.js +++ b/static/dashboard.js @@ -10,508 +10,643 @@ function initDashboard() { return div.innerHTML; } - // All domains state - let allDomainsOffset = 0; - let allDomainsLoading = false; - let allDomainsEnd = false; - let expandedDomain = null; - let expandedFeed = null; - const PAGE_SIZE = 100; - const PREFETCH_THRESHOLD = 100; // Prefetch when within 100 domains of bottom + // Current filter state + let currentFilters = {}; + let infiniteScrollState = null; + let isLoadingMore = false; - // Search state - let searchTimeout = null; - let isSearching = false; + // Update command input to reflect current filters + function updateCommandInput() { + const parts = []; + if (currentFilters.tld) parts.push('tld:.' 
+ currentFilters.tld); + if (currentFilters.domain) parts.push('domain:' + currentFilters.domain); + if (currentFilters.feedStatus) parts.push('feeds:' + currentFilters.feedStatus); + if (currentFilters.domainStatus) parts.push('domains:' + currentFilters.domainStatus); + document.getElementById('commandInput').value = parts.length > 0 ? parts.join(' ') : '/help'; + } - async function loadMoreDomains() { - if (allDomainsLoading || allDomainsEnd) return; + // Parse command into filters + function parseCommand(cmd) { + const filters = {}; + const parts = cmd.trim().toLowerCase().split(/\s+/); - allDomainsLoading = true; - const loadingEl = document.getElementById('allDomainsLoading'); - loadingEl.style.display = 'block'; - - try { - const response = await fetch('/api/allDomains?offset=' + allDomainsOffset + '&limit=' + PAGE_SIZE); - const domains = await response.json(); - - if (!domains || domains.length === 0) { - allDomainsEnd = true; - loadingEl.style.display = 'none'; - return; + for (const part of parts) { + if (part.startsWith('tld:.') || part.startsWith('tld:')) { + filters.tld = part.replace('tld:.', '').replace('tld:', ''); + } else if (part.startsWith('domain:')) { + filters.domain = part.substring(7); + } else if (part.startsWith('feeds:')) { + filters.feedStatus = part.substring(6); + } else if (part.startsWith('domains:')) { + filters.domainStatus = part.substring(8); + } else if (part === 'active' || part === 'error' || part === 'dead') { + // Shorthand for feed status + filters.feedStatus = part; + } else if (part === 'unchecked' || part === 'checked') { + // Shorthand for domain status + filters.domainStatus = part; + } else if (part === '/tlds' || part === '/tld') { + filters.showTlds = true; + } else if (part === '/help') { + filters.help = true; } + } - const container = document.getElementById('allDomains'); - domains.forEach(d => { - const row = document.createElement('div'); - row.className = 'domain-row'; - row.innerHTML = - '
' + - '' + escapeHtml(d.host) + '' + - '' + commaFormat(d.feeds_found) + '' + - '
' + - ''; + return filters; + } - row.querySelector('.stat-row').addEventListener('click', () => toggleDomainFeeds(d.host, row)); - container.appendChild(row); + // Render breadcrumb based on current filters + function renderBreadcrumb() { + const breadcrumb = document.getElementById('breadcrumb'); + const parts = []; + + parts.push('home'); + + if (currentFilters.tld) { + parts.push('.' + escapeHtml(currentFilters.tld) + ''); + } + if (currentFilters.domain) { + parts.push('' + escapeHtml(currentFilters.domain) + ''); + } + if (currentFilters.feedStatus) { + parts.push('feeds:' + escapeHtml(currentFilters.feedStatus) + ''); + } + if (currentFilters.domainStatus) { + parts.push('domains:' + escapeHtml(currentFilters.domainStatus) + ''); + } + + breadcrumb.innerHTML = parts.join(' / '); + breadcrumb.style.display = parts.length > 1 ? 'block' : 'none'; + + // Add click handlers + breadcrumb.querySelectorAll('.bc-item').forEach(el => { + el.addEventListener('click', () => { + const action = el.dataset.action; + if (action === 'home') { + currentFilters = {}; + showHelp(); + } else if (action === 'tld') { + delete currentFilters.domain; + delete currentFilters.feedStatus; + delete currentFilters.domainStatus; + executeFilters(); + } else if (action === 'domain') { + delete currentFilters.feedStatus; + executeFilters(); + } }); + }); + } - allDomainsOffset += domains.length; - loadingEl.style.display = 'none'; + // Infinite scroll + function setupInfiniteScroll(loadMoreFn) { + infiniteScrollState = { loadMore: loadMoreFn, ended: false }; + } - // If we got fewer than PAGE_SIZE, we've reached the end - if (domains.length < PAGE_SIZE) { - allDomainsEnd = true; - } - } catch (err) { - console.error('Failed to load domains:', err); - } finally { - allDomainsLoading = false; + function clearInfiniteScroll() { + infiniteScrollState = null; + } + + function checkInfiniteScroll() { + if (!infiniteScrollState || infiniteScrollState.ended || isLoadingMore) return; + const scrollBottom = window.scrollY + window.innerHeight; + const docHeight = document.documentElement.scrollHeight; + if (docHeight - scrollBottom < 300) { + isLoadingMore = true; + infiniteScrollState.loadMore().finally(() => { + isLoadingMore = false; + }); } } - async function toggleDomainFeeds(host, rowEl) { - const feedsDiv = rowEl.querySelector('.domain-feeds'); + window.addEventListener('scroll', checkInfiniteScroll); - // Close previously expanded domain - if (expandedDomain && expandedDomain !== rowEl) { - expandedDomain.querySelector('.domain-feeds').style.display = 'none'; + // Render helpers + function renderDomainRow(d) { + let html = '
'; + html += '
'; + html += '' + escapeHtml(d.host) + ''; + html += ''; + html += '' + commaFormat(d.feed_count) + ' feeds'; + html += ''; + html += '
'; + if (d.status === 'error' && d.last_error) { + html += '
Error: ' + escapeHtml(d.last_error) + '
'; + } else if (d.status === 'unchecked') { + html += '
Pending...
'; + } + html += '
'; + return html; + } + + function renderFeedRow(f) { + let html = '
'; + html += '
'; + html += '' + escapeHtml(f.url) + ''; + html += ''; + html += '
'; + if (f.title) html += '
' + escapeHtml(f.title) + '
'; + if (f.source_host) { + html += '
from ' + escapeHtml(f.source_host) + ''; + html += '
'; + } + let statusParts = [f.type || 'unknown']; + if (f.status) { + const color = f.status === 'active' ? '#0a0' : (f.status === 'error' ? '#f66' : '#888'); + statusParts.push('' + escapeHtml(f.status) + ''); + } + if (f.item_count > 0) statusParts.push(commaFormat(f.item_count) + ' items'); + html += '
' + statusParts.join(' · ') + '
'; + if (f.error_count > 0 && f.last_error) { + html += '
Error (' + f.error_count + '): ' + escapeHtml(f.last_error) + '
'; + } + html += '
'; + return html; + } + + function renderTldRow(t) { + return '
' + + '.' + escapeHtml(t.tld) + '' + + '' + commaFormat(t.domain_count) + ' domains, ' + commaFormat(t.feed_count) + ' feeds
'; + } + + function attachDomainHandlers(container) { + container.querySelectorAll('.domain-row-cmd:not(.handled)').forEach(el => { + el.classList.add('handled'); + el.querySelector('.domain-name').addEventListener('click', () => { + currentFilters.domain = el.dataset.host; + if (!currentFilters.tld && el.dataset.tld) currentFilters.tld = el.dataset.tld; + delete currentFilters.domainStatus; + executeFilters(); + }); + el.addEventListener('mouseenter', () => el.style.background = '#1a1a1a'); + el.addEventListener('mouseleave', () => el.style.background = 'transparent'); + const btn = el.querySelector('.revisit-btn'); + if (btn) { + btn.addEventListener('click', async (e) => { + e.stopPropagation(); + btn.disabled = true; + btn.textContent = '...'; + try { + await fetch('/api/revisitDomain?host=' + encodeURIComponent(btn.dataset.host)); + btn.textContent = 'queued'; + btn.style.color = '#0a0'; + } catch (err) { + btn.textContent = 'error'; + btn.style.color = '#f66'; + } + }); + } + }); + } + + function attachFeedHandlers(container) { + container.querySelectorAll('.feed-row-cmd:not(.handled)').forEach(el => { + el.classList.add('handled'); + el.querySelector('.feed-name').addEventListener('click', () => { + showFeedInfo(el.dataset.url); + }); + el.addEventListener('mouseenter', () => el.style.background = '#1a1a1a'); + el.addEventListener('mouseleave', () => el.style.background = 'transparent'); + }); + } + + function attachTldHandlers(container) { + container.querySelectorAll('.tld-row:not(.handled)').forEach(el => { + el.classList.add('handled'); + el.addEventListener('click', () => { + currentFilters = { tld: el.dataset.tld }; + executeFilters(); + }); + el.addEventListener('mouseenter', () => el.style.background = '#1a1a1a'); + el.addEventListener('mouseleave', () => el.style.background = 'transparent'); + }); + } + + // Show help + function showHelp() { + currentFilters = {}; + clearInfiniteScroll(); + updateCommandInput(); + renderBreadcrumb(); + document.getElementById('output').innerHTML = '
Type a command and press Enter

Examples:
tld:.com
tld:.com active
tld:.com domains:error
domain:example.com
'; + } + + // Show TLDs list + async function showTLDs() { + currentFilters = {}; + clearInfiniteScroll(); + updateCommandInput(); + document.getElementById('commandInput').value = '/tlds'; + renderBreadcrumb(); + + const output = document.getElementById('output'); + output.innerHTML = '
Loading TLDs...
'; + + try { + const response = await fetch('/api/tlds'); + const tlds = await response.json(); + + if (!tlds || tlds.length === 0) { + output.innerHTML = '
No TLDs found
'; + return; + } + + let html = '
'; + tlds.forEach(t => html += renderTldRow(t)); + html += '
'; + output.innerHTML = html; + attachTldHandlers(output.querySelector('.tld-list')); + } catch (err) { + output.innerHTML = '
Error: ' + escapeHtml(err.message) + '
'; + } + } + + // Execute current filters + async function executeFilters() { + clearInfiniteScroll(); + updateCommandInput(); + renderBreadcrumb(); + + const output = document.getElementById('output'); + + // Determine what to show + const showFeeds = currentFilters.feedStatus || currentFilters.domain; + const showDomains = currentFilters.domainStatus || (!showFeeds && currentFilters.tld); + + if (!currentFilters.tld && !currentFilters.domain && !currentFilters.feedStatus && !currentFilters.domainStatus) { + showHelp(); + return; } - // Toggle current - if (feedsDiv.style.display === 'none') { - feedsDiv.style.display = 'block'; - feedsDiv.innerHTML = '
Loading feeds...
'; - expandedDomain = rowEl; + // Build API URL + const params = new URLSearchParams(); + if (currentFilters.tld) params.set('tld', currentFilters.tld); + if (currentFilters.domain) params.set('domain', currentFilters.domain); + if (currentFilters.feedStatus) params.set('feedStatus', currentFilters.feedStatus); + if (currentFilters.domainStatus) params.set('domainStatus', currentFilters.domainStatus); + if (showFeeds) params.set('show', 'feeds'); + else if (showDomains) params.set('show', 'domains'); + output.innerHTML = '
Loading...
'; + + let offset = 0; + const limit = 100; + + async function loadMore() { try { - const response = await fetch('/api/domainFeeds?host=' + encodeURIComponent(host)); - const feeds = await response.json(); + params.set('limit', limit); + params.set('offset', offset); + const response = await fetch('/api/filter?' + params.toString()); + const result = await response.json(); - if (!feeds || feeds.length === 0) { - feedsDiv.innerHTML = '
No feeds found
'; + if (!result.data || result.data.length === 0) { + infiniteScrollState.ended = true; + document.getElementById('infiniteLoader').textContent = offset === 0 ? 'No results found' : 'End of list'; + return; + } + + const container = output.querySelector('.result-list'); + + if (result.type === 'domains') { + result.data.forEach(d => container.insertAdjacentHTML('beforeend', renderDomainRow(d))); + attachDomainHandlers(container); } else { - feedsDiv.innerHTML = ''; - feeds.forEach(f => { - const feedItem = document.createElement('div'); - feedItem.className = 'feed-item'; - feedItem.style.cssText = 'padding: 5px 10px; border-top: 1px solid #333; cursor: pointer;'; - feedItem.innerHTML = - '
' + - '
' + escapeHtml(f.url) + '
' + - (f.title ? '
' + escapeHtml(f.title) + '
' : '') + - '
' + (f.type || 'unknown') + '
' + - '
' + - ''; + result.data.forEach(f => container.insertAdjacentHTML('beforeend', renderFeedRow(f))); + attachFeedHandlers(container); + } - feedItem.querySelector('.feed-header').addEventListener('click', (e) => { - e.stopPropagation(); - toggleFeedInfo(f.url, feedItem); - }); - feedsDiv.appendChild(feedItem); - }); + offset += result.data.length; + + if (result.data.length < limit) { + infiniteScrollState.ended = true; + document.getElementById('infiniteLoader').textContent = 'End of list'; } } catch (err) { - feedsDiv.innerHTML = '
Error loading feeds
'; + document.getElementById('infiniteLoader').textContent = 'Error loading'; } - } else { - feedsDiv.style.display = 'none'; - expandedDomain = null; } + + await loadMore(); + setupInfiniteScroll(loadMore); } - async function toggleFeedInfo(feedUrl, feedItemEl) { - const detailsDiv = feedItemEl.querySelector('.feed-details'); + // Show feed info + async function showFeedInfo(feedUrl) { + clearInfiniteScroll(); + renderBreadcrumb(); - // Close previously expanded feed - if (expandedFeed && expandedFeed !== feedItemEl) { - expandedFeed.querySelector('.feed-details').style.display = 'none'; - } - - // Toggle current - if (detailsDiv.style.display === 'none') { - detailsDiv.style.display = 'block'; - detailsDiv.innerHTML = '
Loading feed info...
'; - expandedFeed = feedItemEl; - - // Scroll the feed item to the top of the viewport - feedItemEl.scrollIntoView({ behavior: 'smooth', block: 'start' }); - - try { - // Fetch feed info and items in parallel - const [infoResponse, itemsResponse] = await Promise.all([ - fetch('/api/feedInfo?url=' + encodeURIComponent(feedUrl)), - fetch('/api/feedItems?url=' + encodeURIComponent(feedUrl) + '&limit=50') - ]); - const info = await infoResponse.json(); - const items = await itemsResponse.json(); - - let html = '
'; - - if (info.description) { - html += '
' + escapeHtml(info.description) + '
'; - } - - html += ''; - - if (info.siteUrl) { - html += ''; - } - if (info.language) { - html += ''; - } - if (info.status) { - html += ''; - } - if (info.itemCount) { - html += ''; - } - if (info.avgPostFreqHrs) { - html += ''; - } - if (info.ttlMinutes) { - html += ''; - } - if (info.updatePeriod) { - let updateStr = info.updatePeriod; - if (info.updateFreq) updateStr += ' (' + info.updateFreq + ')'; - html += ''; - } - if (info.lastBuildDate) { - html += ''; - } - if (info.newestItemDate) { - html += ''; - } - if (info.oldestItemDate) { - html += ''; - } - if (info.discoveredAt) { - html += ''; - } - if (info.lastCrawledAt) { - html += ''; - } - if (info.errorCount > 0) { - html += ''; - } - if (info.lastError) { - html += ''; - } - - html += '
Site' + escapeHtml(info.siteUrl) + '
Language' + escapeHtml(info.language) + '
Status' + escapeHtml(info.status) + '
Items' + commaFormat(info.itemCount) + '
Avg Post Freq' + info.avgPostFreqHrs.toFixed(1) + ' hrs
TTL' + info.ttlMinutes + ' min
Update' + escapeHtml(updateStr) + '
Last Build' + escapeHtml(info.lastBuildDate) + '
Newest Item' + escapeHtml(info.newestItemDate) + '
Oldest Item' + escapeHtml(info.oldestItemDate) + '
Discovered' + escapeHtml(info.discoveredAt) + '
Last Crawled' + escapeHtml(info.lastCrawledAt) + '
Errors' + info.errorCount + '
Last Error' + escapeHtml(info.lastError) + '
'; - - // Display items - if (items && items.length > 0) { - html += '
'; - html += '
Recent Items (' + items.length + ')
'; - - items.forEach(item => { - html += '
'; - - // Title with link - if (item.title) { - if (item.link) { - html += ''; - } else { - html += '
' + escapeHtml(item.title) + '
'; - } - } else if (item.link) { - html += ''; - } - - // Metadata line (date, author) - let meta = []; - if (item.pub_date) { - const date = new Date(item.pub_date); - meta.push(date.toLocaleDateString() + ' ' + date.toLocaleTimeString()); - } - if (item.author) { - meta.push(escapeHtml(item.author)); - } - if (meta.length > 0) { - html += '
' + meta.join(' • ') + '
'; - } - - html += '
'; - }); - - html += '
'; - } - - html += '
'; - - detailsDiv.innerHTML = html; - } catch (err) { - detailsDiv.innerHTML = '
Error loading feed info
'; - } - } else { - detailsDiv.style.display = 'none'; - expandedFeed = null; - } - } - - // Infinite scroll handler with prefetch (uses window scroll) - function setupInfiniteScroll() { - window.addEventListener('scroll', () => { - // Check if we're near the bottom of the page - const scrollBottom = window.scrollY + window.innerHeight; - const docHeight = document.documentElement.scrollHeight; - const remainingPixels = docHeight - scrollBottom; - - // Prefetch when within 500px of the bottom - if (remainingPixels < 500) { - loadMoreDomains(); - } - }); - } - - // Search functionality - function setupSearch() { - const searchInput = document.getElementById('searchInput'); - const searchResults = document.getElementById('searchResults'); - const domainsContainer = document.getElementById('allDomainsContainer'); - - if (!searchInput || !searchResults || !domainsContainer) { - console.error('Search elements not found'); - return; - } - - searchInput.addEventListener('input', (e) => { - const query = e.target.value.trim(); - - // Clear previous timeout - if (searchTimeout) { - clearTimeout(searchTimeout); - } - - // If empty, show domains list - if (!query) { - searchResults.style.display = 'none'; - domainsContainer.style.display = 'block'; - isSearching = false; - return; - } - - // Debounce search - searchTimeout = setTimeout(() => performSearch(query), 300); - }); - - // Handle Enter key - searchInput.addEventListener('keydown', (e) => { - if (e.key === 'Enter') { - const query = e.target.value.trim(); - if (query) { - if (searchTimeout) clearTimeout(searchTimeout); - performSearch(query); - } - } - }); - } - - async function performSearch(query) { - const searchResults = document.getElementById('searchResults'); - const domainsContainer = document.getElementById('allDomainsContainer'); - - isSearching = true; - domainsContainer.style.display = 'none'; - searchResults.style.display = 'block'; - searchResults.innerHTML = '
Searching...
'; - - try { - const response = await fetch('/api/search?q=' + encodeURIComponent(query) + '&limit=200'); - const results = await response.json(); - - if (!results || results.length === 0) { - searchResults.innerHTML = '
No results found
'; - return; - } - - // Group results by host - const byHost = {}; - results.forEach(r => { - const host = r.feed.source_host || 'unknown'; - if (!byHost[host]) { - byHost[host] = []; - } - byHost[host].push(r); - }); - - // Render results - searchResults.innerHTML = ''; - - Object.keys(byHost).sort().forEach(host => { - const hostDiv = document.createElement('div'); - hostDiv.className = 'search-host'; - - // Host header - const hostHeader = document.createElement('div'); - hostHeader.className = 'stat-row'; - hostHeader.style.cssText = 'cursor: pointer; background: #1a1a1a; padding: 8px; margin-bottom: 2px;'; - hostHeader.innerHTML = '' + escapeHtml(host) + '' + byHost[host].length + ' feed(s)'; - - const feedsContainer = document.createElement('div'); - feedsContainer.style.display = 'block'; - - byHost[host].forEach(result => { - const feedDiv = document.createElement('div'); - feedDiv.className = 'search-feed'; - feedDiv.style.cssText = 'padding: 8px 8px 8px 20px; border-bottom: 1px solid #222;'; - - // Feed header - let feedHtml = '
' + escapeHtml(result.feed.url) + '
'; - if (result.feed.title) { - feedHtml += '
' + escapeHtml(result.feed.title) + '
'; - } - if (result.feed.description) { - feedHtml += '
' + escapeHtml(result.feed.description.substring(0, 200)) + '
'; - } - - // Items - if (result.items && result.items.length > 0) { - feedHtml += '
'; - result.items.forEach(item => { - feedHtml += '
'; - if (item.title) { - if (item.link) { - feedHtml += '' + escapeHtml(item.title) + ''; - } else { - feedHtml += '' + escapeHtml(item.title) + ''; - } - } - let meta = []; - if (item.pub_date) { - meta.push(item.pub_date.substring(0, 10)); - } - if (item.author) { - meta.push(escapeHtml(item.author)); - } - if (meta.length > 0) { - feedHtml += '
' + meta.join(' • ') + '
'; - } - feedHtml += '
'; - }); - feedHtml += '
'; - } - - feedDiv.innerHTML = feedHtml; - - // Click on feed URL to toggle full feed info - feedDiv.querySelector('.feed-url').addEventListener('click', () => { - toggleSearchFeedInfo(result.feed.url, feedDiv); - }); - - feedsContainer.appendChild(feedDiv); - }); - - hostHeader.addEventListener('click', () => { - feedsContainer.style.display = feedsContainer.style.display === 'none' ? 'block' : 'none'; - }); - - hostDiv.appendChild(hostHeader); - hostDiv.appendChild(feedsContainer); - searchResults.appendChild(hostDiv); - }); - - } catch (err) { - console.error('Search failed:', err); - searchResults.innerHTML = '
Search failed: ' + escapeHtml(err.message) + '
'; - } - } - - async function toggleSearchFeedInfo(feedUrl, feedDiv) { - let detailsDiv = feedDiv.querySelector('.feed-details-expanded'); - - if (detailsDiv) { - detailsDiv.remove(); - return; - } - - detailsDiv = document.createElement('div'); - detailsDiv.className = 'feed-details-expanded'; - detailsDiv.style.cssText = 'padding: 10px; background: #111; margin-top: 8px; border-radius: 4px;'; - detailsDiv.innerHTML = '
Loading feed info...
'; - feedDiv.appendChild(detailsDiv); + const output = document.getElementById('output'); + output.innerHTML = '
Loading feed info...
'; try { const [infoResponse, itemsResponse] = await Promise.all([ fetch('/api/feedInfo?url=' + encodeURIComponent(feedUrl)), - fetch('/api/feedItems?url=' + encodeURIComponent(feedUrl) + '&limit=20') + fetch('/api/feedItems?url=' + encodeURIComponent(feedUrl) + '&limit=100') ]); const info = await infoResponse.json(); const items = await itemsResponse.json(); - let html = ''; - if (info.siteUrl) html += ''; - if (info.language) html += ''; - if (info.status) html += ''; - if (info.itemCount) html += ''; - if (info.avgPostFreqHrs) html += ''; - if (info.newestItemDate) html += ''; + let html = '
'; + html += '
' + escapeHtml(feedUrl) + '
'; + if (info.title) html += '
' + escapeHtml(info.title) + '
'; + if (info.description) html += '
' + escapeHtml(info.description) + '
'; + + html += '
Site' + escapeHtml(info.siteUrl) + '
Language' + escapeHtml(info.language) + '
Status' + escapeHtml(info.status) + '
Items' + commaFormat(info.itemCount) + '
Avg Freq' + info.avgPostFreqHrs.toFixed(1) + ' hrs
Newest' + escapeHtml(info.newestItemDate) + '
'; + const addRow = (label, value, color) => value ? '' : ''; + html += addRow('Type', info.type); + html += addRow('Status', info.status, info.status === 'active' ? '#0a0' : '#f66'); + html += addRow('Language', info.language); + html += addRow('Site URL', info.siteUrl); + html += addRow('Items', info.itemCount ? commaFormat(info.itemCount) : null); + html += addRow('Avg Post Freq', info.avgPostFreqHrs ? info.avgPostFreqHrs.toFixed(1) + ' hrs' : null); + html += addRow('Discovered', info.discoveredAt); + html += addRow('Last Crawled', info.lastCrawledAt); + if (info.errorCount > 0) { + html += addRow('Errors', info.errorCount, '#f66'); + html += addRow('Last Error', info.lastError, '#f66'); + } html += '
' + label + '' + escapeHtml(String(value)) + '
'; if (items && items.length > 0) { - html += '
'; - html += '
All Items (' + items.length + ')
'; + html += '
'; + html += '
Recent Items (' + items.length + ')
'; items.forEach(item => { - html += '
'; + html += '
'; if (item.title && item.link) { - html += '' + escapeHtml(item.title) + ''; + html += ''; } else if (item.title) { - html += '' + escapeHtml(item.title) + ''; + html += '
' + escapeHtml(item.title) + '
'; } + let meta = []; + if (item.pub_date) meta.push(new Date(item.pub_date).toLocaleDateString() + ' ' + new Date(item.pub_date).toLocaleTimeString()); + if (item.author) meta.push(escapeHtml(item.author)); + if (meta.length > 0) html += '
' + meta.join(' • ') + '
'; html += '
'; }); html += '
'; } - - detailsDiv.innerHTML = html; + html += '
'; + output.innerHTML = html; } catch (err) { - detailsDiv.innerHTML = '
Failed to load feed info
'; + output.innerHTML = '
Error: ' + escapeHtml(err.message) + '
'; } } + // Highlight matching text + function highlightMatch(text, query) { + if (!text || !query) return escapeHtml(text); + const escaped = escapeHtml(text); + const regex = new RegExp('(' + query.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') + ')', 'gi'); + return escaped.replace(regex, '$1'); + } + + // Perform search + async function performSearch(query) { + currentFilters = {}; + clearInfiniteScroll(); + updateCommandInput(); + document.getElementById('commandInput').value = query; + renderBreadcrumb(); + + const output = document.getElementById('output'); + output.innerHTML = '
Searching...
'; + + try { + const response = await fetch('/api/search?q=' + encodeURIComponent(query) + '&limit=500'); + const results = await response.json(); + + if (!results || results.length === 0) { + output.innerHTML = '
No results found
'; + return; + } + + let html = '
'; + html += '
' + results.length + ' feed(s) found
'; + + results.forEach(r => { + const f = r.feed; + html += '
'; + html += '
'; + html += '' + highlightMatch(f.url, query) + ''; + html += ''; + html += '
'; + if (f.title) html += '
' + highlightMatch(f.title, query) + '
'; + if (f.source_host) { + html += '
from ' + highlightMatch(f.source_host, query) + '
'; + } + let statusParts = []; + if (f.type) statusParts.push(f.type); + if (f.status) statusParts.push('' + f.status + ''); + if (f.item_count > 0) statusParts.push(commaFormat(f.item_count) + ' items'); + if (statusParts.length > 0) html += '
' + statusParts.join(' · ') + '
'; + + if (r.items && r.items.length > 0) { + html += '
'; + html += '
' + r.items.length + ' matching item(s)
'; + r.items.forEach(item => { + html += '
'; + if (item.title && item.link) html += '' + highlightMatch(item.title, query) + ''; + else if (item.title) html += '' + highlightMatch(item.title, query) + ''; + if (item.pub_date) html += '' + item.pub_date.substring(0, 10) + ''; + if (item.description) html += '
' + highlightMatch(item.description.substring(0, 300), query) + '
'; + html += '
'; + }); + html += '
'; + } + html += '
'; + }); + html += '
'; + output.innerHTML = html; + + output.querySelectorAll('.search-feed-url').forEach(el => { + el.addEventListener('click', () => showFeedInfo(el.dataset.url)); + }); + } catch (err) { + output.innerHTML = '
Search error: ' + escapeHtml(err.message) + '
'; + } + } + + // Show publish management view + async function showPublish() { + currentFilters = {}; + clearInfiniteScroll(); + updateCommandInput(); + document.getElementById('commandInput').value = '/publish'; + renderBreadcrumb(); + + const output = document.getElementById('output'); + output.innerHTML = '
Loading publish data...
'; + + try { + const [candidatesRes, passedRes] = await Promise.all([ + fetch('/api/publishCandidates?limit=50'), + fetch('/api/publishEnabled') + ]); + const candidates = await candidatesRes.json(); + const passed = await passedRes.json(); + + let html = '
'; + + // Passed feeds (approved for publishing) + html += '
'; + html += '
✓ Approved for Publishing (' + passed.length + ')
'; + if (passed.length === 0) { + html += '
No feeds approved yet
'; + } else { + passed.forEach(f => { + html += '
'; + html += '
'; + html += '
' + escapeHtml(f.title || f.url) + '
'; + html += '
' + escapeHtml(f.url) + '
'; + html += '
→ ' + escapeHtml(f.account) + ' (' + f.unpublished_count + ' unpublished)
'; + html += '
'; + html += ''; + html += '
'; + }); + } + html += '
'; + + // Candidates (held for review) + html += '
'; + html += '
⏳ Held for Review (' + candidates.length + ')
'; + if (candidates.length === 0) { + html += '
No candidates held
'; + } else { + candidates.forEach(f => { + html += '
'; + html += '
'; + html += '
'; + html += '
' + escapeHtml(f.title || f.url) + '
'; + html += '
' + escapeHtml(f.url) + '
'; + html += '
→ ' + escapeHtml(f.derived_handle) + '
'; + html += '
' + escapeHtml(f.source_host) + ' · ' + f.item_count + ' items · ' + escapeHtml(f.category) + '
'; + html += '
'; + html += ''; + html += ''; + html += '
'; + html += '
'; + }); + } + html += '
'; + + html += '
'; + output.innerHTML = html; + + // Attach handlers for pass/fail buttons + output.querySelectorAll('.status-btn').forEach(btn => { + btn.addEventListener('click', async () => { + const url = btn.dataset.url; + const status = btn.dataset.status; + btn.disabled = true; + btn.textContent = '...'; + try { + const response = await fetch('/api/setPublishStatus?url=' + encodeURIComponent(url) + '&status=' + status); + if (response.ok) { + // Refresh the view + showPublish(); + } else { + btn.textContent = 'Error'; + btn.style.background = '#600'; + } + } catch (err) { + btn.textContent = 'Error'; + btn.style.background = '#600'; + } + }); + }); + + } catch (err) { + output.innerHTML = '
Error: ' + escapeHtml(err.message) + '
'; + } + } + + // Process command + function processCommand(cmd) { + const trimmed = cmd.trim(); + if (!trimmed || trimmed === '/help') { + showHelp(); + return; + } + if (trimmed === '/tlds' || trimmed === '/tld') { + showTLDs(); + return; + } + if (trimmed === '/publish') { + showPublish(); + return; + } + + // Check if it looks like a filter command + const hasFilter = trimmed.includes(':') || ['active', 'error', 'dead', 'unchecked', 'checked'].some(s => trimmed.toLowerCase().includes(s)); + + if (hasFilter) { + currentFilters = parseCommand(trimmed); + executeFilters(); + } else { + // Treat as search + performSearch(trimmed); + } + } + + // Setup command input + function setupCommandInput() { + const input = document.getElementById('commandInput'); + input.addEventListener('keydown', (e) => { if (e.key === 'Enter') processCommand(input.value); }); + input.addEventListener('focus', () => input.select()); + document.querySelectorAll('.cmd-btn').forEach(btn => { + btn.addEventListener('click', () => { + const cmd = btn.dataset.cmd; + // Special commands that reset filters + if (cmd === '/tlds' || cmd === '/publish') { + currentFilters = {}; + input.value = cmd; + processCommand(cmd); + return; + } + // Status buttons add to current filters + const btnFilters = parseCommand(cmd); + if (btnFilters.domainStatus) { + currentFilters.domainStatus = btnFilters.domainStatus; + delete currentFilters.feedStatus; // Can't have both + delete currentFilters.domain; // Show domains, not feeds for a domain + } + if (btnFilters.feedStatus) { + currentFilters.feedStatus = btnFilters.feedStatus; + delete currentFilters.domainStatus; // Can't have both + } + updateCommandInput(); + executeFilters(); + }); + }); + } + + // Stats update async function updateStats() { try { const response = await fetch('/api/stats'); const stats = await response.json(); - - // Update domain stats document.getElementById('totalDomains').textContent = commaFormat(stats.total_domains); document.getElementById('checkedDomains').textContent = commaFormat(stats.checked_domains); document.getElementById('uncheckedDomains').textContent = commaFormat(stats.unchecked_domains); document.getElementById('crawlRate').textContent = commaFormat(stats.crawl_rate); document.getElementById('checkRate').textContent = commaFormat(stats.check_rate); - - // Update progress bar - const progress = stats.total_domains > 0 - ? (stats.checked_domains * 100 / stats.total_domains).toFixed(1) - : 0; + const progress = stats.total_domains > 0 ? 
(stats.checked_domains * 100 / stats.total_domains).toFixed(1) : 0; document.getElementById('crawlProgress').style.width = progress + '%'; - - // Update feed stats document.getElementById('totalFeeds').textContent = commaFormat(stats.total_feeds); document.getElementById('rssFeeds').textContent = commaFormat(stats.rss_feeds); document.getElementById('atomFeeds').textContent = commaFormat(stats.atom_feeds); document.getElementById('unknownFeeds').textContent = commaFormat(stats.unknown_feeds); - - // Update timestamp const updatedAt = new Date(stats.updated_at); - document.getElementById('updatedAt').textContent = 'Last updated: ' + - updatedAt.toISOString().replace('T', ' ').substring(0, 19); - + document.getElementById('updatedAt').textContent = 'Last updated: ' + updatedAt.toISOString().replace('T', ' ').substring(0, 19); } catch (err) { console.error('Failed to update stats:', err); } } // Initialize - try { - setupSearch(); - } catch (e) { - console.error('setupSearch failed:', e); - } - setupInfiniteScroll(); - loadMoreDomains(); + setupCommandInput(); + showHelp(); updateStats(); setInterval(updateStats, 1000); }
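
Note on the handle-derivation helpers in publisher.go: cleanHandleSegment's affix-stripping rules are only documented in its comments, so a small table-driven test makes them explicit. The sketch below is illustrative and not part of this patch; it assumes the crawler builds as a single package main (so the test sits in a publisher_test.go alongside publisher.go), and the "World_News.xml" case is an added example, not one taken from the code comments.

// publisher_test.go (sketch) — exercises the examples given in
// cleanHandleSegment's own comments plus one extension/underscore case.
package main

import "testing"

func TestCleanHandleSegment(t *testing.T) {
	cases := map[string]string{
		"showrss":        "show",       // feed suffix stripped
		"rssworld":       "world",      // feed prefix stripped
		"World_News.xml": "world-news", // extension dropped, lowercased, "_" → "-"
	}
	for in, want := range cases {
		if got := cleanHandleSegment(in); got != want {
			t.Errorf("cleanHandleSegment(%q) = %q, want %q", in, got, want)
		}
	}
}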
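
The profile helpers in publisher.go (FetchFavicon, UploadBlob, UpdateProfile) are meant to compose when a feed account is first set up: fetch the site icon, upload it as a blob, then write the app.bsky.actor.profile record. Below is a minimal sketch of that flow, assuming an existing Publisher and PDSSession from the account-creation path; setupFeedProfile is a hypothetical name, and the avatar is treated as best-effort so a missing favicon never blocks the profile update.

// setupFeedProfile (sketch, same package as publisher.go so it reuses that
// file's imports): initialize a feed account's profile, attaching the site
// favicon as the avatar when one can be fetched and uploaded.
func setupFeedProfile(p *Publisher, session *PDSSession, title, description, siteURL string) error {
	var avatar *BlobRef

	// Best effort: on any favicon or upload failure, publish a text-only profile.
	if data, mimeType, err := FetchFavicon(siteURL); err == nil {
		if blob, err := p.UploadBlob(session, data, mimeType); err == nil {
			avatar = blob
		} else {
			fmt.Printf("avatar upload failed for %s: %v\n", siteURL, err)
		}
	}

	// displayName and description come straight from the parsed feed metadata.
	return p.UpdateProfile(session, title, description, avatar)
}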
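
FetchFavicon infers the MIME type from the URL extension and defaults to image/png when the server omits Content-Type. If that fallback ever matters in practice (e.g., an .ico served without a header), sniffing the bytes is more reliable: the standard library's http.DetectContentType inspects at most the first 512 bytes and always returns some MIME type. A possible refinement, not part of this patch; detectIconMime is a hypothetical helper name.

// detectIconMime (sketch): prefer the server-declared Content-Type, otherwise
// sniff the payload with the standard library instead of guessing from the URL.
func detectIconMime(declared string, data []byte) string {
	if declared != "" && declared != "application/octet-stream" {
		return declared
	}
	// Returns e.g. "image/png" or "image/x-icon" for typical favicon payloads.
	return http.DetectContentType(data)
}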