diff --git a/CLAUDE.md b/CLAUDE.md
index ecdae5e..1b5f630 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -47,10 +47,9 @@ Multi-file Go application that crawls websites for RSS/Atom feeds, stores them i
 ### Concurrent Loops (main.go)

-The application runs seven independent goroutine loops:
+The application runs six independent goroutine loops:

 - **Import loop** - Reads `vertices.txt.gz` and inserts domains into DB in batches of 100 (status='pass')
-- **Domain check loop** - HEAD requests to verify approved domains are reachable
-- **Crawl loop** - Worker pool crawls verified domains for feed discovery
+- **Crawl loop** - Worker pool crawls approved domains for feed discovery
 - **Feed check loop** - Worker pool re-checks known feeds for updates (conditional HTTP)
 - **Stats loop** - Updates cached dashboard statistics every minute
 - **Cleanup loop** - Removes items older than 12 months (weekly)
@@ -78,7 +77,7 @@ The application runs seven independent goroutine loops:
 ### Database Schema

 PostgreSQL with pgx driver, using connection pooling:
-- **domains** - Hosts to crawl (status: hold/pass/skip/fail)
+- **domains** - Hosts to crawl (status: hold/pass/skip)
 - **feeds** - Discovered RSS/Atom feeds with metadata and cache headers (publish_status: hold/pass/skip)
 - **items** - Individual feed entries (guid + feed_url unique)
 - **search_vector** - GENERATED tsvector columns for full-text search (GIN indexed)
@@ -88,11 +87,10 @@ Column naming: snake_case (e.g., `source_host`, `pub_date`, `item_count`)
 ### Crawl Logic

 1. Domains import as `pass` by default (auto-crawled)
-2. Check stage: HEAD request verifies domain is reachable, sets last_checked_at
-3. Crawl stage: Full recursive crawl (HTTPS, fallback HTTP)
-4. Recursive crawl up to MaxDepth=10, MaxPagesPerHost=10
-5. Extract `<link>` and anchor hrefs containing rss/atom/feed
-6. Parse discovered feeds for metadata, save with next_crawl_at
+2. Crawl loop picks up domains where `last_crawled_at IS NULL`
+3. Full recursive crawl (HTTPS, fallback HTTP) up to MaxDepth=10, MaxPagesPerHost=10
+4. Extract `<link>` and anchor hrefs containing rss/atom/feed
+5. Parse discovered feeds for metadata, save with next_crawl_at

 ### Feed Checking

@@ -103,17 +101,15 @@ Uses conditional HTTP (ETag, If-Modified-Since). Adaptive backoff: base 100s + 100s per consecutive no-change
 Feeds with `publish_status = 'pass'` have their items automatically posted to AT Protocol.
 Status values: `hold` (default/pending review), `pass` (approved), `skip` (rejected).

-### Domain Processing (Two-Stage)
-
-1. **Check stage** - HEAD request to verify domain is reachable
-2. **Crawl stage** - Full recursive crawl for feed discovery
+### Domain Processing

 Domain status values:
 - `pass` (default on import) - Domain is crawled and checked automatically
 - `hold` (manual) - Pauses crawling, keeps existing feeds and items
 - `skip` (manual) - Takes down PDS accounts (hides posts), marks feeds inactive, preserves all data
 - `drop` (manual, via button) - Permanently **deletes** all feeds, items, and PDS accounts (requires skip first)
-- `fail` (automatic) - Set when check/crawl fails, keeps existing feeds and items
+
+Note: Errors during check/crawl are recorded in `last_error` but do not change the domain status.
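The scheduling behavior this diff converges on (replacing the old TTL/updatePeriod/avg-frequency heuristics) reduces to one rule plus an auto-hold threshold. A minimal sketch of that rule, based on the constants visible in the feed_check.go and parser.go hunks below; the helper names here are illustrative and are not functions in the codebase:

```go
package main

import (
	"fmt"
	"time"
)

// nextCrawlTime applies the adaptive backoff described above:
// a 100s base plus another 100s for every consecutive check that
// found no change (feed.NoUpdate in the diff below).
func nextCrawlTime(noUpdate int) time.Time {
	return time.Now().Add(time.Duration(100+100*noUpdate) * time.Second)
}

// shouldAutoHold mirrors the auto-hold rule in feed_check.go: after 1000
// consecutive failures/no-changes, a feed whose publish_status is 'pass'
// is moved back to 'hold' for manual re-review.
func shouldAutoHold(noUpdate int, publishStatus string) bool {
	return noUpdate >= 1000 && publishStatus == "pass"
}

func main() {
	fmt.Println(nextCrawlTime(3))            // roughly 400s from now after 3 unchanged checks
	fmt.Println(shouldAutoHold(1000, "pass")) // true: feed gets held for review
}
```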
Skip vs Drop: - `skip` is reversible - use "un-skip" to restore accounts and resume publishing diff --git a/api_domains.go b/api_domains.go index 6ba1ee7..dd5c58a 100644 --- a/api_domains.go +++ b/api_domains.go @@ -50,6 +50,7 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) { status := r.URL.Query().Get("status") hasFeeds := r.URL.Query().Get("has_feeds") == "true" search := r.URL.Query().Get("search") + tldFilter := r.URL.Query().Get("tld") limit := 100 offset := 0 if l := r.URL.Query().Get("limit"); l != "" { @@ -68,7 +69,22 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) { if hasFeeds { // Only domains with feeds searchPattern := "%" + strings.ToLower(search) + "%" - if search != "" { + if tldFilter != "" { + // Filter by specific TLD + rows, err = c.db.Query(` + SELECT d.host, d.tld, d.status, d.last_error, f.feed_count + FROM domains d + INNER JOIN ( + SELECT source_host, COUNT(*) as feed_count + FROM feeds + WHERE item_count > 0 + GROUP BY source_host + ) f ON d.host = f.source_host + WHERE d.status != 'skip' AND d.tld = $1 + ORDER BY d.host ASC + LIMIT $2 OFFSET $3 + `, tldFilter, limit, offset) + } else if search != "" { // Search in domain host or feed title/url rows, err = c.db.Query(` SELECT DISTINCT d.host, d.tld, d.status, d.last_error, f.feed_count @@ -306,7 +322,7 @@ func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) { } rows, err := c.db.Query(` - SELECT url, title, type, status, error_count, last_error, item_count, publish_status, language + SELECT url, title, type, status, last_error, item_count, publish_status, language FROM feeds WHERE source_host = $1 ORDER BY url ASC @@ -323,7 +339,6 @@ func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) { Title string `json:"title"` Type string `json:"type"` Status string `json:"status,omitempty"` - ErrorCount int `json:"error_count,omitempty"` LastError string `json:"last_error,omitempty"` ItemCount int `json:"item_count,omitempty"` PublishStatus string `json:"publish_status,omitempty"` @@ -334,8 +349,8 @@ func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) { for rows.Next() { var f FeedInfo var title, status, lastError, publishStatus, language *string - var errorCount, itemCount *int - if err := rows.Scan(&f.URL, &title, &f.Type, &status, &errorCount, &lastError, &itemCount, &publishStatus, &language); err != nil { + var itemCount *int + if err := rows.Scan(&f.URL, &title, &f.Type, &status, &lastError, &itemCount, &publishStatus, &language); err != nil { continue } f.Title = StringValue(title) @@ -343,9 +358,6 @@ func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) { f.LastError = StringValue(lastError) f.PublishStatus = StringValue(publishStatus) f.Language = StringValue(language) - if errorCount != nil { - f.ErrorCount = *errorCount - } if itemCount != nil { f.ItemCount = *itemCount } @@ -357,7 +369,7 @@ func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) { } // handleAPISetDomainStatus sets the status for a domain -// status must be 'hold', 'pass', 'skip', or 'fail' (use /api/dropDomain for 'drop') +// status must be 'hold', 'pass', or 'skip' (use /api/dropDomain for 'drop') func (c *Crawler) handleAPISetDomainStatus(w http.ResponseWriter, r *http.Request) { host := r.URL.Query().Get("host") status := r.URL.Query().Get("status") @@ -366,8 +378,8 @@ func (c *Crawler) handleAPISetDomainStatus(w http.ResponseWriter, r *http.Reques 
http.Error(w, "host parameter required", http.StatusBadRequest) return } - if status != "hold" && status != "pass" && status != "skip" && status != "fail" { - http.Error(w, "status must be 'hold', 'pass', 'skip', or 'fail' (use /api/dropDomain for permanent deletion)", http.StatusBadRequest) + if status != "hold" && status != "pass" && status != "skip" { + http.Error(w, "status must be 'hold', 'pass', or 'skip' (use /api/dropDomain for permanent deletion)", http.StatusBadRequest) return } @@ -839,9 +851,9 @@ func (c *Crawler) skipDomain(host string) DomainActionResult { } } - // Mark feeds as inactive (but don't delete) + // Mark feeds as skipped (but don't delete) feedsAffected, err := c.db.Exec(` - UPDATE feeds SET status = 'inactive', publish_status = 'skip' + UPDATE feeds SET status = 'skip', publish_status = 'skip' WHERE source_host = $1 `, host) if err != nil { @@ -999,9 +1011,9 @@ func (c *Crawler) restoreDomain(host string) DomainActionResult { } } - // Restore feeds to active status + // Restore feeds to pass status feedsAffected, err := c.db.Exec(` - UPDATE feeds SET status = 'active', publish_status = 'pass' + UPDATE feeds SET status = 'pass', publish_status = 'pass' WHERE source_host = $1 `, host) if err != nil { diff --git a/api_feeds.go b/api_feeds.go index f6f0db2..8b1db85 100644 --- a/api_feeds.go +++ b/api_feeds.go @@ -18,56 +18,48 @@ func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) { } type FeedDetails struct { - URL string `json:"url"` - Type string `json:"type,omitempty"` - Category string `json:"category,omitempty"` - Title string `json:"title,omitempty"` - Description string `json:"description,omitempty"` - Language string `json:"language,omitempty"` - SiteURL string `json:"siteUrl,omitempty"` - DiscoveredAt string `json:"discoveredAt,omitempty"` - LastCrawledAt string `json:"lastCrawledAt,omitempty"` - NextCrawlAt string `json:"nextCrawlAt,omitempty"` - LastBuildDate string `json:"lastBuildDate,omitempty"` - TTLMinutes int `json:"ttlMinutes,omitempty"` - UpdatePeriod string `json:"updatePeriod,omitempty"` - UpdateFreq int `json:"updateFreq,omitempty"` - Status string `json:"status,omitempty"` - ErrorCount int `json:"errorCount,omitempty"` - LastError string `json:"lastError,omitempty"` - ItemCount int `json:"itemCount,omitempty"` - AvgPostFreqHrs float64 `json:"avgPostFreqHrs,omitempty"` - OldestItemDate string `json:"oldestItemDate,omitempty"` - NewestItemDate string `json:"newestItemDate,omitempty"` - PublishStatus string `json:"publishStatus,omitempty"` - PublishAccount string `json:"publishAccount,omitempty"` + URL string `json:"url"` + Type string `json:"type,omitempty"` + Category string `json:"category,omitempty"` + Title string `json:"title,omitempty"` + Description string `json:"description,omitempty"` + Language string `json:"language,omitempty"` + SiteURL string `json:"siteUrl,omitempty"` + DiscoveredAt string `json:"discoveredAt,omitempty"` + LastCrawledAt string `json:"lastCrawledAt,omitempty"` + NextCrawlAt string `json:"nextCrawlAt,omitempty"` + LastBuildDate string `json:"lastBuildDate,omitempty"` + Status string `json:"status,omitempty"` + LastError string `json:"lastError,omitempty"` + ItemCount int `json:"itemCount,omitempty"` + OldestItemDate string `json:"oldestItemDate,omitempty"` + NewestItemDate string `json:"newestItemDate,omitempty"` + PublishStatus string `json:"publishStatus,omitempty"` + PublishAccount string `json:"publishAccount,omitempty"` } var f FeedDetails var category, title, description, language, 
siteUrl *string var lastCrawledAt, nextCrawlAt, lastBuildDate *time.Time - var updatePeriod, status, lastError *string + var status, lastError *string var oldestItemDate, newestItemDate *time.Time - var ttlMinutes, updateFreq, errorCount, itemCount *int - var avgPostFreqHrs *float64 + var itemCount *int var discoveredAt time.Time var publishStatus, publishAccount *string err := c.db.QueryRow(` SELECT url, type, category, title, description, language, site_url, discovered_at, last_crawled_at, next_crawl_at, last_build_date, - ttl_minutes, update_period, update_freq, - status, error_count, last_error, + status, last_error, (SELECT COUNT(*) FROM items WHERE feed_url = feeds.url) as item_count, - avg_post_freq_hrs, oldest_item_date, newest_item_date, + oldest_item_date, newest_item_date, publish_status, publish_account FROM feeds WHERE url = $1 `, feedURL).Scan( &f.URL, &f.Type, &category, &title, &description, &language, &siteUrl, &discoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate, - &ttlMinutes, &updatePeriod, &updateFreq, - &status, &errorCount, &lastError, - &itemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate, + &status, &lastError, + &itemCount, &oldestItemDate, &newestItemDate, &publishStatus, &publishAccount, ) @@ -95,24 +87,11 @@ func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) { if lastBuildDate != nil { f.LastBuildDate = lastBuildDate.Format(time.RFC3339) } - if ttlMinutes != nil { - f.TTLMinutes = *ttlMinutes - } - f.UpdatePeriod = StringValue(updatePeriod) - if updateFreq != nil { - f.UpdateFreq = *updateFreq - } f.Status = StringValue(status) - if errorCount != nil { - f.ErrorCount = *errorCount - } f.LastError = StringValue(lastError) if itemCount != nil { f.ItemCount = *itemCount } - if avgPostFreqHrs != nil { - f.AvgPostFreqHrs = *avgPostFreqHrs - } if oldestItemDate != nil { f.OldestItemDate = oldestItemDate.Format(time.RFC3339) } @@ -175,7 +154,7 @@ func (c *Crawler) handleAPIFeedsByStatus(w http.ResponseWriter, r *http.Request) } rows, err := c.db.Query(` - SELECT url, title, type, source_host, tld, status, error_count, last_error, item_count + SELECT url, title, type, source_host, tld, status, last_error, item_count FROM feeds WHERE status = $1 ORDER BY url ASC @@ -194,7 +173,6 @@ func (c *Crawler) handleAPIFeedsByStatus(w http.ResponseWriter, r *http.Request) SourceHost string `json:"source_host"` TLD string `json:"tld"` Status string `json:"status"` - ErrorCount int `json:"error_count,omitempty"` LastError string `json:"last_error,omitempty"` ItemCount int `json:"item_count,omitempty"` } @@ -203,17 +181,14 @@ func (c *Crawler) handleAPIFeedsByStatus(w http.ResponseWriter, r *http.Request) for rows.Next() { var f FeedInfo var title, sourceHost, tld, lastError *string - var errorCount, itemCount *int - if err := rows.Scan(&f.URL, &title, &f.Type, &sourceHost, &tld, &f.Status, &errorCount, &lastError, &itemCount); err != nil { + var itemCount *int + if err := rows.Scan(&f.URL, &title, &f.Type, &sourceHost, &tld, &f.Status, &lastError, &itemCount); err != nil { continue } f.Title = StringValue(title) f.SourceHost = StringValue(sourceHost) f.TLD = StringValue(tld) f.LastError = StringValue(lastError) - if errorCount != nil { - f.ErrorCount = *errorCount - } if itemCount != nil { f.ItemCount = *itemCount } @@ -243,7 +218,7 @@ func (c *Crawler) handleAPIFeeds(w http.ResponseWriter, r *http.Request) { var err error if publishStatus != "" { rows, err = c.db.Query(` - SELECT url, title, type, source_host, tld, status, error_count, 
last_error, item_count, publish_status, language + SELECT url, title, type, source_host, tld, status, last_error, item_count, publish_status, language FROM feeds WHERE publish_status = $1 ORDER BY url ASC @@ -251,7 +226,7 @@ func (c *Crawler) handleAPIFeeds(w http.ResponseWriter, r *http.Request) { `, publishStatus, limit, offset) } else { rows, err = c.db.Query(` - SELECT url, title, type, source_host, tld, status, error_count, last_error, item_count, publish_status, language + SELECT url, title, type, source_host, tld, status, last_error, item_count, publish_status, language FROM feeds ORDER BY url ASC LIMIT $1 OFFSET $2 @@ -270,7 +245,6 @@ func (c *Crawler) handleAPIFeeds(w http.ResponseWriter, r *http.Request) { SourceHost string `json:"source_host"` TLD string `json:"tld"` Status string `json:"status"` - ErrorCount int `json:"error_count,omitempty"` LastError string `json:"last_error,omitempty"` ItemCount int `json:"item_count,omitempty"` PublishStatus string `json:"publish_status,omitempty"` @@ -281,8 +255,8 @@ func (c *Crawler) handleAPIFeeds(w http.ResponseWriter, r *http.Request) { for rows.Next() { var f FeedInfo var title, sourceHost, tld, lastError, publishStatus, language *string - var errorCount, itemCount *int - if err := rows.Scan(&f.URL, &title, &f.Type, &sourceHost, &tld, &f.Status, &errorCount, &lastError, &itemCount, &publishStatus, &language); err != nil { + var itemCount *int + if err := rows.Scan(&f.URL, &title, &f.Type, &sourceHost, &tld, &f.Status, &lastError, &itemCount, &publishStatus, &language); err != nil { continue } f.Title = StringValue(title) @@ -291,9 +265,6 @@ func (c *Crawler) handleAPIFeeds(w http.ResponseWriter, r *http.Request) { f.LastError = StringValue(lastError) f.PublishStatus = StringValue(publishStatus) f.Language = StringValue(language) - if errorCount != nil { - f.ErrorCount = *errorCount - } if itemCount != nil { f.ItemCount = *itemCount } @@ -308,7 +279,7 @@ func (c *Crawler) filterFeeds(w http.ResponseWriter, tld, domain, status string, var args []interface{} argNum := 1 query := ` - SELECT url, title, type, category, source_host, tld, status, error_count, last_error, item_count, language + SELECT url, title, type, category, source_host, tld, status, last_error, item_count, language FROM feeds WHERE 1=1` @@ -360,7 +331,6 @@ func (c *Crawler) filterFeeds(w http.ResponseWriter, tld, domain, status string, SourceHost string `json:"source_host"` TLD string `json:"tld"` Status string `json:"status"` - ErrorCount int `json:"error_count,omitempty"` LastError string `json:"last_error,omitempty"` ItemCount int `json:"item_count,omitempty"` Language string `json:"language,omitempty"` @@ -370,8 +340,8 @@ func (c *Crawler) filterFeeds(w http.ResponseWriter, tld, domain, status string, for rows.Next() { var f FeedInfo var title, category, sourceHost, tldVal, lastError, language *string - var errorCount, itemCount *int - if err := rows.Scan(&f.URL, &title, &f.Type, &category, &sourceHost, &tldVal, &f.Status, &errorCount, &lastError, &itemCount, &language); err != nil { + var itemCount *int + if err := rows.Scan(&f.URL, &title, &f.Type, &category, &sourceHost, &tldVal, &f.Status, &lastError, &itemCount, &language); err != nil { continue } f.Title = StringValue(title) @@ -383,9 +353,6 @@ func (c *Crawler) filterFeeds(w http.ResponseWriter, tld, domain, status string, f.SourceHost = StringValue(sourceHost) f.TLD = StringValue(tldVal) f.LastError = StringValue(lastError) - if errorCount != nil { - f.ErrorCount = *errorCount - } if itemCount != nil { 
f.ItemCount = *itemCount } diff --git a/api_search.go b/api_search.go index 7c7be69..2dc77c3 100644 --- a/api_search.go +++ b/api_search.go @@ -16,32 +16,27 @@ type SearchResult struct { } type SearchFeed struct { - URL string `json:"url"` - Type string `json:"type"` - Category string `json:"category"` - Title string `json:"title"` - Description string `json:"description"` - Language string `json:"language"` - SiteURL string `json:"site_url"` - DiscoveredAt string `json:"discovered_at"` - LastCrawledAt string `json:"last_crawled_at"` - NextCrawlAt string `json:"next_crawl_at"` - LastBuildDate string `json:"last_build_date"` - TTLMinutes int `json:"ttl_minutes"` - UpdatePeriod string `json:"update_period"` - UpdateFreq int `json:"update_freq"` - Status string `json:"status"` - ErrorCount int `json:"error_count"` - LastError string `json:"last_error"` - LastErrorAt string `json:"last_error_at"` - SourceURL string `json:"source_url"` - SourceHost string `json:"source_host"` - TLD string `json:"tld"` - ItemCount int `json:"item_count"` - AvgPostFreqHrs float64 `json:"avg_post_freq_hrs"` - OldestItemDate string `json:"oldest_item_date"` - NewestItemDate string `json:"newest_item_date"` - NoUpdate bool `json:"no_update"` + URL string `json:"url"` + Type string `json:"type"` + Category string `json:"category"` + Title string `json:"title"` + Description string `json:"description"` + Language string `json:"language"` + SiteURL string `json:"site_url"` + DiscoveredAt string `json:"discovered_at"` + LastCrawledAt string `json:"last_crawled_at"` + NextCrawlAt string `json:"next_crawl_at"` + LastBuildDate string `json:"last_build_date"` + Status string `json:"status"` + LastError string `json:"last_error"` + LastErrorAt string `json:"last_error_at"` + SourceURL string `json:"source_url"` + SourceHost string `json:"source_host"` + TLD string `json:"tld"` + ItemCount int `json:"item_count"` + OldestItemDate string `json:"oldest_item_date"` + NewestItemDate string `json:"newest_item_date"` + NoUpdate bool `json:"no_update"` } type SearchItem struct { @@ -82,20 +77,18 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { var feedType, category, title, description, language, siteUrl *string var discoveredAt time.Time var lastCrawledAt, nextCrawlAt, lastBuildDate *time.Time - var ttlMinutes, updateFreq, errorCount, itemCount *int - var updatePeriod, status, lastError *string + var itemCount *int + var status, lastError *string var lastErrorAt *time.Time var sourceUrl, sourceHost, tld *string - var avgPostFreqHrs *float64 var oldestItemDate, newestItemDate *time.Time var noUpdate *bool if err := rows.Scan(&url, &feedType, &category, &title, &description, &language, &siteUrl, &discoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate, - &ttlMinutes, &updatePeriod, &updateFreq, - &status, &errorCount, &lastError, &lastErrorAt, + &status, &lastError, &lastErrorAt, &sourceUrl, &sourceHost, &tld, - &itemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate, &noUpdate); err != nil { + &itemCount, &oldestItemDate, &newestItemDate, &noUpdate); err != nil { return "", SearchFeed{}, false } cat := StringValue(category) @@ -111,7 +104,6 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { Language: StringValue(language), SiteURL: StringValue(siteUrl), DiscoveredAt: discoveredAt.Format(time.RFC3339), - UpdatePeriod: StringValue(updatePeriod), Status: StringValue(status), LastError: StringValue(lastError), SourceURL: StringValue(sourceUrl), @@ -127,24 +119,12 @@ func 
(c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { if lastBuildDate != nil { sf.LastBuildDate = lastBuildDate.Format(time.RFC3339) } - if ttlMinutes != nil { - sf.TTLMinutes = *ttlMinutes - } - if updateFreq != nil { - sf.UpdateFreq = *updateFreq - } - if errorCount != nil { - sf.ErrorCount = *errorCount - } if lastErrorAt != nil { sf.LastErrorAt = lastErrorAt.Format(time.RFC3339) } if itemCount != nil { sf.ItemCount = *itemCount } - if avgPostFreqHrs != nil { - sf.AvgPostFreqHrs = *avgPostFreqHrs - } if oldestItemDate != nil { sf.OldestItemDate = oldestItemDate.Format(time.RFC3339) } @@ -161,10 +141,9 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { hostRows, err := c.db.Query(` SELECT url, type, category, title, description, language, site_url, discovered_at, last_crawled_at, next_crawl_at, last_build_date, - ttl_minutes, update_period, update_freq, - status, error_count, last_error, last_error_at, + status, last_error, last_error_at, source_url, source_host, tld, - item_count, avg_post_freq_hrs, oldest_item_date, newest_item_date, no_update + item_count, oldest_item_date, newest_item_date, no_update FROM feeds WHERE source_host ILIKE $1 OR url ILIKE $1 LIMIT $2 @@ -185,10 +164,9 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { feedRows, err := c.db.Query(` SELECT url, type, category, title, description, language, site_url, discovered_at, last_crawled_at, next_crawl_at, last_build_date, - ttl_minutes, update_period, update_freq, - status, error_count, last_error, last_error_at, + status, last_error, last_error_at, source_url, source_host, tld, - item_count, avg_post_freq_hrs, oldest_item_date, newest_item_date, no_update + item_count, oldest_item_date, newest_item_date, no_update FROM feeds WHERE search_vector @@ to_tsquery('english', $1) LIMIT $2 @@ -251,28 +229,25 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { var fType, fCategory, fTitle, fDesc, fLang, fSiteUrl *string var fDiscoveredAt time.Time var fLastCrawledAt, fNextCrawlAt, fLastBuildDate *time.Time - var fTTLMinutes, fUpdateFreq, fErrorCount, fItemCount *int - var fUpdatePeriod, fStatus, fLastError *string + var fItemCount *int + var fStatus, fLastError *string var fLastErrorAt *time.Time var fSourceUrl, fSourceHost, fTLD *string - var fAvgPostFreqHrs *float64 var fOldestItemDate, fNewestItemDate *time.Time var fNoUpdate *bool c.db.QueryRow(` SELECT type, category, title, description, language, site_url, discovered_at, last_crawled_at, next_crawl_at, last_build_date, - ttl_minutes, update_period, update_freq, - status, error_count, last_error, last_error_at, + status, last_error, last_error_at, source_url, source_host, tld, - item_count, avg_post_freq_hrs, oldest_item_date, newest_item_date, no_update + item_count, oldest_item_date, newest_item_date, no_update FROM feeds WHERE url = $1 `, feedUrl).Scan(&fType, &fCategory, &fTitle, &fDesc, &fLang, &fSiteUrl, &fDiscoveredAt, &fLastCrawledAt, &fNextCrawlAt, &fLastBuildDate, - &fTTLMinutes, &fUpdatePeriod, &fUpdateFreq, - &fStatus, &fErrorCount, &fLastError, &fLastErrorAt, + &fStatus, &fLastError, &fLastErrorAt, &fSourceUrl, &fSourceHost, &fTLD, - &fItemCount, &fAvgPostFreqHrs, &fOldestItemDate, &fNewestItemDate, &fNoUpdate) + &fItemCount, &fOldestItemDate, &fNewestItemDate, &fNoUpdate) fCat := StringValue(fCategory) if fCat == "" { @@ -287,7 +262,6 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { Language: StringValue(fLang), SiteURL: 
StringValue(fSiteUrl), DiscoveredAt: fDiscoveredAt.Format(time.RFC3339), - UpdatePeriod: StringValue(fUpdatePeriod), Status: StringValue(fStatus), LastError: StringValue(fLastError), SourceURL: StringValue(fSourceUrl), @@ -303,24 +277,12 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { if fLastBuildDate != nil { sf.LastBuildDate = fLastBuildDate.Format(time.RFC3339) } - if fTTLMinutes != nil { - sf.TTLMinutes = *fTTLMinutes - } - if fUpdateFreq != nil { - sf.UpdateFreq = *fUpdateFreq - } - if fErrorCount != nil { - sf.ErrorCount = *fErrorCount - } if fLastErrorAt != nil { sf.LastErrorAt = fLastErrorAt.Format(time.RFC3339) } if fItemCount != nil { sf.ItemCount = *fItemCount } - if fAvgPostFreqHrs != nil { - sf.AvgPostFreqHrs = *fAvgPostFreqHrs - } if fOldestItemDate != nil { sf.OldestItemDate = fOldestItemDate.Format(time.RFC3339) } diff --git a/crawler.go b/crawler.go index f15ec1e..9cbf157 100644 --- a/crawler.go +++ b/crawler.go @@ -293,7 +293,7 @@ func (c *Crawler) getAccountForFeed(feedURL string) string { var account *string err := c.db.QueryRow(` SELECT publish_account FROM feeds - WHERE url = $1 AND publish_status = 'pass' AND status = 'active' + WHERE url = $1 AND publish_status = 'pass' AND status = 'pass' `, feedURL).Scan(&account) if err != nil || account == nil || *account == "" { // Derive handle from feed URL @@ -406,7 +406,7 @@ func (c *Crawler) GetAllUnpublishedItems(limit int) ([]Item, error) { FROM items i JOIN feeds f ON i.feed_url = f.url WHERE f.publish_status = 'pass' - AND f.status = 'active' + AND f.status = 'pass' AND i.published_at IS NULL ORDER BY i.discovered_at ASC LIMIT $1 @@ -467,84 +467,7 @@ func (c *Crawler) GetAllUnpublishedItems(limit int) ([]Item, error) { return items, nil } -// StartDomainCheckLoop runs HEAD requests on approved domains to verify they're reachable -func (c *Crawler) StartDomainCheckLoop() { - numWorkers := 100 - - // Buffered channel for domain work - workChan := make(chan *Domain, 100) - - // Start workers - for i := 0; i < numWorkers; i++ { - go func() { - for domain := range workChan { - // Do HEAD request to verify domain is reachable - checkErr := c.checkDomain(domain.Host) - errStr := "" - if checkErr != nil { - errStr = checkErr.Error() - } - if err := c.markDomainChecked(domain.Host, errStr); err != nil { - fmt.Printf("Error marking domain %s as checked: %v\n", domain.Host, err) - } - } - }() - } - - const fetchSize = 100 - for { - domains, err := c.GetDomainsToCheck(fetchSize) - if err != nil { - fmt.Printf("Error fetching domains to check: %v\n", err) - } - - if len(domains) == 0 { - time.Sleep(1 * time.Second) - continue - } - - fmt.Printf("%s domain-check: %d domains to verify\n", time.Now().Format("15:04:05"), len(domains)) - - for _, domain := range domains { - workChan <- domain - } - - time.Sleep(1 * time.Second) - } -} - -// checkDomain performs a HEAD request to verify a domain is reachable -func (c *Crawler) checkDomain(host string) error { - url := "https://" + host - req, err := http.NewRequest("HEAD", url, nil) - if err != nil { - return err - } - req.Header.Set("User-Agent", c.UserAgent) - - resp, err := c.client.Do(req) - if err != nil { - // Try HTTP fallback - url = "http://" + host - req, err = http.NewRequest("HEAD", url, nil) - if err != nil { - return err - } - req.Header.Set("User-Agent", c.UserAgent) - resp, err = c.client.Do(req) - if err != nil { - return err - } - } - defer resp.Body.Close() - - if resp.StatusCode >= 400 { - return fmt.Errorf("HTTP %d", resp.StatusCode) - 
} - return nil -} - -// StartCrawlLoop runs the domain crawling loop independently (crawls checked domains) +// StartCrawlLoop runs the domain crawling loop independently func (c *Crawler) StartCrawlLoop() { numWorkers := 100 diff --git a/dashboard.go b/dashboard.go index 3d88d0f..4cda8f2 100644 --- a/dashboard.go +++ b/dashboard.go @@ -12,7 +12,6 @@ type DashboardStats struct { HoldDomains int `json:"hold_domains"` PassDomains int `json:"pass_domains"` SkipDomains int `json:"skip_domains"` - FailDomains int `json:"fail_domains"` // Feed stats TotalFeeds int `json:"total_feeds"` @@ -187,8 +186,6 @@ func (c *Crawler) collectDomainStats(stats *DashboardStats) error { stats.PassDomains = count case "skip": stats.SkipDomains = count - case "fail": - stats.FailDomains = count } } if err := rows.Err(); err != nil { diff --git a/db.go b/db.go index d6725fd..c816f80 100644 --- a/db.go +++ b/db.go @@ -27,8 +27,7 @@ CREATE TABLE IF NOT EXISTS domains ( CREATE INDEX IF NOT EXISTS idx_domains_status ON domains(status); CREATE INDEX IF NOT EXISTS idx_domains_tld ON domains(tld); CREATE INDEX IF NOT EXISTS idx_domains_feeds_found ON domains(feeds_found DESC) WHERE feeds_found > 0; -CREATE INDEX IF NOT EXISTS idx_domains_to_check ON domains(status) WHERE last_checked_at IS NULL; -CREATE INDEX IF NOT EXISTS idx_domains_to_crawl ON domains(status) WHERE last_checked_at IS NOT NULL AND last_crawled_at IS NULL; +CREATE INDEX IF NOT EXISTS idx_domains_to_crawl ON domains(status) WHERE last_crawled_at IS NULL; CREATE TABLE IF NOT EXISTS feeds ( url TEXT PRIMARY KEY, @@ -47,12 +46,7 @@ CREATE TABLE IF NOT EXISTS feeds ( etag TEXT, last_modified TEXT, - ttl_minutes INTEGER, - update_period TEXT, - update_freq INTEGER, - - status TEXT DEFAULT 'active', - error_count INTEGER DEFAULT 0, + status TEXT DEFAULT 'pass' CHECK(status IN ('hold', 'pass', 'skip')), last_error TEXT, last_error_at TIMESTAMPTZ, @@ -61,7 +55,6 @@ CREATE TABLE IF NOT EXISTS feeds ( tld TEXT, item_count INTEGER, - avg_post_freq_hrs DOUBLE PRECISION, oldest_item_date TIMESTAMPTZ, newest_item_date TIMESTAMPTZ, @@ -90,6 +83,7 @@ CREATE INDEX IF NOT EXISTS idx_feeds_status ON feeds(status); CREATE INDEX IF NOT EXISTS idx_feeds_discovered_at ON feeds(discovered_at); CREATE INDEX IF NOT EXISTS idx_feeds_title ON feeds(title); CREATE INDEX IF NOT EXISTS idx_feeds_search ON feeds USING GIN(search_vector); +CREATE INDEX IF NOT EXISTS idx_feeds_due_check ON feeds(next_crawl_at, no_update DESC) WHERE status = 'pass'; CREATE TABLE IF NOT EXISTS items ( id BIGSERIAL PRIMARY KEY, diff --git a/domain.go b/domain.go index 0f96fc7..2eba749 100644 --- a/domain.go +++ b/domain.go @@ -15,7 +15,7 @@ import ( ) // Domain represents a host to be crawled for feeds -// Status: hold (pending review), pass (approved), skip (not processing), fail (error) +// Status: hold (pending review), pass (approved), skip (not processing) type Domain struct { Host string `json:"host"` Status string `json:"status"` @@ -123,28 +123,12 @@ func (c *Crawler) getDomain(host string) (*Domain, error) { return domain, nil } -// GetDomainsToCheck returns domains ready for checking (status='pass', never checked) -func (c *Crawler) GetDomainsToCheck(limit int) ([]*Domain, error) { - rows, err := c.db.Query(` - SELECT host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld - FROM domains WHERE status = 'pass' AND last_checked_at IS NULL - ORDER BY discovered_at ASC - LIMIT $1 - `, limit) - if err != nil { - return nil, err - } - defer rows.Close() - - return 
c.scanDomains(rows) -} - -// GetDomainsToCrawl returns domains ready for crawling (status='pass', checked but not crawled) +// GetDomainsToCrawl returns domains ready for crawling (status='pass', not yet crawled) func (c *Crawler) GetDomainsToCrawl(limit int) ([]*Domain, error) { rows, err := c.db.Query(` SELECT host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld - FROM domains WHERE status = 'pass' AND last_checked_at IS NOT NULL AND last_crawled_at IS NULL - ORDER BY discovered_at ASC + FROM domains WHERE status = 'pass' AND last_crawled_at IS NULL + ORDER BY discovered_at DESC LIMIT $1 `, limit) if err != nil { @@ -180,29 +164,12 @@ func (c *Crawler) scanDomains(rows pgx.Rows) ([]*Domain, error) { return domains, rows.Err() } -// markDomainChecked updates a domain after the check (HEAD request) stage -func (c *Crawler) markDomainChecked(host string, lastError string) error { - now := time.Now() - if lastError != "" { - _, err := c.db.Exec(` - UPDATE domains SET status = 'fail', last_checked_at = $1, last_error = $2 - WHERE host = $3 - `, now, lastError, normalizeHost(host)) - return err - } - _, err := c.db.Exec(` - UPDATE domains SET last_checked_at = $1, last_error = NULL - WHERE host = $2 - `, now, normalizeHost(host)) - return err -} - // markDomainCrawled updates a domain after the crawl stage func (c *Crawler) markDomainCrawled(host string, feedsFound int, lastError string) error { now := time.Now() if lastError != "" { _, err := c.db.Exec(` - UPDATE domains SET status = 'fail', last_crawled_at = $1, feeds_found = $2, last_error = $3 + UPDATE domains SET last_crawled_at = $1, feeds_found = $2, last_error = $3 WHERE host = $4 `, now, feedsFound, lastError, normalizeHost(host)) return err diff --git a/feed.go b/feed.go index eed7b6f..83cadd6 100644 --- a/feed.go +++ b/feed.go @@ -109,14 +109,8 @@ type Feed struct { ETag string `json:"etag,omitempty"` LastModified string `json:"last_modified,omitempty"` - // Feed hints for crawl scheduling - TTLMinutes int `json:"ttl_minutes,omitempty"` // From RSS element - UpdatePeriod string `json:"update_period,omitempty"` // From sy:updatePeriod (hourly, daily, weekly, monthly, yearly) - UpdateFreq int `json:"update_freq,omitempty"` // From sy:updateFrequency - // Health tracking - Status string `json:"status"` // "active", "dead", "redirect", "error" - ErrorCount int `json:"error_count"` + Status string `json:"status"` // "pass", "hold", "skip" LastError string `json:"last_error,omitempty"` LastErrorAt time.Time `json:"last_error_at,omitempty"` @@ -126,8 +120,7 @@ type Feed struct { TLD string `json:"tld,omitempty"` // Content stats - ItemCount int `json:"item_count,omitempty"` // Number of items in last crawl - AvgPostFreqHrs float64 `json:"avg_post_freq_hrs,omitempty"` // Average hours between posts + ItemCount int `json:"item_count,omitempty"` // Number of items in last crawl OldestItemDate time.Time `json:"oldest_item_date,omitempty"` NewestItemDate time.Time `json:"newest_item_date,omitempty"` @@ -162,13 +155,12 @@ func (c *Crawler) saveFeed(feed *Feed) error { url, type, category, title, description, language, site_url, discovered_at, last_crawled_at, next_crawl_at, last_build_date, etag, last_modified, - ttl_minutes, update_period, update_freq, - status, error_count, last_error, last_error_at, + status, last_error, last_error_at, source_url, source_host, tld, - item_count, avg_post_freq_hrs, oldest_item_date, newest_item_date, + item_count, oldest_item_date, newest_item_date, no_update, 
publish_status, publish_account - ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30) + ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25) ON CONFLICT(url) DO UPDATE SET type = EXCLUDED.type, category = EXCLUDED.category, @@ -181,15 +173,10 @@ func (c *Crawler) saveFeed(feed *Feed) error { last_build_date = EXCLUDED.last_build_date, etag = EXCLUDED.etag, last_modified = EXCLUDED.last_modified, - ttl_minutes = EXCLUDED.ttl_minutes, - update_period = EXCLUDED.update_period, - update_freq = EXCLUDED.update_freq, status = EXCLUDED.status, - error_count = EXCLUDED.error_count, last_error = EXCLUDED.last_error, last_error_at = EXCLUDED.last_error_at, item_count = EXCLUDED.item_count, - avg_post_freq_hrs = EXCLUDED.avg_post_freq_hrs, oldest_item_date = EXCLUDED.oldest_item_date, newest_item_date = EXCLUDED.newest_item_date, no_update = EXCLUDED.no_update, @@ -200,10 +187,9 @@ func (c *Crawler) saveFeed(feed *Feed) error { NullableString(feed.Language), NullableString(feed.SiteURL), feed.DiscoveredAt, NullableTime(feed.LastCrawledAt), NullableTime(feed.NextCrawlAt), NullableTime(feed.LastBuildDate), NullableString(feed.ETag), NullableString(feed.LastModified), - feed.TTLMinutes, NullableString(feed.UpdatePeriod), feed.UpdateFreq, - feed.Status, feed.ErrorCount, NullableString(feed.LastError), NullableTime(feed.LastErrorAt), + feed.Status, NullableString(feed.LastError), NullableTime(feed.LastErrorAt), NullableString(feed.SourceURL), NullableString(feed.SourceHost), NullableString(feed.TLD), - feed.ItemCount, feed.AvgPostFreqHrs, NullableTime(feed.OldestItemDate), NullableTime(feed.NewestItemDate), + feed.ItemCount, NullableTime(feed.OldestItemDate), NullableTime(feed.NewestItemDate), feed.NoUpdate, publishStatus, NullableString(feed.PublishAccount), ) @@ -215,19 +201,17 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) { feed := &Feed{} var category, title, description, language, siteURL *string var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time - var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld *string - var avgPostFreqHrs *float64 + var etag, lastModified, lastError, sourceURL, sourceHost, tld *string var publishStatus, publishAccount *string - var ttlMinutes, updateFreq, errorCount, itemCount, noUpdate *int + var itemCount, noUpdate *int err := c.db.QueryRow(` SELECT url, type, category, title, description, language, site_url, discovered_at, last_crawled_at, next_crawl_at, last_build_date, etag, last_modified, - ttl_minutes, update_period, update_freq, - status, error_count, last_error, last_error_at, + status, last_error, last_error_at, source_url, source_host, tld, - item_count, avg_post_freq_hrs, oldest_item_date, newest_item_date, + item_count, oldest_item_date, newest_item_date, no_update, publish_status, publish_account FROM feeds WHERE url = $1 @@ -235,10 +219,9 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) { &feed.URL, &feed.Type, &category, &title, &description, &language, &siteURL, &feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate, &etag, &lastModified, - &ttlMinutes, &updatePeriod, &updateFreq, - &feed.Status, &errorCount, &lastError, &lastErrorAt, + &feed.Status, &lastError, &lastErrorAt, &sourceURL, &sourceHost, &tld, - &itemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate, + &itemCount, 
&oldestItemDate, &newestItemDate, &noUpdate, &publishStatus, &publishAccount, ) @@ -265,16 +248,6 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) { feed.LastBuildDate = TimeValue(lastBuildDate) feed.ETag = StringValue(etag) feed.LastModified = StringValue(lastModified) - if ttlMinutes != nil { - feed.TTLMinutes = *ttlMinutes - } - feed.UpdatePeriod = StringValue(updatePeriod) - if updateFreq != nil { - feed.UpdateFreq = *updateFreq - } - if errorCount != nil { - feed.ErrorCount = *errorCount - } feed.LastError = StringValue(lastError) feed.LastErrorAt = TimeValue(lastErrorAt) feed.SourceURL = StringValue(sourceURL) @@ -283,9 +256,6 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) { if itemCount != nil { feed.ItemCount = *itemCount } - if avgPostFreqHrs != nil { - feed.AvgPostFreqHrs = *avgPostFreqHrs - } feed.OldestItemDate = TimeValue(oldestItemDate) feed.NewestItemDate = TimeValue(newestItemDate) if noUpdate != nil { @@ -314,10 +284,9 @@ func (c *Crawler) GetAllFeeds() ([]*Feed, error) { SELECT url, type, category, title, description, language, site_url, discovered_at, last_crawled_at, next_crawl_at, last_build_date, etag, last_modified, - ttl_minutes, update_period, update_freq, - status, error_count, last_error, last_error_at, + status, last_error, last_error_at, source_url, source_host, tld, - item_count, avg_post_freq_hrs, oldest_item_date, newest_item_date, + item_count, oldest_item_date, newest_item_date, no_update, publish_status, publish_account FROM feeds @@ -344,21 +313,20 @@ func (c *Crawler) GetFeedCountByHost(host string) (int, error) { return count, err } -// GetFeedsDueForCheck returns feeds where next_crawl_at <= now, ordered randomly, limited to n +// GetFeedsDueForCheck returns feeds where next_crawl_at <= now, ordered by no_update desc (prioritize infrequent feeds) func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) { rows, err := c.db.Query(` SELECT url, type, category, title, description, language, site_url, discovered_at, last_crawled_at, next_crawl_at, last_build_date, etag, last_modified, - ttl_minutes, update_period, update_freq, - status, error_count, last_error, last_error_at, + status, last_error, last_error_at, source_url, source_host, tld, - item_count, avg_post_freq_hrs, oldest_item_date, newest_item_date, + item_count, oldest_item_date, newest_item_date, no_update, publish_status, publish_account FROM feeds - WHERE next_crawl_at <= NOW() AND status != 'dead' - ORDER BY RANDOM() + WHERE next_crawl_at <= NOW() AND status = 'pass' + ORDER BY no_update DESC LIMIT $1 `, limit) if err != nil { @@ -375,10 +343,9 @@ func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) { SELECT url, type, category, title, description, language, site_url, discovered_at, last_crawled_at, next_crawl_at, last_build_date, etag, last_modified, - ttl_minutes, update_period, update_freq, - status, error_count, last_error, last_error_at, + status, last_error, last_error_at, source_url, source_host, tld, - item_count, avg_post_freq_hrs, oldest_item_date, newest_item_date, + item_count, oldest_item_date, newest_item_date, no_update, publish_status, publish_account FROM feeds WHERE source_host = $1 @@ -398,10 +365,9 @@ func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) { SELECT url, type, category, title, description, language, site_url, discovered_at, last_crawled_at, next_crawl_at, last_build_date, etag, last_modified, - ttl_minutes, update_period, update_freq, - status, error_count, last_error, last_error_at, + status, last_error, 
last_error_at, source_url, source_host, tld, - item_count, avg_post_freq_hrs, oldest_item_date, newest_item_date, + item_count, oldest_item_date, newest_item_date, no_update, publish_status, publish_account FROM feeds @@ -424,9 +390,8 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) { feed := &Feed{} var feedType, category, title, description, language, siteURL *string var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time - var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld *string - var ttlMinutes, updateFreq, errorCount, itemCount, noUpdate *int - var avgPostFreqHrs *float64 + var etag, lastModified, lastError, sourceURL, sourceHost, tld *string + var itemCount, noUpdate *int var status *string var publishStatus, publishAccount *string @@ -434,10 +399,9 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) { &feed.URL, &feedType, &category, &title, &description, &language, &siteURL, &feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate, &etag, &lastModified, - &ttlMinutes, &updatePeriod, &updateFreq, - &status, &errorCount, &lastError, &lastErrorAt, + &status, &lastError, &lastErrorAt, &sourceURL, &sourceHost, &tld, - &itemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate, + &itemCount, &oldestItemDate, &newestItemDate, &noUpdate, &publishStatus, &publishAccount, ); err != nil { @@ -460,17 +424,7 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) { feed.LastBuildDate = TimeValue(lastBuildDate) feed.ETag = StringValue(etag) feed.LastModified = StringValue(lastModified) - if ttlMinutes != nil { - feed.TTLMinutes = *ttlMinutes - } - feed.UpdatePeriod = StringValue(updatePeriod) - if updateFreq != nil { - feed.UpdateFreq = *updateFreq - } feed.Status = StringValue(status) - if errorCount != nil { - feed.ErrorCount = *errorCount - } feed.LastError = StringValue(lastError) feed.LastErrorAt = TimeValue(lastErrorAt) feed.SourceURL = StringValue(sourceURL) @@ -479,9 +433,6 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) { if itemCount != nil { feed.ItemCount = *itemCount } - if avgPostFreqHrs != nil { - feed.AvgPostFreqHrs = *avgPostFreqHrs - } feed.OldestItemDate = TimeValue(oldestItemDate) feed.NewestItemDate = TimeValue(newestItemDate) if noUpdate != nil { @@ -522,10 +473,9 @@ func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) { SELECT url, type, category, title, description, language, site_url, discovered_at, last_crawled_at, next_crawl_at, last_build_date, etag, last_modified, - ttl_minutes, update_period, update_freq, - status, error_count, last_error, last_error_at, + status, last_error, last_error_at, source_url, source_host, tld, - item_count, avg_post_freq_hrs, oldest_item_date, newest_item_date, + item_count, oldest_item_date, newest_item_date, no_update, publish_status, publish_account FROM feeds @@ -545,14 +495,13 @@ func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) { SELECT url, type, category, title, description, language, site_url, discovered_at, last_crawled_at, next_crawl_at, last_build_date, etag, last_modified, - ttl_minutes, update_period, update_freq, - status, error_count, last_error, last_error_at, + status, last_error, last_error_at, source_url, source_host, tld, - item_count, avg_post_freq_hrs, oldest_item_date, newest_item_date, + item_count, oldest_item_date, newest_item_date, no_update, publish_status, publish_account FROM feeds - WHERE publish_status = 'hold' AND item_count > 0 AND status = 'active' + WHERE publish_status = 'hold' AND 
item_count > 0 AND status = 'pass' ORDER BY item_count DESC LIMIT $1 `, limit) diff --git a/feed_check.go b/feed_check.go index 18093cd..92232fa 100644 --- a/feed_check.go +++ b/feed_check.go @@ -32,7 +32,7 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea Category: classifyFeed(feedURL), DiscoveredAt: now, LastCrawledAt: now, - Status: "active", + Status: "pass", SourceHost: sourceHost, TLD: getTLD(sourceHost), ETag: headers.Get("ETag"), @@ -88,7 +88,7 @@ func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) { Type: feedType, Category: classifyFeed(feedURL), DiscoveredAt: now, - Status: "active", + Status: "pass", SourceURL: normalizeURL(sourceURL), SourceHost: sourceHost, TLD: getTLD(sourceHost), @@ -149,16 +149,15 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) { } now := time.Now() feed.LastCrawledAt = now - feed.ErrorCount++ feed.NoUpdate++ feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second) feed.LastError = err.Error() feed.LastErrorAt = now - feed.Status = "error" - // Auto-hold feeds that fail 100+ times - if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" { + feed.Status = "hold" + // Auto-hold feeds after 1000 consecutive failures/no-changes + if feed.NoUpdate >= 1000 && feed.PublishStatus == "pass" { feed.PublishStatus = "hold" - fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL) + fmt.Printf("Feed auto-held after %d no-updates: %s\n", feed.NoUpdate, feed.URL) } c.saveFeed(feed) return false, err @@ -173,29 +172,28 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) { feed.NoUpdate++ // Adaptive backoff: 100s base + 100s per consecutive no-change feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second) - feed.ErrorCount = 0 feed.LastError = "" - feed.Status = "active" + feed.Status = "pass" + // Auto-hold feeds after 1000 consecutive no-changes + if feed.NoUpdate >= 1000 && feed.PublishStatus == "pass" { + feed.PublishStatus = "hold" + fmt.Printf("Feed auto-held after %d no-updates: %s\n", feed.NoUpdate, feed.URL) + } c.saveFeed(feed) return false, nil } // Non-200 response if resp.StatusCode != http.StatusOK { - feed.ErrorCount++ feed.NoUpdate++ feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second) feed.LastError = resp.Status feed.LastErrorAt = now - if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusGone { - feed.Status = "dead" - } else { - feed.Status = "error" - } - // Auto-hold feeds that fail 100+ times - if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" { + feed.Status = "hold" + // Auto-hold feeds after 1000 consecutive failures/no-changes + if feed.NoUpdate >= 1000 && feed.PublishStatus == "pass" { feed.PublishStatus = "hold" - fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL) + fmt.Printf("Feed auto-held after %d no-updates: %s\n", feed.NoUpdate, feed.URL) } c.saveFeed(feed) return false, nil @@ -204,16 +202,15 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) { // 200 OK - feed has new content bodyBytes, err := io.ReadAll(resp.Body) if err != nil { - feed.ErrorCount++ feed.NoUpdate++ feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second) feed.LastError = err.Error() feed.LastErrorAt = now - feed.Status = "error" - // Auto-hold feeds that fail 100+ times - if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" { + feed.Status = "hold" + // Auto-hold feeds after 1000 consecutive failures/no-changes + 
if feed.NoUpdate >= 1000 && feed.PublishStatus == "pass" { feed.PublishStatus = "hold" - fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL) + fmt.Printf("Feed auto-held after %d no-updates: %s\n", feed.NoUpdate, feed.URL) } c.saveFeed(feed) return false, err @@ -242,9 +239,8 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) { // Content changed - reset backoff feed.NoUpdate = 0 feed.NextCrawlAt = now.Add(100 * time.Second) - feed.ErrorCount = 0 feed.LastError = "" - feed.Status = "active" + feed.Status = "pass" c.saveFeed(feed) // Save items diff --git a/main.go b/main.go index 7bfe73d..7cb240b 100644 --- a/main.go +++ b/main.go @@ -30,7 +30,7 @@ func main() { go crawler.UpdateStats() // Start all loops independently - fmt.Println("Starting import, crawl, check, and stats loops...") + fmt.Println("Starting import, crawl, and stats loops...") // Import loop (background) - imports .com domains from vertices.txt.gz go crawler.ImportDomainsInBackground("vertices.txt.gz") @@ -56,10 +56,7 @@ func main() { // Publish loop (background) - autopublishes items for approved feeds go crawler.StartPublishLoop() - // Domain check loop (background) - verifies approved domains are reachable - go crawler.StartDomainCheckLoop() - - // Crawl loop (background) - crawls checked domains for feeds + // Crawl loop (background) - crawls approved domains for feeds go crawler.StartCrawlLoop() // Wait for shutdown signal diff --git a/parser.go b/parser.go index 5424dcc..02b2805 100644 --- a/parser.go +++ b/parser.go @@ -163,9 +163,6 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item { feed.Description = ch.Description feed.Language = ch.Language feed.SiteURL = normalizeURL(ch.Link) - feed.TTLMinutes = ch.TTL - feed.UpdatePeriod = ch.UpdatePeriod - feed.UpdateFreq = ch.UpdateFreq feed.ItemCount = len(ch.Items) // Detect podcast @@ -251,10 +248,6 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item { feed.OldestItemDate = oldest feed.NewestItemDate = newest - if len(dates) > 1 { - totalHours := newest.Sub(oldest).Hours() - feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1) - } } return items @@ -367,10 +360,6 @@ func (c *Crawler) parseAtomMetadata(body string, feed *Feed) []*Item { feed.OldestItemDate = oldest feed.NewestItemDate = newest - if len(dates) > 1 { - totalHours := newest.Sub(oldest).Hours() - feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1) - } } return items @@ -399,48 +388,8 @@ func parseRSSDate(s string) (time.Time, error) { // calculateNextCrawl determines when to next crawl this feed func (c *Crawler) calculateNextCrawl(feed *Feed) time.Time { - now := time.Now() - - // If TTL is specified, use it - if feed.TTLMinutes > 0 { - return now.Add(time.Duration(feed.TTLMinutes) * time.Minute) - } - - // If updatePeriod is specified - if feed.UpdatePeriod != "" { - freq := feed.UpdateFreq - if freq == 0 { - freq = 1 - } - switch strings.ToLower(feed.UpdatePeriod) { - case "hourly": - return now.Add(time.Duration(freq) * time.Hour) - case "daily": - return now.Add(time.Duration(freq) * 24 * time.Hour) - case "weekly": - return now.Add(time.Duration(freq) * 7 * 24 * time.Hour) - case "monthly": - return now.Add(time.Duration(freq) * 30 * 24 * time.Hour) - case "yearly": - return now.Add(time.Duration(freq) * 365 * 24 * time.Hour) - } - } - - // If we have average post frequency, use that - if feed.AvgPostFreqHrs > 0 { - // Crawl at half the average frequency, but at least every hour and at most once per day - crawlInterval := 
feed.AvgPostFreqHrs / 2 - if crawlInterval < 1 { - crawlInterval = 1 - } - if crawlInterval > 24 { - crawlInterval = 24 - } - return now.Add(time.Duration(crawlInterval * float64(time.Hour))) - } - - // Default: crawl every 6 hours - return now.Add(6 * time.Hour) + // Adaptive backoff: 100s base + 100s per consecutive no-change + return time.Now().Add(time.Duration(100+100*feed.NoUpdate) * time.Second) } // extractItemImages extracts image URLs from an RSS item @@ -661,10 +610,6 @@ func (c *Crawler) parseJSONFeedMetadata(body string, feed *Feed) []*Item { feed.OldestItemDate = oldest feed.NewestItemDate = newest - if len(dates) > 1 { - totalHours := newest.Sub(oldest).Hours() - feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1) - } } return items diff --git a/static/dashboard.js b/static/dashboard.js index 730b99d..88ad78c 100644 --- a/static/dashboard.js +++ b/static/dashboard.js @@ -91,16 +91,13 @@ function initDashboard() { ['Language', f.language], ['Site URL', f.siteUrl], ['Status', f.status], - ['Error Count', f.errorCount], ['Last Error', f.lastError], ['Item Count', f.itemCount], - ['Avg Post Freq', f.avgPostFreqHrs ? f.avgPostFreqHrs.toFixed(1) + ' hrs' : null], ['Oldest Item', f.oldestItemDate], ['Newest Item', f.newestItemDate], ['Discovered', f.discoveredAt], ['Last Crawled', f.lastCrawledAt], ['Next Crawl', f.nextCrawlAt], - ['TTL', f.ttlMinutes ? f.ttlMinutes + ' min' : null], ['Publish Status', f.publishStatus], ['Publish Account', f.publishAccount], ]; @@ -132,8 +129,8 @@ function initDashboard() { let html = ''; items.forEach(item => { const date = item.pub_date ? new Date(item.pub_date).toLocaleDateString() : ''; - html += `
`; - html += `${escapeHtml(date)}`; + html += `
`; + html += `
${escapeHtml(date)} 
`; if (item.link) { html += `${escapeHtml(item.title || item.link)}`; } else { @@ -151,44 +148,88 @@ function initDashboard() { const statusConfig = { hold: { color: '#f90', bg: '#330', border: '#550' }, skip: { color: '#f66', bg: '#400', border: '#600' }, - pass: { color: '#0f0', bg: '#040', border: '#060' }, - fail: { color: '#f00', bg: '#400', border: '#600' } + pass: { color: '#0f0', bg: '#040', border: '#060' } }; // Render status buttons - function renderStatusBtns(currentStatus, type, id, errorStatus) { + function renderStatusBtns(currentStatus, type, id) { const order = ['pass', 'hold', 'skip']; - const showFail = errorStatus === 'error' || errorStatus === 'dead'; let html = '
'; order.forEach((s, i) => { const cfg = statusConfig[s]; const isActive = s === currentStatus; - const bg = isActive ? cfg.bg : '#111'; + const bg = isActive ? cfg.bg : '#1a1a1a'; const border = isActive ? cfg.border : '#333'; - const color = isActive ? cfg.color : '#444'; + const color = isActive ? cfg.color : '#ccc'; html += ``; }); - if (showFail) { - const cfg = statusConfig.fail; - html += ``; - } html += '
'; return html; } + // Render TLD section header + function renderTLDHeader(tld) { + return `
+
+
+                .${escapeHtml(tld)}
+
+
+        `;
+    }
+
+    function renderTLDFooter(tld) {
+        return ``;
+    }
+
+    function closeTLDSection(container, tld) {
+        const tldContent = container.querySelector(`.tld-section[data-tld="${tld}"] .tld-content`);
+        if (tldContent) {
+            tldContent.insertAdjacentHTML('beforeend', renderTLDFooter(tld));
+        }
+    }
+
+    // Event delegation for TLD header/footer clicks (toggle section)
+    document.addEventListener('click', (e) => {
+        const tldHeader = e.target.closest('.tld-header');
+        const tldFooter = e.target.closest('.tld-footer');
+        if (tldHeader || tldFooter) {
+            const section = (tldHeader || tldFooter).closest('.tld-section');
+            if (section) {
+                const content = section.querySelector('.tld-content');
+                const toggle = section.querySelector('.tld-toggle');
+                if (content) {
+                    const isVisible = content.style.display !== 'none';
+                    content.style.display = isVisible ? 'none' : 'block';
+                    if (toggle) toggle.textContent = isVisible ? '▶' : '▼';
+
+                    if (isVisible) {
+                        // Closing - scroll to next TLD section
+                        const nextSection = section.nextElementSibling;
+                        if (nextSection && nextSection.classList.contains('tld-section')) {
+                            nextSection.scrollIntoView({ behavior: 'smooth', block: 'start' });
+                        }
+                    } else {
+                        // Opening - load domains if not already loaded
+                        if (section.dataset.loaded === 'false') {
+                            loadTLDDomains(section, searchQuery);
+                        }
+                    }
+                }
+            }
+        }
+    });
+
     // Render domain row with feeds
     function renderDomainRow(d) {
         const status = d.status || 'hold';
-        const hasError = !!d.last_error;
         let html = ``;
         html += ``;
-        html += renderStatusBtns(status, 'domain', d.host, hasError ? 'error' : null);
+        html += renderStatusBtns(status, 'domain', d.host);
         html += `${escapeHtml(d.host)}`;
         if (d.last_error) {
@@ -206,15 +247,11 @@ function initDashboard() {
         html += ``;
         html += ``;
-        const lang = f.language || '';
-        html += `${escapeHtml(lang)}`;
-        html += renderStatusBtns(feedStatus, 'feed', f.url, f.status);
-
-        const statusColor = f.status === 'active' ? '#484' : f.status === 'error' ? '#a66' : '#666';
-        html += `${escapeHtml(f.status || 'active')}`;
+        html += `${escapeHtml(f.language || '')} `;
+        html += renderStatusBtns(feedStatus, 'feed', f.url);
         if (f.item_count > 0) {
-            html += `${commaFormat(f.item_count)}`;
+            html += `${commaFormat(f.item_count)}`;
         } else {
             html += ``;
         }
@@ -235,6 +272,7 @@ function initDashboard() {
         html += '';
         html += '';
     });
+    html += '';
     html += '';
     }
     html += '';
@@ -285,7 +323,7 @@ function initDashboard() {
         infiniteScrollState = null;
     }
 
-    window.addEventListener('scroll', async () => {
+    async function checkInfiniteScroll() {
         if (!infiniteScrollState || infiniteScrollState.ended || isLoadingMore) return;
         const scrollY = window.scrollY + window.innerHeight;
         const docHeight = document.documentElement.scrollHeight;
@@ -294,60 +332,119 @@ function initDashboard() {
             await infiniteScrollState.loadMore();
             isLoadingMore = false;
         }
-    });
+    }
+
+    window.addEventListener('scroll', checkInfiniteScroll);
+
+    // Load and display feeds with lazy-loading TLD sections
+    let tldObserver = null;
 
-    // Load and display feeds
     async function loadFeeds(query = '') {
         const output = document.getElementById('output');
-        output.innerHTML = 'Loading...';
+        output.innerHTML = 'Loading TLDs...';
 
-        let offset = 0;
-        const limit = 100;
+        // Disconnect previous observer if any
+        if (tldObserver) {
+            tldObserver.disconnect();
+        }
 
-        async function loadMore() {
-            try {
-                let url = `/api/domains?limit=${limit}&offset=${offset}&sort=alpha&has_feeds=true`;
-                if (query) {
-                    url += `&search=${encodeURIComponent(query)}`;
-                }
+        try {
+            // Fetch all TLDs first
+            const tldsResp = await fetch('/api/tlds?has_feeds=true');
+            const tlds = await tldsResp.json();
 
-                const resp = await fetch(url);
-                const domains = await resp.json();
+            if (!tlds || tlds.length === 0) {
+                document.getElementById('infiniteLoader').textContent = 'No feeds found';
+                return;
+            }
 
-                if (!domains || domains.length === 0) {
-                    if (infiniteScrollState) infiniteScrollState.ended = true;
-                    document.getElementById('infiniteLoader').textContent = offset === 0 ? 'No feeds found' : 'End of list';
-                    return;
-                }
+            const container = output.querySelector('.domain-list');
 
-                const container = output.querySelector('.domain-list');
-                domains.forEach(d => {
-                    container.insertAdjacentHTML('beforeend', renderDomainRow(d));
+            // Render all TLD sections as collapsed placeholders
+            tlds.forEach(t => {
+                const tld = t.tld || 'unknown';
+                container.insertAdjacentHTML('beforeend', `
+
+
+                        .${escapeHtml(tld)}
+                        (${t.domain_count} domains)
+
+
+                `);
+            });
+
+            document.getElementById('infiniteLoader').textContent = `${tlds.length} TLDs loaded`;
+
+            // Set up IntersectionObserver for lazy loading (loads even when collapsed)
+            tldObserver = new IntersectionObserver((entries) => {
+                entries.forEach(entry => {
+                    if (entry.isIntersecting) {
+                        const section = entry.target;
+                        if (section.dataset.loaded === 'false') {
+                            loadTLDDomains(section, query);
+                            tldObserver.unobserve(section);
+                        }
+                    }
                 });
-                attachStatusHandlers(container);
+            }, { rootMargin: '500px' });
+
+            // Observe all TLD sections
+            container.querySelectorAll('.tld-section').forEach(section => {
+                tldObserver.observe(section);
+            });
+
+        } catch (err) {
+            document.getElementById('infiniteLoader').textContent = 'Error: ' + err.message;
+        }
+    }
+
+    // Load domains for a specific TLD section
+    async function loadTLDDomains(section, query = '') {
+        const tld = section.dataset.tld;
+        section.dataset.loaded = 'loading';
+
+        try {
+            let url = `/api/domains?has_feeds=true&tld=${encodeURIComponent(tld)}&limit=500`;
+            if (query) {
+                url += `&search=${encodeURIComponent(query)}`;
+            }
+
+            const resp = await fetch(url);
+            const domains = await resp.json();
+
+            const content = section.querySelector('.tld-content');
+            content.innerHTML = '';
+
+            if (!domains || domains.length === 0) {
+                content.innerHTML = 'No domains with feeds';
+            } else {
+                domains.forEach(d => {
+                    content.insertAdjacentHTML('beforeend', renderDomainRow(d));
+                });
+                // Add footer
+                content.insertAdjacentHTML('beforeend', renderTLDFooter(tld));
+                attachStatusHandlers(content);
 
                 // Load items for all feeds
-                container.querySelectorAll('.inline-feed-block').forEach(feedBlock => {
+                content.querySelectorAll('.inline-feed-block').forEach(feedBlock => {
                     const itemsDiv = feedBlock.querySelector('.feed-items');
                     if (itemsDiv && !itemsDiv.dataset.loaded) {
                         itemsDiv.dataset.loaded = 'true';
                         loadFeedItems(feedBlock.dataset.url, itemsDiv);
                     }
                 });
-
-                offset += domains.length;
-
-                if (domains.length < limit) {
-                    if (infiniteScrollState) infiniteScrollState.ended = true;
-                    document.getElementById('infiniteLoader').textContent = 'End of list';
-                }
-            } catch (err) {
-                document.getElementById('infiniteLoader').textContent = 'Error: ' + err.message;
             }
-        }
-        await loadMore();
-        setupInfiniteScroll(loadMore);
+            section.dataset.loaded = 'true';
+        } catch (err) {
+            const content = section.querySelector('.tld-content');
+            content.innerHTML = `Error: ${escapeHtml(err.message)}`;
+            section.dataset.loaded = 'false';
+        }
     }
 
     // Search handler
@@ -357,7 +454,6 @@ function initDashboard() {
         clearTimeout(searchTimeout);
         searchTimeout = setTimeout(() => {
             searchQuery = searchInput.value.trim();
-            clearInfiniteScroll();
             loadFeeds(searchQuery);
         }, 300);
     });
@@ -374,7 +470,6 @@ function initDashboard() {
     document.getElementById('holdDomains').textContent = commaFormat(stats.hold_domains);
     document.getElementById('passDomains').textContent = commaFormat(stats.pass_domains);
     document.getElementById('skipDomains').textContent = commaFormat(stats.skip_domains);
-    document.getElementById('failDomains').textContent = commaFormat(stats.fail_domains);
     document.getElementById('crawlRate').textContent = commaFormat(stats.crawl_rate);
     document.getElementById('checkRate').textContent = commaFormat(stats.check_rate);
     document.getElementById('totalFeeds').textContent = commaFormat(stats.total_feeds);
diff --git a/templates.go b/templates.go
index b5023b4..87190cb 100644
--- a/templates.go
+++ b/templates.go
@@ -444,8 +444,9 @@ const dashboardHTML = `
     1440.news Feed Crawler
+
-
+
 
 1440.news Feed Crawler
 
@@ -468,10 +469,6 @@ const dashboardHTML = `
            {{comma .SkipDomains}}
            Skip
-
-           {{comma .FailDomains}}
-           Fail
-
            {{comma .CrawlRate}}
            crawls/min
@@ -502,16 +499,16 @@ const dashboardHTML = `
-
+
+               style="width: 100%; padding: 12px; background: #0a0a0a; border: 1px solid #333; border-radius: 4px; color: #fff;">
-           v59
+           v100
            Last updated: {{.UpdatedAt.Format "2006-01-02 15:04:05"}}