diff --git a/.dockerignore b/.dockerignore
index 2283752..78f4960 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -6,3 +6,4 @@ feeds/
 .gitignore
 .claude
 CLAUDE.md
+.launch.sh
diff --git a/CLAUDE.md b/CLAUDE.md
index 1b5f630..1c2ec06 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -2,7 +2,7 @@
 
 This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
 
-> **Note:** Always run applications in containers via `docker compose up -d --build` when possible. This ensures proper networking between services (database, traefik, etc.) and matches the production environment.
+> **IMPORTANT:** Always use `./.launch.sh` to deploy changes. This script updates version numbers in static files (CSS/JS cache busting) before running `docker compose up -d --build`. Never use `docker compose` directly.
 
 ## Build & Run
 
@@ -84,17 +84,23 @@ PostgreSQL with pgx driver, using connection pooling:
 
 Column naming: snake_case (e.g., `source_host`, `pub_date`, `item_count`)
 
-### Crawl Logic
+### Processing Terminology
 
-1. Domains import as `pass` by default (auto-crawled)
-2. Crawl loop picks up domains where `last_crawled_at IS NULL`
-3. Full recursive crawl (HTTPS, fallback HTTP) up to MaxDepth=10, MaxPagesPerHost=10
+- **domain_check**: DNS lookup to verify the domain is live
+- **feed_crawl**: Crawl a live domain to discover RSS/Atom feeds
+- **feed_check**: Check a known feed for new items
+
+### Domain Processing Flow
+
+1. Domains import as `pass` by default
+2. Domain loop runs **domain_check** (DNS lookup) for unchecked domains
+3. Domain loop runs **feed_crawl** for checked domains (recursive crawl up to MaxDepth=10, MaxPagesPerHost=10)
 4. Extract `<link>` and anchor hrefs containing rss/atom/feed
-5. Parse discovered feeds for metadata, save with next_crawl_at
+5. Parse discovered feeds for metadata, save with `next_check_at`
 
 ### Feed Checking
 
-Uses conditional HTTP (ETag, If-Modified-Since). Adaptive backoff: base 100s + 100s per consecutive no-change. Respects RSS `<ttl>` and Syndication namespace hints.
+**feed_check** uses conditional HTTP (ETag, If-Modified-Since). Adaptive backoff: base 100s + 100s per consecutive no-change. Respects RSS `<ttl>` and Syndication namespace hints.
 
 ### Publishing
 
diff --git a/Dockerfile b/Dockerfile
index c6d849c..68d7fb2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,9 +1,9 @@
-FROM golang:1.24-alpine AS builder
+FROM golang:latest AS builder
 
 WORKDIR /app
 
 # Install build dependencies
-RUN apk add --no-cache gcc musl-dev
+RUN apt-get update && apt-get install -y gcc && rm -rf /var/lib/apt/lists/*
 
 # Copy go mod files first for layer caching
 COPY go.mod go.sum ./
@@ -17,12 +17,12 @@ COPY static/ ./static/
 RUN CGO_ENABLED=1 go build -o 1440.news .
 
 # Runtime stage
-FROM alpine:latest
+FROM ubuntu:latest
 
 WORKDIR /app
 
 # Install runtime dependencies
-RUN apk add --no-cache ca-certificates tzdata
+RUN apt-get update && apt-get install -y ca-certificates tzdata && rm -rf /var/lib/apt/lists/*
 
 # Copy binary from builder
 COPY --from=builder /app/1440.news .
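Note on the adaptive backoff documented above: it is implemented later in this patch as `calculateNextCheck` (parser.go) and applied in `CheckFeed` (feed_check.go) — delay is a 100s base plus 100s per consecutive no-change, reset when new content arrives. Below is a minimal standalone sketch of that scheduling math, not code from the patch; `feedSchedule` and `nextCheck` are illustrative names, not repo identifiers.

```go
package main

import (
	"fmt"
	"time"
)

// feedSchedule mirrors only the fields the backoff logic needs.
type feedSchedule struct {
	NoUpdate    int       // consecutive feed_checks that found nothing new
	NextCheckAt time.Time // when the next feed_check is due
}

// nextCheck applies the backoff described above: 100s base plus 100s per
// consecutive no-change; any change resets the counter to zero.
func nextCheck(f *feedSchedule, changed bool, now time.Time) time.Time {
	if changed {
		f.NoUpdate = 0
	} else {
		f.NoUpdate++
	}
	f.NextCheckAt = now.Add(time.Duration(100+100*f.NoUpdate) * time.Second)
	return f.NextCheckAt
}

func main() {
	f := &feedSchedule{}
	now := time.Now()
	// Quiet checks stretch the interval; a change snaps it back to the base.
	for i := 1; i <= 3; i++ {
		d := nextCheck(f, false, now).Sub(now)
		fmt.Printf("no change #%d -> recheck in %v\n", i, d)
	}
	fmt.Printf("new items    -> recheck in %v\n", nextCheck(f, true, now).Sub(now))
}
```

Three quiet checks push the delay to 200s, 300s, 400s; a check that finds new items drops it back to the 100s base, matching the patch's reset to `now.Add(100 * time.Second)`.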
diff --git a/api_feeds.go b/api_feeds.go index 8b1db85..2467f1d 100644 --- a/api_feeds.go +++ b/api_feeds.go @@ -26,8 +26,8 @@ func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) { Language string `json:"language,omitempty"` SiteURL string `json:"siteUrl,omitempty"` DiscoveredAt string `json:"discoveredAt,omitempty"` - LastCrawledAt string `json:"lastCrawledAt,omitempty"` - NextCrawlAt string `json:"nextCrawlAt,omitempty"` + LastCheckedAt string `json:"lastCheckedAt,omitempty"` + NextCheckAt string `json:"nextCheckAt,omitempty"` LastBuildDate string `json:"lastBuildDate,omitempty"` Status string `json:"status,omitempty"` LastError string `json:"lastError,omitempty"` @@ -40,7 +40,7 @@ func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) { var f FeedDetails var category, title, description, language, siteUrl *string - var lastCrawledAt, nextCrawlAt, lastBuildDate *time.Time + var lastCheckedAt, nextCheckAt, lastBuildDate *time.Time var status, lastError *string var oldestItemDate, newestItemDate *time.Time var itemCount *int @@ -49,7 +49,7 @@ func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) { err := c.db.QueryRow(` SELECT url, type, category, title, description, language, site_url, - discovered_at, last_crawled_at, next_crawl_at, last_build_date, + discovered_at, last_checked_at, next_check_at, last_build_date, status, last_error, (SELECT COUNT(*) FROM items WHERE feed_url = feeds.url) as item_count, oldest_item_date, newest_item_date, @@ -57,7 +57,7 @@ func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) { FROM feeds WHERE url = $1 `, feedURL).Scan( &f.URL, &f.Type, &category, &title, &description, &language, &siteUrl, - &discoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate, + &discoveredAt, &lastCheckedAt, &nextCheckAt, &lastBuildDate, &status, &lastError, &itemCount, &oldestItemDate, &newestItemDate, &publishStatus, &publishAccount, @@ -78,11 +78,11 @@ func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) { f.Language = StringValue(language) f.SiteURL = StringValue(siteUrl) f.DiscoveredAt = discoveredAt.Format(time.RFC3339) - if lastCrawledAt != nil { - f.LastCrawledAt = lastCrawledAt.Format(time.RFC3339) + if lastCheckedAt != nil { + f.LastCheckedAt = lastCheckedAt.Format(time.RFC3339) } - if nextCrawlAt != nil { - f.NextCrawlAt = nextCrawlAt.Format(time.RFC3339) + if nextCheckAt != nil { + f.NextCheckAt = nextCheckAt.Format(time.RFC3339) } if lastBuildDate != nil { f.LastBuildDate = lastBuildDate.Format(time.RFC3339) diff --git a/api_search.go b/api_search.go index 2dc77c3..c457ada 100644 --- a/api_search.go +++ b/api_search.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" "net/http" + "strings" "time" "github.com/jackc/pgx/v5" @@ -16,16 +17,16 @@ type SearchResult struct { } type SearchFeed struct { - URL string `json:"url"` - Type string `json:"type"` - Category string `json:"category"` - Title string `json:"title"` - Description string `json:"description"` - Language string `json:"language"` - SiteURL string `json:"site_url"` - DiscoveredAt string `json:"discovered_at"` - LastCrawledAt string `json:"last_crawled_at"` - NextCrawlAt string `json:"next_crawl_at"` + URL string `json:"url"` + Type string `json:"type"` + Category string `json:"category"` + Title string `json:"title"` + Description string `json:"description"` + Language string `json:"language"` + SiteURL string `json:"site_url"` + DiscoveredAt string `json:"discovered_at"` + LastCheckedAt string 
`json:"last_checked_at"` + NextCheckAt string `json:"next_check_at"` LastBuildDate string `json:"last_build_date"` Status string `json:"status"` LastError string `json:"last_error"` @@ -76,7 +77,7 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { var url string var feedType, category, title, description, language, siteUrl *string var discoveredAt time.Time - var lastCrawledAt, nextCrawlAt, lastBuildDate *time.Time + var lastCheckedAt, nextCheckAt, lastBuildDate *time.Time var itemCount *int var status, lastError *string var lastErrorAt *time.Time @@ -85,7 +86,7 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { var noUpdate *bool if err := rows.Scan(&url, &feedType, &category, &title, &description, &language, &siteUrl, - &discoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate, + &discoveredAt, &lastCheckedAt, &nextCheckAt, &lastBuildDate, &status, &lastError, &lastErrorAt, &sourceUrl, &sourceHost, &tld, &itemCount, &oldestItemDate, &newestItemDate, &noUpdate); err != nil { @@ -110,11 +111,11 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { SourceHost: StringValue(sourceHost), TLD: StringValue(tld), } - if lastCrawledAt != nil { - sf.LastCrawledAt = lastCrawledAt.Format(time.RFC3339) + if lastCheckedAt != nil { + sf.LastCheckedAt = lastCheckedAt.Format(time.RFC3339) } - if nextCrawlAt != nil { - sf.NextCrawlAt = nextCrawlAt.Format(time.RFC3339) + if nextCheckAt != nil { + sf.NextCheckAt = nextCheckAt.Format(time.RFC3339) } if lastBuildDate != nil { sf.LastBuildDate = lastBuildDate.Format(time.RFC3339) @@ -138,16 +139,18 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { } // Search feeds by source_host (LIKE search for domain matching) + // Use LOWER() to leverage trigram index + lowerPattern := "%" + strings.ToLower(query) + "%" hostRows, err := c.db.Query(` SELECT url, type, category, title, description, language, site_url, - discovered_at, last_crawled_at, next_crawl_at, last_build_date, + discovered_at, last_checked_at, next_check_at, last_build_date, status, last_error, last_error_at, source_url, source_host, tld, item_count, oldest_item_date, newest_item_date, no_update FROM feeds - WHERE source_host ILIKE $1 OR url ILIKE $1 + WHERE LOWER(source_host) LIKE $1 OR LOWER(url) LIKE $1 LIMIT $2 - `, "%"+query+"%", limit) + `, lowerPattern, limit) if err == nil { defer hostRows.Close() for hostRows.Next() { @@ -163,7 +166,7 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { tsQuery := ToSearchQuery(query) feedRows, err := c.db.Query(` SELECT url, type, category, title, description, language, site_url, - discovered_at, last_crawled_at, next_crawl_at, last_build_date, + discovered_at, last_checked_at, next_check_at, last_build_date, status, last_error, last_error_at, source_url, source_host, tld, item_count, oldest_item_date, newest_item_date, no_update @@ -228,7 +231,7 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { // Fetch feed info for this item's feed var fType, fCategory, fTitle, fDesc, fLang, fSiteUrl *string var fDiscoveredAt time.Time - var fLastCrawledAt, fNextCrawlAt, fLastBuildDate *time.Time + var fLastCheckedAt, fNextCheckAt, fLastBuildDate *time.Time var fItemCount *int var fStatus, fLastError *string var fLastErrorAt *time.Time @@ -238,13 +241,13 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { c.db.QueryRow(` SELECT type, category, title, description, language, site_url, 
- discovered_at, last_crawled_at, next_crawl_at, last_build_date, + discovered_at, last_checked_at, next_check_at, last_build_date, status, last_error, last_error_at, source_url, source_host, tld, item_count, oldest_item_date, newest_item_date, no_update FROM feeds WHERE url = $1 `, feedUrl).Scan(&fType, &fCategory, &fTitle, &fDesc, &fLang, &fSiteUrl, - &fDiscoveredAt, &fLastCrawledAt, &fNextCrawlAt, &fLastBuildDate, + &fDiscoveredAt, &fLastCheckedAt, &fNextCheckAt, &fLastBuildDate, &fStatus, &fLastError, &fLastErrorAt, &fSourceUrl, &fSourceHost, &fTLD, &fItemCount, &fOldestItemDate, &fNewestItemDate, &fNoUpdate) @@ -268,11 +271,11 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { SourceHost: StringValue(fSourceHost), TLD: StringValue(fTLD), } - if fLastCrawledAt != nil { - sf.LastCrawledAt = fLastCrawledAt.Format(time.RFC3339) + if fLastCheckedAt != nil { + sf.LastCheckedAt = fLastCheckedAt.Format(time.RFC3339) } - if fNextCrawlAt != nil { - sf.NextCrawlAt = fNextCrawlAt.Format(time.RFC3339) + if fNextCheckAt != nil { + sf.NextCheckAt = fNextCheckAt.Format(time.RFC3339) } if fLastBuildDate != nil { sf.LastBuildDate = fLastBuildDate.Format(time.RFC3339) diff --git a/crawler.go b/crawler.go index 9cbf157..f452a21 100644 --- a/crawler.go +++ b/crawler.go @@ -1,9 +1,11 @@ package main import ( + "context" "encoding/json" "fmt" "io" + "net" "net/http" "os" "strings" @@ -15,23 +17,22 @@ import ( ) type Crawler struct { - MaxDepth int - MaxPagesPerHost int - Timeout time.Duration - UserAgent string - visited sync.Map - feedsMu sync.Mutex - client *http.Client - hostsProcessed int32 - feedsChecked int32 - startTime time.Time - db *DB - displayedCrawlRate int - displayedCheckRate int - domainsImported int32 - cachedStats *DashboardStats - cachedAllDomains []DomainStat - statsMu sync.RWMutex + MaxDepth int + MaxPagesPerHost int + Timeout time.Duration + UserAgent string + visited sync.Map + feedsMu sync.Mutex + client *http.Client + domainsCrawled int32 // feed_crawl: domains crawled for feed discovery + domainsChecked int32 // domain_check: domains checked for liveness + feedsChecked int32 // feed_check: feeds checked for new items + startTime time.Time + db *DB + domainsImported int32 + cachedStats *DashboardStats + cachedAllDomains []DomainStat + statsMu sync.RWMutex } func NewCrawler(connString string) (*Crawler, error) { @@ -467,43 +468,92 @@ func (c *Crawler) GetAllUnpublishedItems(limit int) ([]Item, error) { return items, nil } -// StartCrawlLoop runs the domain crawling loop independently -func (c *Crawler) StartCrawlLoop() { - numWorkers := 100 +// dnsResolver uses local caching DNS (infra-dns) with fallback to system +var dnsResolver = &net.Resolver{ + PreferGo: true, + Dial: func(ctx context.Context, network, address string) (net.Conn, error) { + d := net.Dialer{Timeout: 2 * time.Second} + // Try local caching DNS first (CoreDNS on proxy network) + conn, err := d.DialContext(ctx, "udp", "infra-dns:53") + if err == nil { + return conn, nil + } + // Fallback to system DNS + return d.DialContext(ctx, network, address) + }, +} + +// domainCheck performs a DNS lookup to check if a domain resolves +func (c *Crawler) domainCheck(host string) error { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _, err := dnsResolver.LookupHost(ctx, host) + return err +} + +// StartDomainLoop runs the domain processing loop (domain_check + feed_crawl) +func (c *Crawler) StartDomainLoop() { + numWorkers := 1000 // Buffered channel for 
domain work - workChan := make(chan *Domain, 100) + workChan := make(chan *Domain, 1000) // Start workers for i := 0; i < numWorkers; i++ { go func() { for domain := range workChan { - feedsFound, crawlErr := c.crawlHost(domain.Host) - errStr := "" - if crawlErr != nil { - errStr = crawlErr.Error() - } - if err := c.markDomainCrawled(domain.Host, feedsFound, errStr); err != nil { - fmt.Printf("Error marking domain %s as crawled: %v\n", domain.Host, err) + fh := domain.FullHost() + if domain.CrawledAt.Equal(DomainStateUnchecked) { + // domain_check: DNS lookup for liveness + err := c.domainCheck(fh) + errStr := "" + if err != nil { + errStr = err.Error() + } + if err := c.markDomainChecked(domain.Host, domain.TLD, errStr); err != nil { + fmt.Printf("Error marking domain %s as checked: %v\n", fh, err) + } + atomic.AddInt32(&c.domainsChecked, 1) + } else { + // feed_crawl: crawl domain to discover feeds + feedsFound, crawlErr := c.feedCrawl(fh) + errStr := "" + if crawlErr != nil { + errStr = crawlErr.Error() + } + if err := c.markDomainCrawled(domain.Host, domain.TLD, feedsFound, errStr); err != nil { + fmt.Printf("Error marking domain %s as crawled: %v\n", fh, err) + } + atomic.AddInt32(&c.domainsCrawled, 1) } } }() } - const fetchSize = 100 + const fetchSize = 1000 for { - domains, err := c.GetDomainsToCrawl(fetchSize) + domains, err := c.GetDomainsToProcess(fetchSize) if err != nil { - fmt.Printf("Error fetching domains to crawl: %v\n", err) + fmt.Printf("Error fetching domains to process: %v\n", err) } if len(domains) == 0 { - c.displayedCrawlRate = 0 time.Sleep(1 * time.Second) continue } - fmt.Printf("%s crawl: %d domains to crawl\n", time.Now().Format("15:04:05"), len(domains)) + // Count unchecked vs checked for logging + unchecked := 0 + for _, d := range domains { + if d.CrawledAt.Equal(DomainStateUnchecked) { + unchecked++ + } + } + checked := len(domains) - unchecked + + if unchecked > 0 || checked > 0 { + fmt.Printf("%s domain: %d domain_check, %d feed_crawl\n", time.Now().Format("15:04:05"), unchecked, checked) + } for _, domain := range domains { workChan <- domain @@ -513,12 +563,12 @@ func (c *Crawler) StartCrawlLoop() { } } -// StartCheckLoop runs the feed checking loop independently -func (c *Crawler) StartCheckLoop() { - numWorkers := 100 +// StartFeedCheckLoop runs the feed_check loop (checking feeds for new items) +func (c *Crawler) StartFeedCheckLoop() { + numWorkers := 1000 // Buffered channel for feed work - workChan := make(chan *Feed, 100) + workChan := make(chan *Feed, 1000) // Start workers for i := 0; i < numWorkers; i++ { @@ -537,12 +587,11 @@ func (c *Crawler) StartCheckLoop() { } if len(feeds) == 0 { - c.displayedCheckRate = 0 time.Sleep(1 * time.Second) continue } - fmt.Printf("%s check: %d feeds to check\n", time.Now().Format("15:04:05"), len(feeds)) + fmt.Printf("%s feed_check: %d feeds\n", time.Now().Format("15:04:05"), len(feeds)) for _, feed := range feeds { workChan <- feed @@ -552,8 +601,9 @@ func (c *Crawler) StartCheckLoop() { } } -func (c *Crawler) crawlHost(host string) (feedsFound int, err error) { - atomic.AddInt32(&c.hostsProcessed, 1) +// feedCrawl crawls a domain to discover RSS/Atom feeds +func (c *Crawler) feedCrawl(host string) (feedsFound int, err error) { + atomic.AddInt32(&c.domainsCrawled, 1) localVisited := make(map[string]bool) pagesVisited := 0 diff --git a/dashboard.go b/dashboard.go index 4cda8f2..1265623 100644 --- a/dashboard.go +++ b/dashboard.go @@ -12,17 +12,26 @@ type DashboardStats struct { HoldDomains int 
`json:"hold_domains"` PassDomains int `json:"pass_domains"` SkipDomains int `json:"skip_domains"` + DeadDomains int `json:"dead_domains"` // Feed stats TotalFeeds int `json:"total_feeds"` + AliveFeeds int `json:"alive_feeds"` // status='pass' (healthy feeds) + PublishFeeds int `json:"publish_feeds"` // publish_status='pass' (approved for publishing) + SkipFeeds int `json:"skip_feeds"` + HoldFeeds int `json:"hold_feeds"` + DeadFeeds int `json:"dead_feeds"` + EmptyFeeds int `json:"empty_feeds"` RSSFeeds int `json:"rss_feeds"` AtomFeeds int `json:"atom_feeds"` + JSONFeeds int `json:"json_feeds"` UnknownFeeds int `json:"unknown_feeds"` - // Crawl progress - HostsProcessed int32 `json:"hosts_processed"` - CrawlRate int `json:"crawl_rate"` // crawls per minute - CheckRate int `json:"check_rate"` // feed checks per minute + // Processing rates (per minute) + DomainsCrawled int32 `json:"domains_crawled"` // feed_crawl count + DomainCheckRate int `json:"domain_check_rate"` // domain_check per minute + FeedCrawlRate int `json:"feed_crawl_rate"` // feed_crawl per minute + FeedCheckRate int `json:"feed_check_rate"` // feed_check per minute // Timing UpdatedAt time.Time `json:"updated_at"` @@ -122,28 +131,15 @@ func (c *Crawler) GetDashboardStats() (*DashboardStats, error) { func (c *Crawler) calculateStats() (*DashboardStats, error) { stats := &DashboardStats{ UpdatedAt: time.Now(), - HostsProcessed: c.hostsProcessed, + DomainsCrawled: c.domainsCrawled, } - // Calculate crawl rate (crawls per minute), smoothed by +/-1 per update + // Calculate rates (per minute) elapsed := time.Since(c.startTime).Minutes() if elapsed > 0 { - actualRate := int(float64(c.hostsProcessed) / elapsed) - if actualRate > c.displayedCrawlRate { - c.displayedCrawlRate++ - } else if actualRate < c.displayedCrawlRate { - c.displayedCrawlRate-- - } - stats.CrawlRate = c.displayedCrawlRate - - // Calculate check rate (feed checks per minute), smoothed by +/-1 per update - actualCheckRate := int(float64(c.feedsChecked) / elapsed) - if actualCheckRate > c.displayedCheckRate { - c.displayedCheckRate++ - } else if actualCheckRate < c.displayedCheckRate { - c.displayedCheckRate-- - } - stats.CheckRate = c.displayedCheckRate + stats.DomainCheckRate = int(float64(c.domainsChecked) / elapsed) + stats.FeedCrawlRate = int(float64(c.domainsCrawled) / elapsed) + stats.FeedCheckRate = int(float64(c.feedsChecked) / elapsed) } // Get domain stats @@ -186,6 +182,8 @@ func (c *Crawler) collectDomainStats(stats *DashboardStats) error { stats.PassDomains = count case "skip": stats.SkipDomains = count + case "dead": + stats.DeadDomains = count } } if err := rows.Err(); err != nil { @@ -202,6 +200,39 @@ func (c *Crawler) collectFeedStats(stats *DashboardStats) error { return err } + // Get status counts + statusRows, err := c.db.Query("SELECT status, COUNT(*) FROM feeds GROUP BY status") + if err != nil { + return err + } + defer statusRows.Close() + + for statusRows.Next() { + var status *string + var count int + if err := statusRows.Scan(&status, &count); err != nil { + continue + } + if status != nil { + switch *status { + case "pass": + stats.AliveFeeds = count + case "skip": + stats.SkipFeeds = count + case "hold": + stats.HoldFeeds = count + case "dead": + stats.DeadFeeds = count + } + } + } + + // Count feeds approved for publishing (publish_status='pass') + c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE publish_status = 'pass'").Scan(&stats.PublishFeeds) + + // Count empty feeds (item_count = 0 or NULL) + c.db.QueryRow("SELECT COUNT(*) FROM 
feeds WHERE item_count IS NULL OR item_count = 0").Scan(&stats.EmptyFeeds) + // Single query to get all type counts (one index scan instead of three) rows, err := c.db.Query("SELECT type, COUNT(*) FROM feeds GROUP BY type") if err != nil { @@ -223,6 +254,8 @@ func (c *Crawler) collectFeedStats(stats *DashboardStats) error { stats.RSSFeeds = count case "atom": stats.AtomFeeds = count + case "json": + stats.JSONFeeds = count default: stats.UnknownFeeds += count } diff --git a/docker-compose.yml b/docker-compose.yml index 15c6ce6..e8265d6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,14 +1,15 @@ services: app-1440-news: build: . - container_name: app-1440-news + image: atproto-1440news-app + container_name: atproto-1440news-app restart: unless-stopped stop_grace_period: 30s env_file: - pds.env - oauth.env environment: - DB_HOST: atproto-postgres + DB_HOST: infra-postgres DB_PORT: 5432 DB_USER: news_1440 DB_PASSWORD_FILE: /run/secrets/db_password @@ -54,7 +55,7 @@ services: secrets: db_password: - file: ../postgres/secrets/news_1440_password.txt + file: ../../../infra/postgres/secrets/news_1440_password.txt networks: proxy: diff --git a/domain.go b/domain.go index 2eba749..9e8767d 100644 --- a/domain.go +++ b/domain.go @@ -14,19 +14,38 @@ import ( "github.com/jackc/pgx/v5" ) -// Domain represents a host to be crawled for feeds +// Domain represents a host to process for feeds // Status: hold (pending review), pass (approved), skip (not processing) +// CrawledAt: zero time = needs domain_check, +1 sec = needs feed_crawl, real time = done type Domain struct { - Host string `json:"host"` - Status string `json:"status"` - DiscoveredAt time.Time `json:"discovered_at"` - LastCheckedAt time.Time `json:"last_checked_at,omitempty"` - LastCrawledAt time.Time `json:"last_crawled_at,omitempty"` - FeedsFound int `json:"feeds_found,omitempty"` - LastError string `json:"last_error,omitempty"` - TLD string `json:"tld,omitempty"` + Host string `json:"host"` + Status string `json:"status"` + CrawledAt time.Time `json:"crawled_at"` + FeedsFound int `json:"feeds_found,omitempty"` + LastError string `json:"last_error,omitempty"` + TLD string `json:"tld,omitempty"` + MissCount int `json:"miss_count,omitempty"` } +// MissCountThreshold is the number of consecutive errors before setting status to hold +const MissCountThreshold = 100 + +// ErrorRetryDelay is how long to wait before retrying a domain with errors (1 hour minimum) +// At 100 seconds actual rate due to queue, 100 misses = ~2.8 hours +// At 1 hour minimum delay, 100 misses = ~4+ days in practice +var ErrorRetryDelay = 1 * time.Hour + +// FullHost returns the complete hostname (host + tld) +func (d *Domain) FullHost() string { + return fullHost(d.Host, d.TLD) +} + +// Sentinel values for domain processing state +var ( + DomainStateUnchecked = time.Time{} // 0001-01-01 00:00:00 - needs domain_check + DomainStateChecked = time.Time{}.Add(time.Second) // 0001-01-01 00:00:01 - needs feed_crawl +) + // shouldAutoSkipDomain checks if a domain should be auto-skipped based on patterns func shouldAutoSkipDomain(host string) bool { // Never skip our own domain @@ -51,62 +70,63 @@ func shouldAutoSkipDomain(host string) bool { // saveDomain stores a domain in PostgreSQL func (c *Crawler) saveDomain(domain *Domain) error { // Auto-skip domains matching spam patterns + fh := domain.FullHost() status := domain.Status - if shouldAutoSkipDomain(domain.Host) { + if shouldAutoSkipDomain(fh) { status = "skip" } _, err := c.db.Exec(` - INSERT INTO domains 
(host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8) - ON CONFLICT(host) DO UPDATE SET + INSERT INTO domains (host, status, crawled_at, feeds_found, last_error, tld) + VALUES ($1, $2, $3, $4, $5, $6) + ON CONFLICT(host, tld) DO UPDATE SET status = EXCLUDED.status, - last_checked_at = EXCLUDED.last_checked_at, - last_crawled_at = EXCLUDED.last_crawled_at, + crawled_at = EXCLUDED.crawled_at, feeds_found = EXCLUDED.feeds_found, - last_error = EXCLUDED.last_error, - tld = EXCLUDED.tld - `, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCheckedAt), - NullableTime(domain.LastCrawledAt), domain.FeedsFound, NullableString(domain.LastError), domain.TLD) + last_error = EXCLUDED.last_error + `, stripTLD(fh), status, domain.CrawledAt, + domain.FeedsFound, NullableString(domain.LastError), domain.TLD) return err } // saveDomainTx stores a domain using a transaction func (c *Crawler) saveDomainTx(tx pgx.Tx, domain *Domain) error { // Auto-skip domains matching spam patterns + fh := domain.FullHost() status := domain.Status - if shouldAutoSkipDomain(domain.Host) { + if shouldAutoSkipDomain(fh) { status = "skip" } _, err := tx.Exec(context.Background(), ` - INSERT INTO domains (host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8) - ON CONFLICT(host) DO NOTHING - `, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCheckedAt), - NullableTime(domain.LastCrawledAt), domain.FeedsFound, NullableString(domain.LastError), domain.TLD) + INSERT INTO domains (host, status, crawled_at, feeds_found, last_error, tld) + VALUES ($1, $2, $3, $4, $5, $6) + ON CONFLICT(host, tld) DO NOTHING + `, stripTLD(fh), status, domain.CrawledAt, + domain.FeedsFound, NullableString(domain.LastError), domain.TLD) return err } // domainExists checks if a domain already exists in the database func (c *Crawler) domainExists(host string) bool { + host = normalizeHost(host) var exists bool - err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM domains WHERE host = $1)", normalizeHost(host)).Scan(&exists) + err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM domains WHERE host = $1 AND tld = $2)", stripTLD(host), getTLD(host)).Scan(&exists) return err == nil && exists } // getDomain retrieves a domain from PostgreSQL func (c *Crawler) getDomain(host string) (*Domain, error) { + host = normalizeHost(host) domain := &Domain{} - var lastCheckedAt, lastCrawledAt *time.Time var lastError *string err := c.db.QueryRow(` - SELECT host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld - FROM domains WHERE host = $1 - `, normalizeHost(host)).Scan( - &domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCheckedAt, &lastCrawledAt, - &domain.FeedsFound, &lastError, &domain.TLD, + SELECT host, tld, status, crawled_at, feeds_found, last_error + FROM domains WHERE host = $1 AND tld = $2 + `, stripTLD(host), getTLD(host)).Scan( + &domain.Host, &domain.TLD, &domain.Status, &domain.CrawledAt, + &domain.FeedsFound, &lastError, ) if err == pgx.ErrNoRows { @@ -116,21 +136,26 @@ func (c *Crawler) getDomain(host string) (*Domain, error) { return nil, err } - domain.LastCheckedAt = TimeValue(lastCheckedAt) - domain.LastCrawledAt = TimeValue(lastCrawledAt) domain.LastError = StringValue(lastError) return domain, nil } -// GetDomainsToCrawl returns domains ready for crawling (status='pass', not yet crawled) -func (c *Crawler) 
GetDomainsToCrawl(limit int) ([]*Domain, error) { +// GetDomainsToProcess returns domains needing processing (domain_check or feed_crawl) +// crawled_at = zero time means needs domain_check, +1 sec means needs feed_crawl +// Domains with errors are retried when crawled_at < now (scheduled by ErrorRetryDelay) +func (c *Crawler) GetDomainsToProcess(limit int) ([]*Domain, error) { + now := time.Now() rows, err := c.db.Query(` - SELECT host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld - FROM domains WHERE status = 'pass' AND last_crawled_at IS NULL - ORDER BY discovered_at DESC - LIMIT $1 - `, limit) + SELECT host, status, crawled_at, feeds_found, last_error, tld + FROM domains + WHERE status = 'pass' AND ( + (crawled_at < '0001-01-02' AND last_error IS NULL) -- new domains + OR (crawled_at < $1 AND last_error IS NOT NULL) -- retry errors after delay + ) + ORDER BY crawled_at ASC + LIMIT $2 + `, now, limit) if err != nil { return nil, err } @@ -139,23 +164,45 @@ func (c *Crawler) GetDomainsToCrawl(limit int) ([]*Domain, error) { return c.scanDomains(rows) } +// markDomainChecked updates a domain after domain_check (sets to +1 sec for feed_crawl) +// host parameter should be the stripped host (without TLD) +func (c *Crawler) markDomainChecked(host, tld, lastError string) error { + if lastError != "" { + // Increment miss_count, set to 'hold' only at threshold + // Schedule retry after ErrorRetryDelay + retryAt := time.Now().Add(ErrorRetryDelay) + _, err := c.db.Exec(` + UPDATE domains SET + crawled_at = $1, + last_error = $2, + miss_count = miss_count + 1, + status = CASE WHEN miss_count + 1 >= $3 THEN 'hold' ELSE status END + WHERE host = $4 AND tld = $5 + `, retryAt, lastError, MissCountThreshold, host, tld) + return err + } + // Success - reset miss_count + _, err := c.db.Exec(` + UPDATE domains SET crawled_at = $1, last_error = NULL, miss_count = 0 + WHERE host = $2 AND tld = $3 + `, DomainStateChecked, host, tld) + return err +} + // scanDomains is a helper to scan multiple domain rows func (c *Crawler) scanDomains(rows pgx.Rows) ([]*Domain, error) { var domains []*Domain for rows.Next() { domain := &Domain{} - var lastCheckedAt, lastCrawledAt *time.Time var lastError *string if err := rows.Scan( - &domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCheckedAt, &lastCrawledAt, + &domain.Host, &domain.Status, &domain.CrawledAt, &domain.FeedsFound, &lastError, &domain.TLD, ); err != nil { continue } - domain.LastCheckedAt = TimeValue(lastCheckedAt) - domain.LastCrawledAt = TimeValue(lastCrawledAt) domain.LastError = StringValue(lastError) domains = append(domains, domain) @@ -164,20 +211,30 @@ func (c *Crawler) scanDomains(rows pgx.Rows) ([]*Domain, error) { return domains, rows.Err() } -// markDomainCrawled updates a domain after the crawl stage -func (c *Crawler) markDomainCrawled(host string, feedsFound int, lastError string) error { - now := time.Now() +// markDomainCrawled updates a domain after feed_crawl (sets to NOW()) +// host parameter should be the stripped host (without TLD) +func (c *Crawler) markDomainCrawled(host, tld string, feedsFound int, lastError string) error { if lastError != "" { + // Increment miss_count, set to 'hold' only at threshold + // Schedule retry after ErrorRetryDelay + retryAt := time.Now().Add(ErrorRetryDelay) _, err := c.db.Exec(` - UPDATE domains SET last_crawled_at = $1, feeds_found = $2, last_error = $3 - WHERE host = $4 - `, now, feedsFound, lastError, normalizeHost(host)) + UPDATE domains SET + crawled_at 
= $1, + feeds_found = $2, + last_error = $3, + miss_count = miss_count + 1, + status = CASE WHEN miss_count + 1 >= $4 THEN 'hold' ELSE status END + WHERE host = $5 AND tld = $6 + `, retryAt, feedsFound, lastError, MissCountThreshold, host, tld) return err } + // Success - reset miss_count + now := time.Now() _, err := c.db.Exec(` - UPDATE domains SET last_crawled_at = $1, feeds_found = $2, last_error = NULL - WHERE host = $3 - `, now, feedsFound, normalizeHost(host)) + UPDATE domains SET crawled_at = $1, feeds_found = $2, last_error = NULL, miss_count = 0 + WHERE host = $3 AND tld = $4 + `, now, feedsFound, host, tld) return err } @@ -193,13 +250,13 @@ func (c *Crawler) GetDomainCount() (total int, hold int, err error) { // ImportTestDomains adds a list of specific domains for testing func (c *Crawler) ImportTestDomains(domains []string) { - now := time.Now() for _, host := range domains { + host = normalizeHost(host) _, err := c.db.Exec(` - INSERT INTO domains (host, status, discovered_at, tld) - VALUES ($1, 'pass', $2, $3) - ON CONFLICT(host) DO NOTHING - `, host, now, getTLD(host)) + INSERT INTO domains (host, status, tld) + VALUES ($1, 'pass', $2) + ON CONFLICT(host, tld) DO NOTHING + `, stripTLD(host), getTLD(host)) if err != nil { fmt.Printf("Error adding test domain %s: %v\n", host, err) } else { @@ -255,7 +312,6 @@ func (c *Crawler) ImportDomainsInBackground(filename string) { scanner.Buffer(buf, 1024*1024) const batchSize = 100 - now := time.Now() totalImported := 0 batchCount := 0 @@ -299,14 +355,14 @@ func (c *Crawler) ImportDomainsInBackground(filename string) { if shouldAutoSkipDomain(d.host) { status = "skip" } - rows[i] = []interface{}{d.host, status, now, d.tld} + rows[i] = []interface{}{stripTLD(d.host), status, d.tld} } // Use CopyFrom for bulk insert imported, err := conn.CopyFrom( ctx, pgx.Identifier{"domains"}, - []string{"host", "status", "discovered_at", "tld"}, + []string{"host", "status", "tld"}, pgx.CopyFromRows(rows), ) conn.Release() @@ -319,10 +375,10 @@ func (c *Crawler) ImportDomainsInBackground(filename string) { status = "skip" } c.db.Exec(` - INSERT INTO domains (host, status, discovered_at, tld) - VALUES ($1, $2, $3, $4) - ON CONFLICT(host) DO NOTHING - `, d.host, status, now, d.tld) + INSERT INTO domains (host, status, tld) + VALUES ($1, $2, $3) + ON CONFLICT(host, tld) DO NOTHING + `, stripTLD(d.host), status, d.tld) } imported = int64(len(domains)) } @@ -369,7 +425,6 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in buf := make([]byte, 0, 64*1024) scanner.Buffer(buf, 1024*1024) - now := time.Now() count := 0 const batchSize = 100 @@ -408,10 +463,10 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in status = "skip" } result, err := c.db.Exec(` - INSERT INTO domains (host, status, discovered_at, tld) - VALUES ($1, $2, $3, $4) - ON CONFLICT(host) DO NOTHING - `, d.host, status, now, d.tld) + INSERT INTO domains (host, status, tld) + VALUES ($1, $2, $3) + ON CONFLICT(host, tld) DO NOTHING + `, stripTLD(d.host), status, d.tld) if err != nil { skipped++ } else if result > 0 { diff --git a/feed.go b/feed.go index 83cadd6..083f4fa 100644 --- a/feed.go +++ b/feed.go @@ -101,8 +101,8 @@ type Feed struct { // Timing DiscoveredAt time.Time `json:"discovered_at"` - LastCrawledAt time.Time `json:"last_crawled_at,omitempty"` - NextCrawlAt time.Time `json:"next_crawl_at,omitempty"` + LastCheckedAt time.Time `json:"last_checked_at,omitempty"` // feed_check: when last checked + NextCheckAt time.Time 
`json:"next_check_at,omitempty"` // feed_check: when to next check LastBuildDate time.Time `json:"last_build_date,omitempty"` // From feed's lastBuildDate/updated // Cache headers for conditional requests @@ -120,7 +120,7 @@ type Feed struct { TLD string `json:"tld,omitempty"` // Content stats - ItemCount int `json:"item_count,omitempty"` // Number of items in last crawl + ItemCount int `json:"item_count,omitempty"` // Number of items in last feed_check OldestItemDate time.Time `json:"oldest_item_date,omitempty"` NewestItemDate time.Time `json:"newest_item_date,omitempty"` @@ -153,7 +153,7 @@ func (c *Crawler) saveFeed(feed *Feed) error { _, err := c.db.Exec(` INSERT INTO feeds ( url, type, category, title, description, language, site_url, - discovered_at, last_crawled_at, next_crawl_at, last_build_date, + discovered_at, last_checked_at, next_check_at, last_build_date, etag, last_modified, status, last_error, last_error_at, source_url, source_host, tld, @@ -168,8 +168,8 @@ func (c *Crawler) saveFeed(feed *Feed) error { description = EXCLUDED.description, language = EXCLUDED.language, site_url = EXCLUDED.site_url, - last_crawled_at = EXCLUDED.last_crawled_at, - next_crawl_at = EXCLUDED.next_crawl_at, + last_checked_at = EXCLUDED.last_checked_at, + next_check_at = EXCLUDED.next_check_at, last_build_date = EXCLUDED.last_build_date, etag = EXCLUDED.etag, last_modified = EXCLUDED.last_modified, @@ -185,7 +185,7 @@ func (c *Crawler) saveFeed(feed *Feed) error { `, feed.URL, feed.Type, feed.Category, NullableString(feed.Title), NullableString(feed.Description), NullableString(feed.Language), NullableString(feed.SiteURL), - feed.DiscoveredAt, NullableTime(feed.LastCrawledAt), NullableTime(feed.NextCrawlAt), NullableTime(feed.LastBuildDate), + feed.DiscoveredAt, NullableTime(feed.LastCheckedAt), NullableTime(feed.NextCheckAt), NullableTime(feed.LastBuildDate), NullableString(feed.ETag), NullableString(feed.LastModified), feed.Status, NullableString(feed.LastError), NullableTime(feed.LastErrorAt), NullableString(feed.SourceURL), NullableString(feed.SourceHost), NullableString(feed.TLD), @@ -200,14 +200,14 @@ func (c *Crawler) saveFeed(feed *Feed) error { func (c *Crawler) getFeed(feedURL string) (*Feed, error) { feed := &Feed{} var category, title, description, language, siteURL *string - var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time + var lastCheckedAt, nextCheckAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time var etag, lastModified, lastError, sourceURL, sourceHost, tld *string var publishStatus, publishAccount *string var itemCount, noUpdate *int err := c.db.QueryRow(` SELECT url, type, category, title, description, language, site_url, - discovered_at, last_crawled_at, next_crawl_at, last_build_date, + discovered_at, last_checked_at, next_check_at, last_build_date, etag, last_modified, status, last_error, last_error_at, source_url, source_host, tld, @@ -217,7 +217,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) { FROM feeds WHERE url = $1 `, normalizeURL(feedURL)).Scan( &feed.URL, &feed.Type, &category, &title, &description, &language, &siteURL, - &feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate, + &feed.DiscoveredAt, &lastCheckedAt, &nextCheckAt, &lastBuildDate, &etag, &lastModified, &feed.Status, &lastError, &lastErrorAt, &sourceURL, &sourceHost, &tld, @@ -243,8 +243,8 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) { feed.Description = StringValue(description) 
feed.Language = StringValue(language) feed.SiteURL = StringValue(siteURL) - feed.LastCrawledAt = TimeValue(lastCrawledAt) - feed.NextCrawlAt = TimeValue(nextCrawlAt) + feed.LastCheckedAt = TimeValue(lastCheckedAt) + feed.NextCheckAt = TimeValue(nextCheckAt) feed.LastBuildDate = TimeValue(lastBuildDate) feed.ETag = StringValue(etag) feed.LastModified = StringValue(lastModified) @@ -282,7 +282,7 @@ func (c *Crawler) feedExists(feedURL string) bool { func (c *Crawler) GetAllFeeds() ([]*Feed, error) { rows, err := c.db.Query(` SELECT url, type, category, title, description, language, site_url, - discovered_at, last_crawled_at, next_crawl_at, last_build_date, + discovered_at, last_checked_at, next_check_at, last_build_date, etag, last_modified, status, last_error, last_error_at, source_url, source_host, tld, @@ -313,11 +313,11 @@ func (c *Crawler) GetFeedCountByHost(host string) (int, error) { return count, err } -// GetFeedsDueForCheck returns feeds where next_crawl_at <= now, ordered by no_update desc (prioritize infrequent feeds) +// GetFeedsDueForCheck returns feeds for feed_check, ordered by last_checked_at ASC (oldest first) func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) { rows, err := c.db.Query(` SELECT url, type, category, title, description, language, site_url, - discovered_at, last_crawled_at, next_crawl_at, last_build_date, + discovered_at, last_checked_at, next_check_at, last_build_date, etag, last_modified, status, last_error, last_error_at, source_url, source_host, tld, @@ -325,8 +325,8 @@ func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) { no_update, publish_status, publish_account FROM feeds - WHERE next_crawl_at <= NOW() AND status = 'pass' - ORDER BY no_update DESC + WHERE last_checked_at > '0001-01-01 00:00:00' AND status = 'pass' + ORDER BY last_checked_at ASC LIMIT $1 `, limit) if err != nil { @@ -341,7 +341,7 @@ func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) { func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) { rows, err := c.db.Query(` SELECT url, type, category, title, description, language, site_url, - discovered_at, last_crawled_at, next_crawl_at, last_build_date, + discovered_at, last_checked_at, next_check_at, last_build_date, etag, last_modified, status, last_error, last_error_at, source_url, source_host, tld, @@ -363,7 +363,7 @@ func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) { tsquery := ToSearchQuery(query) rows, err := c.db.Query(` SELECT url, type, category, title, description, language, site_url, - discovered_at, last_crawled_at, next_crawl_at, last_build_date, + discovered_at, last_checked_at, next_check_at, last_build_date, etag, last_modified, status, last_error, last_error_at, source_url, source_host, tld, @@ -389,7 +389,7 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) { for rows.Next() { feed := &Feed{} var feedType, category, title, description, language, siteURL *string - var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time + var lastCheckedAt, nextCheckAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time var etag, lastModified, lastError, sourceURL, sourceHost, tld *string var itemCount, noUpdate *int var status *string @@ -397,7 +397,7 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) { if err := rows.Scan( &feed.URL, &feedType, &category, &title, &description, &language, &siteURL, - &feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate, + &feed.DiscoveredAt, &lastCheckedAt, &nextCheckAt, 
&lastBuildDate, &etag, &lastModified, &status, &lastError, &lastErrorAt, &sourceURL, &sourceHost, &tld, @@ -419,8 +419,8 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) { feed.Description = StringValue(description) feed.Language = StringValue(language) feed.SiteURL = StringValue(siteURL) - feed.LastCrawledAt = TimeValue(lastCrawledAt) - feed.NextCrawlAt = TimeValue(nextCrawlAt) + feed.LastCheckedAt = TimeValue(lastCheckedAt) + feed.NextCheckAt = TimeValue(nextCheckAt) feed.LastBuildDate = TimeValue(lastBuildDate) feed.ETag = StringValue(etag) feed.LastModified = StringValue(lastModified) @@ -471,7 +471,7 @@ func (c *Crawler) SetPublishStatus(feedURL, status, account string) error { func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) { rows, err := c.db.Query(` SELECT url, type, category, title, description, language, site_url, - discovered_at, last_crawled_at, next_crawl_at, last_build_date, + discovered_at, last_checked_at, next_check_at, last_build_date, etag, last_modified, status, last_error, last_error_at, source_url, source_host, tld, @@ -493,7 +493,7 @@ func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) { func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) { rows, err := c.db.Query(` SELECT url, type, category, title, description, language, site_url, - discovered_at, last_crawled_at, next_crawl_at, last_build_date, + discovered_at, last_checked_at, next_check_at, last_build_date, etag, last_modified, status, last_error, last_error_at, source_url, source_host, tld, diff --git a/feed_check.go b/feed_check.go index 92232fa..742430e 100644 --- a/feed_check.go +++ b/feed_check.go @@ -31,7 +31,7 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea Type: feedType, Category: classifyFeed(feedURL), DiscoveredAt: now, - LastCrawledAt: now, + LastCheckedAt: now, Status: "pass", SourceHost: sourceHost, TLD: getTLD(sourceHost), @@ -53,8 +53,8 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea // Refine category based on parsed title (e.g., "Comments on:") feed.Category = classifyFeedByTitle(feed.Title, feed.Category) - // Calculate next crawl time - feed.NextCrawlAt = c.calculateNextCrawl(feed) + // Calculate next feed_check time + feed.NextCheckAt = c.calculateNextCheck(feed) if err := c.saveFeed(feed); err != nil { return @@ -92,7 +92,7 @@ func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) { SourceURL: normalizeURL(sourceURL), SourceHost: sourceHost, TLD: getTLD(sourceHost), - NextCrawlAt: now, // Should be crawled immediately + NextCheckAt: now, // Should be crawled immediately } if err := c.saveFeed(feed); err != nil { @@ -148,9 +148,9 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) { err = fmt.Errorf("all URL variants failed") } now := time.Now() - feed.LastCrawledAt = now + feed.LastCheckedAt = now feed.NoUpdate++ - feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second) + feed.NextCheckAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second) feed.LastError = err.Error() feed.LastErrorAt = now feed.Status = "hold" @@ -165,13 +165,13 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) { defer resp.Body.Close() now := time.Now() - feed.LastCrawledAt = now + feed.LastCheckedAt = now // 304 Not Modified - feed hasn't changed if resp.StatusCode == http.StatusNotModified { feed.NoUpdate++ // Adaptive backoff: 100s base + 100s per consecutive no-change - feed.NextCrawlAt = 
now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second) + feed.NextCheckAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second) feed.LastError = "" feed.Status = "pass" // Auto-hold feeds after 1000 consecutive no-changes @@ -186,7 +186,7 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) { // Non-200 response if resp.StatusCode != http.StatusOK { feed.NoUpdate++ - feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second) + feed.NextCheckAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second) feed.LastError = resp.Status feed.LastErrorAt = now feed.Status = "hold" @@ -203,7 +203,7 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) { bodyBytes, err := io.ReadAll(resp.Body) if err != nil { feed.NoUpdate++ - feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second) + feed.NextCheckAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second) feed.LastError = err.Error() feed.LastErrorAt = now feed.Status = "hold" @@ -238,7 +238,7 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) { // Content changed - reset backoff feed.NoUpdate = 0 - feed.NextCrawlAt = now.Add(100 * time.Second) + feed.NextCheckAt = now.Add(100 * time.Second) feed.LastError = "" feed.Status = "pass" c.saveFeed(feed) diff --git a/parser.go b/parser.go index 02b2805..3347ee5 100644 --- a/parser.go +++ b/parser.go @@ -386,8 +386,8 @@ func parseRSSDate(s string) (time.Time, error) { return time.Time{}, fmt.Errorf("unable to parse date: %s", s) } -// calculateNextCrawl determines when to next crawl this feed -func (c *Crawler) calculateNextCrawl(feed *Feed) time.Time { +// calculateNextCheck determines when to next check this feed (feed_check) +func (c *Crawler) calculateNextCheck(feed *Feed) time.Time { // Adaptive backoff: 100s base + 100s per consecutive no-change return time.Now().Add(time.Duration(100+100*feed.NoUpdate) * time.Second) } diff --git a/routes.go b/routes.go index 67a556f..3ae2e0b 100644 --- a/routes.go +++ b/routes.go @@ -109,6 +109,9 @@ func (c *Crawler) StartDashboard(addr string) error { http.HandleFunc("/api/tlds", withAuth(func(w http.ResponseWriter, r *http.Request) { c.handleAPITLDs(w, r) })) + http.HandleFunc("/api/searchStats", withAuth(func(w http.ResponseWriter, r *http.Request) { + c.handleAPISearchStats(w, r) + })) http.HandleFunc("/api/tldDomains", withAuth(func(w http.ResponseWriter, r *http.Request) { c.handleAPITLDDomains(w, r) })) diff --git a/scripts/deploy.sh b/scripts/deploy.sh deleted file mode 100755 index 3deb0ff..0000000 --- a/scripts/deploy.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -# Deploy script - increments version, commits, pushes, and relaunches container -# Usage: ./scripts/deploy.sh [optional commit message] - -set -e - -cd "$(dirname "$0")/.." 
-
-# Extract current version number from templates.go
-CURRENT=$(grep -o '>v[0-9]*<' templates.go | grep -o '[0-9]*' | head -1)
-
-if [ -z "$CURRENT" ]; then
-    echo "Could not find version number in templates.go"
-    exit 1
-fi
-
-# Increment version
-NEW=$((CURRENT + 1))
-
-# Update templates.go
-sed -i '' "s/>v${CURRENT}</>v${NEW}</" templates.go
-echo "Updated templates.go -> v${NEW}"
-
-# Build commit message
-if [ -n "$1" ]; then
-    COMMIT_MSG="v${NEW}: $1"
-else
-    COMMIT_MSG="v${NEW}"
-fi
-
-# Commit and push
-git add -A
-git commit -m "$COMMIT_MSG"
-git push
-
-echo "Committed: $COMMIT_MSG"
-
-# Rebuild and relaunch
-docker compose up -d --build
-
-echo "Deployed v${NEW}"
diff --git a/static/dashboard.js b/static/dashboard.js
index 88ad78c..9d58a11 100644
--- a/static/dashboard.js
+++ b/static/dashboard.js
@@ -14,6 +14,142 @@ function initDashboard() {
     let infiniteScrollState = null;
     let isLoadingMore = false;
     let searchQuery = '';
+    let domainFilter = 'all'; // all, pass, skip, hold, dead
+    // Feed filter: multi-select with ALL as exclusion toggle
+    // When allSelected=true, selected items are EXCLUDED; when false, selected items are INCLUDED
+    let feedFilter = { allSelected: false, statuses: [], types: [] };
+    let currentOpenTLD = null; // Track which TLD is currently open
+
+    // Smart sticky header - scroll normally, show fixed on scroll up
+    let lastScrollY = 0;
+    const topSection = document.getElementById('topSection');
+    const spacer = document.getElementById('topSectionSpacer');
+    let headerHeight = topSection.offsetHeight;
+    let isFixed = false;
+
+    window.addEventListener('scroll', () => {
+        const currentScrollY = window.scrollY;
+
+        // If at top, return to normal flow
+        if (currentScrollY <= 0) {
+            topSection.classList.remove('fixed', 'hidden');
+            spacer.classList.remove('active');
+            isFixed = false;
+            lastScrollY = currentScrollY;
+            return;
+        }
+
+        // Only activate fixed mode after scrolling past the header
+        if (currentScrollY > headerHeight) {
+            if (currentScrollY < lastScrollY) {
+                // Scrolling up - show fixed header
+                if (!isFixed) {
+                    spacer.style.height = headerHeight + 'px';
+                    spacer.classList.add('active');
+                    topSection.classList.add('fixed');
+                    // Start hidden, then show
+                    topSection.classList.add('hidden');
+                    requestAnimationFrame(() => {
+                        topSection.classList.remove('hidden');
+                    });
+                    isFixed = true;
+                } else {
+                    topSection.classList.remove('hidden');
+                }
+            } else if (currentScrollY > lastScrollY && isFixed) {
+                // Scrolling down while fixed - hide it
+                topSection.classList.add('hidden');
+            }
+        }
+
+        lastScrollY = currentScrollY;
+    }, { passive: true });
+
+    // Stat card click handler
+    document.addEventListener('click', (e) => {
+        const card = e.target.closest('.card.clickable');
+        if (!card) return;
+
+        const filterType = card.dataset.filter;
+        const status = card.dataset.status;
+        const type = card.dataset.type;
+
+        if (filterType === 'domain') {
+            // Remove active from domain cards only
+            document.querySelectorAll('.card.clickable[data-filter="domain"]').forEach(c => c.classList.remove('active'));
+            card.classList.add('active');
+            domainFilter = status || 'all';
+
+            // Update placeholder
+            const searchInput = document.getElementById('searchInput');
+            searchInput.placeholder = domainFilter === 'all' ? 'Search domains...'
: `Showing ${domainFilter} domains...`; + + // Reload TLD list with new filter + loadFeeds(searchQuery); + } else if (filterType === 'feed') { + const wasActive = card.classList.contains('active'); + + if (status === 'all') { + // ALL card toggles exclusion mode + if (wasActive) { + card.classList.remove('active'); + feedFilter.allSelected = false; + } else { + card.classList.add('active'); + feedFilter.allSelected = true; + } + } else if (status) { + // Status card (pass, skip, hold, dead) - multi-select + if (wasActive) { + card.classList.remove('active'); + feedFilter.statuses = feedFilter.statuses.filter(s => s !== status); + } else { + card.classList.add('active'); + feedFilter.statuses.push(status); + } + } else if (type) { + // Type card (rss, atom, json, unknown, empty) - multi-select + if (wasActive) { + card.classList.remove('active'); + feedFilter.types = feedFilter.types.filter(t => t !== type); + } else { + card.classList.add('active'); + feedFilter.types.push(type); + } + } + + // Reload TLD list with feed filter + loadFeeds(searchQuery); + } + }); + + // Refresh only expanded TLD sections with new domain filter + function refreshExpandedTLDs() { + const expandedContainer = document.getElementById('expandedTLDContent'); + if (expandedContainer && expandedContainer.style.display !== 'none' && expandedContainer.dataset.tld) { + // Mark as needing reload and reload + expandedContainer.dataset.loaded = 'false'; + loadTLDDomains(expandedContainer, searchQuery); + } + } + + // Apply feed filter to currently visible feeds + function applyFeedFilter() { + document.querySelectorAll('.inline-feed-block').forEach(block => { + const feedStatus = block.dataset.status || 'hold'; + const feedType = block.dataset.type || 'unknown'; + + let show = true; + if (feedFilter.status !== 'all' && feedStatus !== feedFilter.status) { + show = false; + } + if (feedFilter.type && feedType !== feedFilter.type) { + show = false; + } + + block.style.display = show ? 'block' : 'none'; + }); + } // Event delegation for domain-spacer clicks (toggle feeds) document.addEventListener('click', (e) => { @@ -96,8 +232,8 @@ function initDashboard() { ['Oldest Item', f.oldestItemDate], ['Newest Item', f.newestItemDate], ['Discovered', f.discoveredAt], - ['Last Crawled', f.lastCrawledAt], - ['Next Crawl', f.nextCrawlAt], + ['Last Checked', f.lastCheckedAt], + ['Next Check', f.nextCheckAt], ['Publish Status', f.publishStatus], ['Publish Account', f.publishAccount], ]; @@ -122,7 +258,8 @@ function initDashboard() { const items = await resp.json(); if (!items || items.length === 0) { - itemsDiv.innerHTML = 'No items'; + // Just clear the items area, keep the feed visible + itemsDiv.innerHTML = ''; return; } @@ -173,7 +310,6 @@ function initDashboard() { function renderTLDHeader(tld) { return `
-      .${escapeHtml(tld)}
     `;
@@ -192,45 +328,163 @@ function initDashboard() {
     }
   }
-  // Event delegation for TLD header/footer clicks (toggle section)
+  // Event delegation for TLD clicks (toggle section)
   document.addEventListener('click', (e) => {
     const tldHeader = e.target.closest('.tld-header');
     const tldFooter = e.target.closest('.tld-footer');
+    const expandedContainer = document.getElementById('expandedTLDContent');
+
+    // Handle clicks in expanded container header
+    if (tldHeader && tldHeader.closest('#expandedTLDContent')) {
+      // Close the expanded content
+      const currentSection = document.querySelector('.tld-section.expanded');
+      if (currentSection) {
+        currentSection.classList.remove('expanded');
+      }
+      expandedContainer.style.display = 'none';
+      expandedContainer.innerHTML = '';
+      currentOpenTLD = null;
+      // Show TLD list again
+      const domainList = document.querySelector('.domain-list');
+      if (domainList) domainList.style.display = '';
+      updateStats(); // Revert to search or all stats
+      return;
+    }
+
+    // Handle clicks on TLD cards
     if (tldHeader || tldFooter) {
       const section = (tldHeader || tldFooter).closest('.tld-section');
       if (section) {
-        const content = section.querySelector('.tld-content');
-        const toggle = section.querySelector('.tld-toggle');
-        if (content) {
-          const isVisible = content.style.display !== 'none';
-          content.style.display = isVisible ? 'none' : 'block';
-          if (toggle) toggle.textContent = isVisible ? '▶' : '▼';
+        const tld = section.dataset.tld;
+        const isExpanded = section.classList.contains('expanded');
 
-          if (isVisible) {
-            // Closing - scroll to next TLD section
-            const nextSection = section.nextElementSibling;
-            if (nextSection && nextSection.classList.contains('tld-section')) {
-              nextSection.scrollIntoView({ behavior: 'smooth', block: 'start' });
-            }
-          } else {
-            // Opening - load domains if not already loaded
-            if (section.dataset.loaded === 'false') {
-              loadTLDDomains(section, searchQuery);
-            }
-          }
+        if (isExpanded) {
+          // Closing this TLD
+          section.classList.remove('expanded');
+          expandedContainer.style.display = 'none';
+          expandedContainer.innerHTML = '';
+          currentOpenTLD = null;
+          // Show TLD list again
+          const domainList = document.querySelector('.domain-list');
+          if (domainList) domainList.style.display = '';
+          updateStats(); // Revert to search or all stats
+        } else {
+          // Close any other open TLD first
+          document.querySelectorAll('.tld-section.expanded').forEach(s => {
+            s.classList.remove('expanded');
+          });
+
+          // Opening this TLD
+          section.classList.add('expanded');
+          currentOpenTLD = tld;
+          // Hide TLD list
+          const domainList = document.querySelector('.domain-list');
+          if (domainList) domainList.style.display = 'none';
+          // Show TLD stats (filtered by search if active)
+          const currentSearch = document.getElementById('searchInput').value.trim();
+          updateStatsForTLD(tld, currentSearch);
+
+          // Set up expanded container with header
+          expandedContainer.innerHTML = `
+
+            .${escapeHtml(tld)}
+
+
+            Loading...
+
+          `;
+          expandedContainer.style.display = 'block';
+          expandedContainer.dataset.tld = tld;
+          expandedContainer.dataset.loaded = 'false';
+
+          // Load domains
+          loadTLDDomains(expandedContainer, searchQuery);
+
+          // Scroll to expanded container
+          expandedContainer.scrollIntoView({ behavior: 'smooth', block: 'start' });
+        }
       }
     }
   });
+  // Update stats for a specific TLD (optionally filtered by search)
+  async function updateStatsForTLD(tld, search = '') {
+    try {
+      let url = `/api/tldStats?tld=${encodeURIComponent(tld)}`;
+      if (search) {
+        url += `&search=${encodeURIComponent(search)}`;
+      }
+      const resp = await fetch(url);
+      if (!resp.ok) return;
+      const stats = await resp.json();
+
+      document.getElementById('totalDomains').textContent = commaFormat(stats.total_domains || 0);
+      document.getElementById('passDomains').textContent = commaFormat(stats.pass_domains || 0);
+      document.getElementById('skipDomains').textContent = commaFormat(stats.skip_domains || 0);
+      document.getElementById('holdDomains').textContent = commaFormat(stats.hold_domains || 0);
+      document.getElementById('deadDomains').textContent = commaFormat(stats.dead_domains || 0);
+
+      document.getElementById('totalFeeds').textContent = commaFormat(stats.total_feeds || 0);
+      document.getElementById('aliveFeeds').textContent = commaFormat(stats.alive_feeds || 0);
+      document.getElementById('publishFeeds').textContent = commaFormat(stats.publish_feeds || 0);
+      document.getElementById('skipFeeds').textContent = commaFormat(stats.skip_feeds || 0);
+      document.getElementById('holdFeeds').textContent = commaFormat(stats.hold_feeds || 0);
+      document.getElementById('deadFeeds').textContent = commaFormat(stats.dead_feeds || 0);
+      document.getElementById('emptyFeeds').textContent = commaFormat(stats.empty_feeds || 0);
+      document.getElementById('rssFeeds').textContent = commaFormat(stats.rss_feeds || 0);
+      document.getElementById('atomFeeds').textContent = commaFormat(stats.atom_feeds || 0);
+      document.getElementById('jsonFeeds').textContent = commaFormat(stats.json_feeds || 0);
+      document.getElementById('unknownFeeds').textContent = commaFormat(stats.unknown_feeds || 0);
+
+      document.getElementById('updatedAt').textContent = search ?
`Search "${search}" in .${tld}` : `Stats for .${tld}`; + } catch (err) { + console.error('TLD stats update failed:', err); + } + } + + // Update stats for search results + async function updateStatsForSearch(query) { + try { + const resp = await fetch(`/api/searchStats?search=${encodeURIComponent(query)}`); + if (!resp.ok) { + console.error('Search stats failed:', resp.status); + return; + } + const stats = await resp.json(); + + document.getElementById('totalDomains').textContent = commaFormat(stats.total_domains || 0); + document.getElementById('passDomains').textContent = commaFormat(stats.pass_domains || 0); + document.getElementById('skipDomains').textContent = commaFormat(stats.skip_domains || 0); + document.getElementById('holdDomains').textContent = commaFormat(stats.hold_domains || 0); + document.getElementById('deadDomains').textContent = commaFormat(stats.dead_domains || 0); + + document.getElementById('totalFeeds').textContent = commaFormat(stats.total_feeds || 0); + document.getElementById('aliveFeeds').textContent = commaFormat(stats.alive_feeds || 0); + document.getElementById('publishFeeds').textContent = commaFormat(stats.publish_feeds || 0); + document.getElementById('skipFeeds').textContent = commaFormat(stats.skip_feeds || 0); + document.getElementById('holdFeeds').textContent = commaFormat(stats.hold_feeds || 0); + document.getElementById('deadFeeds').textContent = commaFormat(stats.dead_feeds || 0); + document.getElementById('emptyFeeds').textContent = commaFormat(stats.empty_feeds || 0); + document.getElementById('rssFeeds').textContent = commaFormat(stats.rss_feeds || 0); + document.getElementById('atomFeeds').textContent = commaFormat(stats.atom_feeds || 0); + document.getElementById('jsonFeeds').textContent = commaFormat(stats.json_feeds || 0); + document.getElementById('unknownFeeds').textContent = commaFormat(stats.unknown_feeds || 0); + + document.getElementById('updatedAt').textContent = `Search: "${query}"`; + } catch (err) { + console.error('Search stats update failed:', err); + } + } + // Render domain row with feeds function renderDomainRow(d) { const status = d.status || 'hold'; - let html = `
`; + const fullDomain = d.tld ? d.host + '.' + d.tld : d.host; + let html = `
`; html += `
`; - html += renderStatusBtns(status, 'domain', d.host); - html += `${escapeHtml(d.host)}`; + html += renderStatusBtns(status, 'domain', fullDomain); + html += `${escapeHtml(fullDomain)}`; if (d.last_error) { html += `${escapeHtml(d.last_error)}`; @@ -244,7 +498,8 @@ function initDashboard() { html += '
'; d.feeds.forEach(f => { const feedStatus = f.publish_status || 'hold'; - html += `
`; + const feedType = f.type || 'unknown'; + html += `
`; html += `
`; html += `${escapeHtml(f.language || '')} `; @@ -341,7 +596,7 @@ function initDashboard() { async function loadFeeds(query = '') { const output = document.getElementById('output'); - output.innerHTML = '
Loading TLDs...
'; + output.innerHTML = '
Loading TLDs...
'; // Disconnect previous observer if any if (tldObserver) { @@ -349,26 +604,59 @@ function initDashboard() { } try { - // Fetch all TLDs first - const tldsResp = await fetch('/api/tlds?has_feeds=true'); + // Fetch TLDs with optional domain status filter, feed filter, and search + let tldsUrl = '/api/tlds'; + const params = []; + if (domainFilter !== 'all') { + params.push(`status=${domainFilter}`); + } + // Add feed filter params if any are selected + if (feedFilter.allSelected || feedFilter.statuses.length > 0 || feedFilter.types.length > 0) { + if (feedFilter.allSelected) { + params.push('feedMode=exclude'); + } else { + params.push('feedMode=include'); + } + if (feedFilter.statuses.length > 0) { + params.push(`feedStatuses=${feedFilter.statuses.join(',')}`); + } + if (feedFilter.types.length > 0) { + params.push(`feedTypes=${feedFilter.types.join(',')}`); + } + } + if (query) { + params.push(`search=${encodeURIComponent(query)}`); + } + if (params.length > 0) { + tldsUrl += '?' + params.join('&'); + } + const tldsResp = await fetch(tldsUrl); + if (!tldsResp.ok) { + const errText = await tldsResp.text(); + throw new Error(`HTTP ${tldsResp.status}: ${errText}`); + } const tlds = await tldsResp.json(); if (!tlds || tlds.length === 0) { - document.getElementById('infiniteLoader').textContent = 'No feeds found'; + // Update stats for empty results + if (query) { + await updateStatsForSearch(query); + } else { + await updateStats(); + } + document.getElementById('infiniteLoader').textContent = query ? 'No matches found' : 'No feeds found'; return; } const container = output.querySelector('.domain-list'); - // Render all TLD sections as collapsed placeholders + // Render all TLD sections as card placeholders tlds.forEach(t => { const tld = t.tld || 'unknown'; container.insertAdjacentHTML('beforeend', `
-
-
-          .${escapeHtml(tld)}
-          (${t.domain_count} domains)
+
+          .${escapeHtml(tld)}