diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..2283752 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,8 @@ +1440.news +1440.db +feeds/ +*.gz +.git +.gitignore +.claude +CLAUDE.md diff --git a/.gitignore b/.gitignore index 460c35d..ef46060 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ go.* *.gz feeds/ feeds.db/ +1440.db diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..c6d849c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,37 @@ +FROM golang:1.24-alpine AS builder + +WORKDIR /app + +# Install build dependencies +RUN apk add --no-cache gcc musl-dev + +# Copy go mod files first for layer caching +COPY go.mod go.sum ./ +RUN go mod download + +# Copy source code +COPY *.go ./ +COPY static/ ./static/ + +# Build the binary +RUN CGO_ENABLED=1 go build -o 1440.news . + +# Runtime stage +FROM alpine:latest + +WORKDIR /app + +# Install runtime dependencies +RUN apk add --no-cache ca-certificates tzdata + +# Copy binary from builder +COPY --from=builder /app/1440.news . +COPY --from=builder /app/static ./static + +# Create feeds directory +RUN mkdir -p feeds + +# Expose dashboard port +EXPOSE 4321 + +CMD ["./1440.news"] diff --git a/crawler.go b/crawler.go index 215179c..1b54c09 100644 --- a/crawler.go +++ b/crawler.go @@ -1,9 +1,9 @@ package main import ( + "database/sql" "fmt" "io" - "math/rand" "net/http" "runtime" "strings" @@ -11,26 +11,33 @@ import ( "sync/atomic" "time" - "github.com/cockroachdb/pebble" "golang.org/x/net/html" ) type Crawler struct { - MaxDepth int - MaxPagesPerHost int - Timeout time.Duration - UserAgent string - visited sync.Map - feedsMu sync.Mutex - client *http.Client - hostsProcessed int32 - db *pebble.DB + MaxDepth int + MaxPagesPerHost int + Timeout time.Duration + UserAgent string + visited sync.Map + feedsMu sync.Mutex + client *http.Client + hostsProcessed int32 + feedsChecked int32 + startTime time.Time + db *sql.DB + displayedCrawlRate int + displayedCheckRate int + domainsImported int32 + cachedStats *DashboardStats + cachedAllDomains []DomainStat + statsMu sync.RWMutex } func NewCrawler(dbPath string) (*Crawler, error) { - db, err := pebble.Open(dbPath, &pebble.Options{}) + db, err := OpenDatabase(dbPath) if err != nil { - return nil, fmt.Errorf("failed to open pebble db: %v", err) + return nil, fmt.Errorf("failed to open database: %v", err) } return &Crawler{ @@ -38,6 +45,7 @@ func NewCrawler(dbPath string) (*Crawler, error) { MaxPagesPerHost: 10, Timeout: 10 * time.Second, UserAgent: "FeedCrawler/1.0", + startTime: time.Now(), db: db, client: &http.Client{ Timeout: 10 * time.Second, @@ -58,87 +66,121 @@ func (c *Crawler) Close() error { return nil } -// CrawlUncrawledDomains fetches uncrawled domains and crawls them -func (c *Crawler) CrawlUncrawledDomains() error { - domains, err := c.GetUncrawledDomains() - if err != nil { - return fmt.Errorf("failed to get uncrawled domains: %v", err) +// StartStatsLoop updates cached stats once per minute +func (c *Crawler) StartStatsLoop() { + for { + c.UpdateStats() + time.Sleep(1 * time.Minute) } +} - if len(domains) == 0 { - return nil +// StartCleanupLoop runs item cleanup once per week +func (c *Crawler) StartCleanupLoop() { + for { + deleted, err := c.CleanupOldItems() + if err != nil { + fmt.Printf("Cleanup error: %v\n", err) + } else if deleted > 0 { + fmt.Printf("Cleanup: removed %d old items\n", deleted) + } + time.Sleep(7 * 24 * time.Hour) } +} - // Shuffle for randomized crawling - rand.Shuffle(len(domains), func(i, j int) { - domains[i], domains[j] = 
domains[j], domains[i] - }) - - numWorkers := runtime.NumCPU() - 1 +// StartCrawlLoop runs the domain crawling loop independently +func (c *Crawler) StartCrawlLoop() { + numWorkers := runtime.NumCPU() if numWorkers < 1 { numWorkers = 1 } - type crawlResult struct { - host string - feedsFound int - lastError string - } - - domainChan := make(chan *Domain, numWorkers*2) - resultChan := make(chan crawlResult, numWorkers*2) - var wg sync.WaitGroup + // Buffered channel for domain work + workChan := make(chan *Domain, 256) // Start workers for i := 0; i < numWorkers; i++ { - wg.Add(1) go func() { - defer wg.Done() - for domain := range domainChan { + for domain := range workChan { feedsFound, crawlErr := c.crawlHost(domain.Host) errStr := "" if crawlErr != nil { errStr = crawlErr.Error() } - resultChan <- crawlResult{ - host: domain.Host, - feedsFound: feedsFound, - lastError: errStr, + if err := c.markDomainCrawled(domain.Host, feedsFound, errStr); err != nil { + fmt.Printf("Error marking domain %s as crawled: %v\n", domain.Host, err) } } }() } - // Start result processor - done := make(chan bool) - go func() { - for result := range resultChan { - if err := c.markDomainCrawled(result.host, result.feedsFound, result.lastError); err != nil { - fmt.Printf("Error marking domain %s as crawled: %v\n", result.host, err) - } + const fetchSize = 100 + for { + domains, err := c.GetUncheckedDomainsRandom(fetchSize) + if err != nil { + fmt.Printf("Error fetching domains: %v\n", err) } - done <- true - }() - // Send domains to workers - for _, domain := range domains { - domainChan <- domain + if len(domains) == 0 { + c.displayedCrawlRate = 0 + time.Sleep(1 * time.Second) + continue + } + + fmt.Printf("%s crawl: %d domains to check\n", time.Now().Format("15:04:05"), len(domains)) + + for _, domain := range domains { + workChan <- domain + } + + time.Sleep(1 * time.Second) + } +} + +// StartCheckLoop runs the feed checking loop independently +func (c *Crawler) StartCheckLoop() { + numWorkers := runtime.NumCPU() + if numWorkers < 1 { + numWorkers = 1 } - close(domainChan) - wg.Wait() - close(resultChan) - <-done + // Buffered channel for feed work + workChan := make(chan *Feed, 256) - return nil + // Start workers + for i := 0; i < numWorkers; i++ { + go func() { + for feed := range workChan { + c.CheckFeed(feed) + } + }() + } + + const fetchSize = 100 + for { + feeds, err := c.GetFeedsDueForCheck(fetchSize) + if err != nil { + fmt.Printf("Error fetching feeds: %v\n", err) + } + + if len(feeds) == 0 { + c.displayedCheckRate = 0 + time.Sleep(1 * time.Second) + continue + } + + fmt.Printf("%s check: %d feeds to check\n", time.Now().Format("15:04:05"), len(feeds)) + + for _, feed := range feeds { + workChan <- feed + } + + time.Sleep(1 * time.Second) + } } func (c *Crawler) crawlHost(host string) (feedsFound int, err error) { atomic.AddInt32(&c.hostsProcessed, 1) - // Count feeds before crawling - initialCount, _ := c.GetFeedCount() - localVisited := make(map[string]bool) pagesVisited := 0 @@ -148,9 +190,8 @@ func (c *Crawler) crawlHost(host string) (feedsFound int, err error) { c.crawlPage("http://"+host, host, 0, localVisited, &pagesVisited) } - // Count feeds after crawling - finalCount, _ := c.GetFeedCount() - feedsFound = finalCount - initialCount + // Count feeds found for this specific host + feedsFound, _ = c.GetFeedCountByHost(host) if pagesVisited == 0 { return feedsFound, fmt.Errorf("could not connect") diff --git a/dashboard.go b/dashboard.go index a362b2d..99d1187 100644 --- a/dashboard.go +++ 
b/dashboard.go @@ -1,21 +1,20 @@ package main import ( + "database/sql" "encoding/json" "fmt" "html/template" "net/http" - "sort" "time" ) // DashboardStats holds all statistics for the dashboard type DashboardStats struct { // Domain stats - TotalDomains int `json:"total_domains"` - CrawledDomains int `json:"crawled_domains"` - UncrawledDomains int `json:"uncrawled_domains"` - ErrorDomains int `json:"error_domains"` + TotalDomains int `json:"total_domains"` + CheckedDomains int `json:"checked_domains"` + UncheckedDomains int `json:"unchecked_domains"` // Feed stats TotalFeeds int `json:"total_feeds"` @@ -25,16 +24,8 @@ type DashboardStats struct { // Crawl progress HostsProcessed int32 `json:"hosts_processed"` - CrawlRate float64 `json:"crawl_rate"` // domains per minute - - // Top TLDs by feed count - TopTLDs []TLDStat `json:"top_tlds"` - - // Recent feeds - RecentFeeds []RecentFeed `json:"recent_feeds"` - - // Top domains by feed count - TopDomains []DomainStat `json:"top_domains"` + CrawlRate int `json:"crawl_rate"` // crawls per minute + CheckRate int `json:"check_rate"` // feed checks per minute // Timing UpdatedAt time.Time `json:"updated_at"` @@ -57,13 +48,107 @@ type DomainStat struct { FeedsFound int `json:"feeds_found"` } -// GetDashboardStats collects all statistics for the dashboard +// commaFormat formats an integer with comma separators +func commaFormat(n int) string { + s := fmt.Sprintf("%d", n) + if len(s) <= 3 { + return s + } + var result []byte + for i, c := range s { + if i > 0 && (len(s)-i)%3 == 0 { + result = append(result, ',') + } + result = append(result, byte(c)) + } + return string(result) +} + +// UpdateStats recalculates and caches dashboard statistics +func (c *Crawler) UpdateStats() { + fmt.Println("UpdateStats: calculating stats...") + stats, err := c.calculateStats() + if err != nil { + fmt.Printf("UpdateStats: error calculating stats: %v\n", err) + return + } + // Cache all domains with feeds (runs in background, so slow query is OK) + fmt.Println("UpdateStats: fetching all domains...") + allDomains := c.fetchAllDomainsFromDB() + fmt.Printf("UpdateStats: got %d domains\n", len(allDomains)) + + c.statsMu.Lock() + c.cachedStats = stats + c.cachedAllDomains = allDomains + c.statsMu.Unlock() + fmt.Println("UpdateStats: complete") +} + +func (c *Crawler) fetchAllDomainsFromDB() []DomainStat { + rows, err := c.db.Query(` + SELECT tld, sourceHost, COUNT(*) as cnt FROM feeds + GROUP BY tld, sourceHost + ORDER BY tld, sourceHost + `) + if err != nil { + fmt.Printf("fetchAllDomainsFromDB error: %v\n", err) + return nil + } + defer rows.Close() + + var domains []DomainStat + for rows.Next() { + var ds DomainStat + var tld string + if err := rows.Scan(&tld, &ds.Host, &ds.FeedsFound); err != nil { + continue + } + domains = append(domains, ds) + } + return domains +} + +// GetDashboardStats returns cached statistics (returns empty stats if not yet cached) func (c *Crawler) GetDashboardStats() (*DashboardStats, error) { + c.statsMu.RLock() + stats := c.cachedStats + c.statsMu.RUnlock() + + if stats != nil { + return stats, nil + } + // Return empty stats while background calculation runs (don't block HTTP requests) + return &DashboardStats{UpdatedAt: time.Now()}, nil +} + +// calculateStats collects all statistics for the dashboard +func (c *Crawler) calculateStats() (*DashboardStats, error) { stats := &DashboardStats{ - UpdatedAt: time.Now(), + UpdatedAt: time.Now(), HostsProcessed: c.hostsProcessed, } + // Calculate crawl rate (crawls per minute), smoothed by +/-1 
per update + elapsed := time.Since(c.startTime).Minutes() + if elapsed > 0 { + actualRate := int(float64(c.hostsProcessed) / elapsed) + if actualRate > c.displayedCrawlRate { + c.displayedCrawlRate++ + } else if actualRate < c.displayedCrawlRate { + c.displayedCrawlRate-- + } + stats.CrawlRate = c.displayedCrawlRate + + // Calculate check rate (feed checks per minute), smoothed by +/-1 per update + actualCheckRate := int(float64(c.feedsChecked) / elapsed) + if actualCheckRate > c.displayedCheckRate { + c.displayedCheckRate++ + } else if actualCheckRate < c.displayedCheckRate { + c.displayedCheckRate-- + } + stats.CheckRate = c.displayedCheckRate + } + // Get domain stats if err := c.collectDomainStats(stats); err != nil { return nil, err @@ -78,148 +163,455 @@ func (c *Crawler) GetDashboardStats() (*DashboardStats, error) { } func (c *Crawler) collectDomainStats(stats *DashboardStats) error { - iter, err := c.db.NewIter(nil) + // Use MAX(rowid) for fast approximate total count + err := c.db.QueryRow("SELECT COALESCE(MAX(rowid), 0) FROM domains").Scan(&stats.TotalDomains) if err != nil { return err } - defer iter.Close() - domainFeeds := make(map[string]int) + // Single query to get all status counts (one index scan instead of three) + rows, err := c.db.Query("SELECT status, COUNT(*) FROM domains GROUP BY status") + if err != nil { + return err + } + defer rows.Close() - for iter.SeekGE([]byte("domain:")); iter.Valid(); iter.Next() { - key := string(iter.Key()) - if len(key) < 7 || key[:7] != "domain:" { - break - } - - var domain Domain - if err := json.Unmarshal(iter.Value(), &domain); err != nil { + for rows.Next() { + var status string + var count int + if err := rows.Scan(&status, &count); err != nil { continue } - - stats.TotalDomains++ - switch domain.Status { - case "crawled": - stats.CrawledDomains++ - if domain.FeedsFound > 0 { - domainFeeds[domain.Host] = domain.FeedsFound - } - case "uncrawled": - stats.UncrawledDomains++ - case "error": - stats.ErrorDomains++ + switch status { + case "checked": + stats.CheckedDomains = count + case "unchecked": + stats.UncheckedDomains = count } } - - // Top domains by feed count - type kv struct { - Host string - Count int - } - var sorted []kv - for h, c := range domainFeeds { - sorted = append(sorted, kv{h, c}) - } - sort.Slice(sorted, func(i, j int) bool { - return sorted[i].Count > sorted[j].Count - }) - for i := 0; i < len(sorted) && i < 10; i++ { - stats.TopDomains = append(stats.TopDomains, DomainStat{ - Host: sorted[i].Host, - FeedsFound: sorted[i].Count, - }) + if err := rows.Err(); err != nil { + return err } - return iter.Error() + return rows.Err() } func (c *Crawler) collectFeedStats(stats *DashboardStats) error { - iter, err := c.db.NewIter(nil) + // Use MAX(rowid) for fast approximate total count + err := c.db.QueryRow("SELECT COALESCE(MAX(rowid), 0) FROM feeds").Scan(&stats.TotalFeeds) if err != nil { return err } - defer iter.Close() - tldCounts := make(map[string]int) - var recentFeeds []RecentFeed + // Single query to get all type counts (one index scan instead of three) + rows, err := c.db.Query("SELECT type, COUNT(*) FROM feeds GROUP BY type") + if err != nil { + return err + } + defer rows.Close() - for iter.SeekGE([]byte("feed:")); iter.Valid(); iter.Next() { - key := string(iter.Key()) - if len(key) < 5 || key[:5] != "feed:" { - break - } - - var feed Feed - if err := json.Unmarshal(iter.Value(), &feed); err != nil { + for rows.Next() { + var feedType sql.NullString + var count int + if err := rows.Scan(&feedType, 
&count); err != nil { continue } - - stats.TotalFeeds++ - switch feed.Type { + switch feedType.String { case "rss": - stats.RSSFeeds++ + stats.RSSFeeds = count case "atom": - stats.AtomFeeds++ + stats.AtomFeeds = count default: - stats.UnknownFeeds++ + stats.UnknownFeeds += count } - - if feed.TLD != "" { - tldCounts[feed.TLD]++ - } - - recentFeeds = append(recentFeeds, RecentFeed{ - URL: feed.URL, - Title: feed.Title, - Type: feed.Type, - DiscoveredAt: feed.DiscoveredAt, - }) } - - // Top TLDs - type kv struct { - TLD string - Count int - } - var sortedTLDs []kv - for t, c := range tldCounts { - sortedTLDs = append(sortedTLDs, kv{t, c}) - } - sort.Slice(sortedTLDs, func(i, j int) bool { - return sortedTLDs[i].Count > sortedTLDs[j].Count - }) - for i := 0; i < len(sortedTLDs) && i < 10; i++ { - stats.TopTLDs = append(stats.TopTLDs, TLDStat{ - TLD: sortedTLDs[i].TLD, - Count: sortedTLDs[i].Count, - }) - } - - // Recent feeds (last 20, sorted by discovery time) - sort.Slice(recentFeeds, func(i, j int) bool { - return recentFeeds[i].DiscoveredAt.After(recentFeeds[j].DiscoveredAt) - }) - if len(recentFeeds) > 20 { - recentFeeds = recentFeeds[:20] - } - stats.RecentFeeds = recentFeeds - - return iter.Error() + return rows.Err() } // StartDashboard starts the web dashboard server func (c *Crawler) StartDashboard(addr string) error { - http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + http.HandleFunc("/dashboard", func(w http.ResponseWriter, r *http.Request) { c.handleDashboard(w, r) }) http.HandleFunc("/api/stats", func(w http.ResponseWriter, r *http.Request) { c.handleAPIStats(w, r) }) + http.HandleFunc("/api/allDomains", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIAllDomains(w, r) + }) + http.HandleFunc("/api/domainFeeds", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIDomainFeeds(w, r) + }) + http.HandleFunc("/api/feedInfo", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIFeedInfo(w, r) + }) + http.HandleFunc("/api/feedItems", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIFeedItems(w, r) + }) + http.HandleFunc("/api/search", func(w http.ResponseWriter, r *http.Request) { + c.handleAPISearch(w, r) + }) + http.HandleFunc("/static/", func(w http.ResponseWriter, r *http.Request) { + http.StripPrefix("/static/", http.FileServer(http.Dir("static"))).ServeHTTP(w, r) + }) fmt.Printf("Dashboard running at http://%s\n", addr) return http.ListenAndServe(addr, nil) } +func (c *Crawler) handleAPIAllDomains(w http.ResponseWriter, r *http.Request) { + offset := 0 + limit := 100 + if o := r.URL.Query().Get("offset"); o != "" { + fmt.Sscanf(o, "%d", &offset) + } + if l := r.URL.Query().Get("limit"); l != "" { + fmt.Sscanf(l, "%d", &limit) + if limit > 100 { + limit = 100 + } + } + + // Serve from cache (updated once per minute in background) + c.statsMu.RLock() + cached := c.cachedAllDomains + c.statsMu.RUnlock() + + var domains []DomainStat + if cached != nil && offset < len(cached) { + end := offset + limit + if end > len(cached) { + end = len(cached) + } + domains = cached[offset:end] + } + if domains == nil { + domains = []DomainStat{} + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(domains) +} + +func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) { + host := r.URL.Query().Get("host") + if host == "" { + http.Error(w, "host parameter required", http.StatusBadRequest) + return + } + + rows, err := c.db.Query(` + SELECT url, title, type FROM feeds + WHERE sourceHost = 
? + ORDER BY url ASC + LIMIT 1000 + `, host) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + defer rows.Close() + + type FeedInfo struct { + URL string `json:"url"` + Title string `json:"title"` + Type string `json:"type"` + } + + var feeds []FeedInfo + for rows.Next() { + var f FeedInfo + var title sql.NullString + if err := rows.Scan(&f.URL, &title, &f.Type); err != nil { + continue + } + if title.Valid { + f.Title = title.String + } + feeds = append(feeds, f) + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(feeds) +} + +func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) { + feedURL := r.URL.Query().Get("url") + if feedURL == "" { + http.Error(w, "url parameter required", http.StatusBadRequest) + return + } + + type FeedDetails struct { + URL string `json:"url"` + Type string `json:"type,omitempty"` + Title string `json:"title,omitempty"` + Description string `json:"description,omitempty"` + Language string `json:"language,omitempty"` + SiteURL string `json:"siteUrl,omitempty"` + DiscoveredAt string `json:"discoveredAt,omitempty"` + LastCrawledAt string `json:"lastCrawledAt,omitempty"` + LastBuildDate string `json:"lastBuildDate,omitempty"` + TTLMinutes int `json:"ttlMinutes,omitempty"` + UpdatePeriod string `json:"updatePeriod,omitempty"` + UpdateFreq int `json:"updateFreq,omitempty"` + Status string `json:"status,omitempty"` + ErrorCount int `json:"errorCount,omitempty"` + LastError string `json:"lastError,omitempty"` + ItemCount int `json:"itemCount,omitempty"` + AvgPostFreqHrs float64 `json:"avgPostFreqHrs,omitempty"` + OldestItemDate string `json:"oldestItemDate,omitempty"` + NewestItemDate string `json:"newestItemDate,omitempty"` + } + + var f FeedDetails + var title, description, language, siteUrl, lastCrawledAt, lastBuildDate sql.NullString + var updatePeriod, status, lastError, oldestItemDate, newestItemDate sql.NullString + var ttlMinutes, updateFreq, errorCount, itemCount sql.NullInt64 + var avgPostFreqHrs sql.NullFloat64 + + err := c.db.QueryRow(` + SELECT url, type, title, description, language, siteUrl, + discoveredAt, lastCrawledAt, lastBuildDate, + ttlMinutes, updatePeriod, updateFreq, + status, errorCount, lastError, + itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate + FROM feeds WHERE url = ? 
+ `, feedURL).Scan( + &f.URL, &f.Type, &title, &description, &language, &siteUrl, + &f.DiscoveredAt, &lastCrawledAt, &lastBuildDate, + &ttlMinutes, &updatePeriod, &updateFreq, + &status, &errorCount, &lastError, + &itemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate, + ) + + if err == sql.ErrNoRows { + http.Error(w, "feed not found", http.StatusNotFound) + return + } + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + if title.Valid { + f.Title = title.String + } + if description.Valid { + f.Description = description.String + } + if language.Valid { + f.Language = language.String + } + if siteUrl.Valid { + f.SiteURL = siteUrl.String + } + if lastCrawledAt.Valid { + f.LastCrawledAt = lastCrawledAt.String + } + if lastBuildDate.Valid { + f.LastBuildDate = lastBuildDate.String + } + if ttlMinutes.Valid { + f.TTLMinutes = int(ttlMinutes.Int64) + } + if updatePeriod.Valid { + f.UpdatePeriod = updatePeriod.String + } + if updateFreq.Valid { + f.UpdateFreq = int(updateFreq.Int64) + } + if status.Valid { + f.Status = status.String + } + if errorCount.Valid { + f.ErrorCount = int(errorCount.Int64) + } + if lastError.Valid { + f.LastError = lastError.String + } + if itemCount.Valid { + f.ItemCount = int(itemCount.Int64) + } + if avgPostFreqHrs.Valid { + f.AvgPostFreqHrs = avgPostFreqHrs.Float64 + } + if oldestItemDate.Valid { + f.OldestItemDate = oldestItemDate.String + } + if newestItemDate.Valid { + f.NewestItemDate = newestItemDate.String + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(f) +} + +func (c *Crawler) handleAPIFeedItems(w http.ResponseWriter, r *http.Request) { + feedURL := r.URL.Query().Get("url") + if feedURL == "" { + http.Error(w, "url parameter required", http.StatusBadRequest) + return + } + + limit := 50 + if l := r.URL.Query().Get("limit"); l != "" { + fmt.Sscanf(l, "%d", &limit) + if limit > 100 { + limit = 100 + } + } + + items, err := c.GetItemsByFeed(feedURL, limit) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + if items == nil { + items = []*Item{} + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(items) +} + +// SearchResult represents a search result with feed and matching items +type SearchResult struct { + Feed SearchFeed `json:"feed"` + Items []SearchItem `json:"items"` +} + +type SearchFeed struct { + URL string `json:"url"` + Title string `json:"title"` + Description string `json:"description"` + Type string `json:"type"` + SourceHost string `json:"source_host"` + Status string `json:"status"` +} + +type SearchItem struct { + ID int64 `json:"id"` + Title string `json:"title"` + Link string `json:"link"` + Description string `json:"description"` + Author string `json:"author"` + PubDate string `json:"pub_date"` +} + +func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { + query := r.URL.Query().Get("q") + if query == "" { + http.Error(w, "q parameter required", http.StatusBadRequest) + return + } + + limit := 100 + if l := r.URL.Query().Get("limit"); l != "" { + fmt.Sscanf(l, "%d", &limit) + if limit > 500 { + limit = 500 + } + } + + // Results map: feedURL -> SearchResult + results := make(map[string]*SearchResult) + + // Search feeds + feedRows, err := c.db.Query(` + SELECT f.url, f.title, f.description, f.type, f.sourceHost, f.status + FROM feeds f + JOIN feeds_fts fts ON f.rowid = fts.rowid + WHERE feeds_fts MATCH ? + LIMIT ? 
+ `, query, limit) + if err == nil { + defer feedRows.Close() + for feedRows.Next() { + var url string + var title, description, feedType, sourceHost, status sql.NullString + if err := feedRows.Scan(&url, &title, &description, &feedType, &sourceHost, &status); err != nil { + continue + } + results[url] = &SearchResult{ + Feed: SearchFeed{ + URL: url, + Title: title.String, + Description: description.String, + Type: feedType.String, + SourceHost: sourceHost.String, + Status: status.String, + }, + Items: []SearchItem{}, + } + } + } + + // Search items + itemRows, err := c.db.Query(` + SELECT i.id, i.feedUrl, i.title, i.link, i.description, i.author, i.pubDate + FROM items i + JOIN items_fts fts ON i.id = fts.rowid + WHERE items_fts MATCH ? + ORDER BY i.pubDate DESC + LIMIT ? + `, query, limit) + if err == nil { + defer itemRows.Close() + for itemRows.Next() { + var id int64 + var feedUrl string + var title, link, description, author, pubDate sql.NullString + if err := itemRows.Scan(&id, &feedUrl, &title, &link, &description, &author, &pubDate); err != nil { + continue + } + + item := SearchItem{ + ID: id, + Title: title.String, + Link: link.String, + Description: description.String, + Author: author.String, + PubDate: pubDate.String, + } + + // Add to existing result or create new one + if result, exists := results[feedUrl]; exists { + result.Items = append(result.Items, item) + } else { + // Fetch feed info for this item's feed + var fTitle, fDesc, fType, fHost, fStatus sql.NullString + c.db.QueryRow(` + SELECT title, description, type, sourceHost, status + FROM feeds WHERE url = ? + `, feedUrl).Scan(&fTitle, &fDesc, &fType, &fHost, &fStatus) + + results[feedUrl] = &SearchResult{ + Feed: SearchFeed{ + URL: feedUrl, + Title: fTitle.String, + Description: fDesc.String, + Type: fType.String, + SourceHost: fHost.String, + Status: fStatus.String, + }, + Items: []SearchItem{item}, + } + } + } + } + + // Convert map to slice + var resultList []SearchResult + for _, r := range results { + resultList = append(resultList, *r) + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(resultList) +} + func (c *Crawler) handleDashboard(w http.ResponseWriter, r *http.Request) { stats, err := c.GetDashboardStats() if err != nil { @@ -228,14 +620,28 @@ func (c *Crawler) handleDashboard(w http.ResponseWriter, r *http.Request) { } funcMap := template.FuncMap{ - "divf": func(a, b int) float64 { + "pct": func(a, b int) float64 { if b == 0 { return 0 } - return float64(a) / float64(b) + return float64(a) * 100.0 / float64(b) }, - "mulf": func(a int, b float64) float64 { - return float64(a) * b + "comma": func(n interface{}) string { + var val int + switch v := n.(type) { + case int: + val = v + case int32: + val = int(v) + case int64: + val = int(v) + default: + return "0" + } + if val < 0 { + return "-" + commaFormat(-val) + } + return commaFormat(val) }, } @@ -265,58 +671,8 @@ const dashboardHTML = ` 1440.news Feed Crawler - - + +

1440.news Feed Crawler

@@ -324,99 +680,63 @@ const dashboardHTML = `

  Crawl Progress
- {{.TotalDomains}}
- Total Domains
+ {{comma .TotalDomains}}
+ Domains
- {{.CrawledDomains}}
- Crawled
+ {{comma .CheckedDomains}}
+ Checked
- {{if .TotalDomains}} … {{end}}
- {{.UncrawledDomains}}
- Uncrawled
+ {{comma .UncheckedDomains}}
+ Unchecked
- {{.ErrorDomains}}
- Errors
+ {{comma .CrawlRate}}
+ crawls per min
+ {{comma .CheckRate}}
+ checks per min

  Feeds Discovered
- {{.TotalFeeds}}
+ {{comma .TotalFeeds}}
  Total Feeds
- {{.RSSFeeds}}
+ {{comma .RSSFeeds}}
  RSS Feeds
- {{.AtomFeeds}}
+ {{comma .AtomFeeds}}
  Atom Feeds
- {{.UnknownFeeds}}
+ {{comma .UnknownFeeds}}
  Unknown Type

- Top TLDs
- {{range .TopTLDs}} .{{.TLD}} {{.Count}} {{else}} No data yet {{end}}

- Top Domains
- {{range .TopDomains}} {{.Host}} {{.FeedsFound}} {{else}} No data yet {{end}}

- Recent Feeds
- URL | Title | Type | Discovered
- {{range .RecentFeeds}} {{.URL}} | {{if .Title}}{{.Title}}{{else}}-{{end}} | {{.Type}} | {{.DiscoveredAt.Format "15:04:05"}} {{else}} No feeds discovered yet {{end}}

+ Feeds
+ … Loading... …

- Last updated: {{.UpdatedAt.Format "2006-01-02 15:04:05"}}
+ Last updated: {{.UpdatedAt.Format "2006-01-02 15:04:05"}}
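
For reference, a minimal standalone sketch of the `comma` and `pct` helpers registered in the dashboard's funcMap, wired into `html/template`. The `commaFormat` body is copied from the diff; the template string and the data map are illustrative only, and `commaFormat` is registered directly here rather than through the int/int32/int64 wrapper used in dashboard.go.

```go
package main

import (
	"fmt"
	"html/template"
	"log"
	"os"
)

// commaFormat mirrors the helper added in dashboard.go: insert a comma
// every three digits, e.g. 1234567 -> "1,234,567".
func commaFormat(n int) string {
	s := fmt.Sprintf("%d", n)
	if len(s) <= 3 {
		return s
	}
	var result []byte
	for i, c := range s {
		if i > 0 && (len(s)-i)%3 == 0 {
			result = append(result, ',')
		}
		result = append(result, byte(c))
	}
	return string(result)
}

func main() {
	funcMap := template.FuncMap{
		"comma": commaFormat,
		// pct mirrors the dashboard helper: a as a percentage of b.
		"pct": func(a, b int) float64 {
			if b == 0 {
				return 0
			}
			return float64(a) * 100.0 / float64(b)
		},
	}

	// Illustrative template and data; not the dashboard template itself.
	tmpl := template.Must(template.New("demo").Funcs(funcMap).Parse(
		`Checked {{comma .Checked}} of {{comma .Total}} ({{printf "%.1f" (pct .Checked .Total)}}%)`))
	if err := tmpl.Execute(os.Stdout, map[string]int{"Checked": 123456, "Total": 2000000}); err != nil {
		log.Fatal(err)
	}
	// Output: Checked 123,456 of 2,000,000 (6.2%)
}
```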
` diff --git a/db.go b/db.go new file mode 100644 index 0000000..0e49fd2 --- /dev/null +++ b/db.go @@ -0,0 +1,192 @@ +package main + +import ( + "database/sql" + "fmt" + + _ "modernc.org/sqlite" +) + +const schema = ` +CREATE TABLE IF NOT EXISTS domains ( + host TEXT PRIMARY KEY, + status TEXT NOT NULL DEFAULT 'unchecked', + discoveredAt DATETIME NOT NULL, + lastCrawledAt DATETIME, + feedsFound INTEGER DEFAULT 0, + lastError TEXT, + tld TEXT +); + +CREATE INDEX IF NOT EXISTS idx_domains_status ON domains(status); +CREATE INDEX IF NOT EXISTS idx_domains_tld ON domains(tld); +CREATE INDEX IF NOT EXISTS idx_domains_feedsFound ON domains(feedsFound DESC) WHERE feedsFound > 0; + +CREATE TABLE IF NOT EXISTS feeds ( + url TEXT PRIMARY KEY, + type TEXT, + title TEXT, + description TEXT, + language TEXT, + siteUrl TEXT, + + discoveredAt DATETIME NOT NULL, + lastCrawledAt DATETIME, + nextCrawlAt DATETIME, + lastBuildDate DATETIME, + + etag TEXT, + lastModified TEXT, + + ttlMinutes INTEGER, + updatePeriod TEXT, + updateFreq INTEGER, + + status TEXT DEFAULT 'active', + errorCount INTEGER DEFAULT 0, + lastError TEXT, + lastErrorAt DATETIME, + + sourceUrl TEXT, + sourceHost TEXT, + tld TEXT, + + itemCount INTEGER, + avgPostFreqHrs REAL, + oldestItemDate DATETIME, + newestItemDate DATETIME, + + noUpdate INTEGER DEFAULT 0 +); + +CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost ON feeds(sourceHost); +CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost_url ON feeds(sourceHost, url); +CREATE INDEX IF NOT EXISTS idx_feeds_tld ON feeds(tld); +CREATE INDEX IF NOT EXISTS idx_feeds_tld_sourceHost ON feeds(tld, sourceHost); +CREATE INDEX IF NOT EXISTS idx_feeds_type ON feeds(type); +CREATE INDEX IF NOT EXISTS idx_feeds_status ON feeds(status); +CREATE INDEX IF NOT EXISTS idx_feeds_discoveredAt ON feeds(discoveredAt); +CREATE INDEX IF NOT EXISTS idx_feeds_title ON feeds(title); + +CREATE TABLE IF NOT EXISTS items ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + feedUrl TEXT NOT NULL, + guid TEXT, + title TEXT, + link TEXT, + description TEXT, + content TEXT, + author TEXT, + pubDate DATETIME, + discoveredAt DATETIME NOT NULL, + updatedAt DATETIME, + UNIQUE(feedUrl, guid) +); + +CREATE INDEX IF NOT EXISTS idx_items_feedUrl ON items(feedUrl); +CREATE INDEX IF NOT EXISTS idx_items_pubDate ON items(pubDate DESC); +CREATE INDEX IF NOT EXISTS idx_items_link ON items(link); +CREATE INDEX IF NOT EXISTS idx_items_feedUrl_pubDate ON items(feedUrl, pubDate DESC); + +-- Full-text search for feeds +CREATE VIRTUAL TABLE IF NOT EXISTS feeds_fts USING fts5( + url, + title, + description, + content='feeds', + content_rowid='rowid' +); + +-- Triggers to keep FTS in sync +CREATE TRIGGER IF NOT EXISTS feeds_ai AFTER INSERT ON feeds BEGIN + INSERT INTO feeds_fts(rowid, url, title, description) + VALUES (NEW.rowid, NEW.url, NEW.title, NEW.description); +END; + +CREATE TRIGGER IF NOT EXISTS feeds_ad AFTER DELETE ON feeds BEGIN + INSERT INTO feeds_fts(feeds_fts, rowid, url, title, description) + VALUES ('delete', OLD.rowid, OLD.url, OLD.title, OLD.description); +END; + +CREATE TRIGGER IF NOT EXISTS feeds_au AFTER UPDATE ON feeds BEGIN + INSERT INTO feeds_fts(feeds_fts, rowid, url, title, description) + VALUES ('delete', OLD.rowid, OLD.url, OLD.title, OLD.description); + INSERT INTO feeds_fts(rowid, url, title, description) + VALUES (NEW.rowid, NEW.url, NEW.title, NEW.description); +END; + +-- Full-text search for items +CREATE VIRTUAL TABLE IF NOT EXISTS items_fts USING fts5( + title, + description, + content, + author, + content='items', + 
content_rowid='id' +); + +-- Triggers to keep items FTS in sync +CREATE TRIGGER IF NOT EXISTS items_ai AFTER INSERT ON items BEGIN + INSERT INTO items_fts(rowid, title, description, content, author) + VALUES (NEW.id, NEW.title, NEW.description, NEW.content, NEW.author); +END; + +CREATE TRIGGER IF NOT EXISTS items_ad AFTER DELETE ON items BEGIN + INSERT INTO items_fts(items_fts, rowid, title, description, content, author) + VALUES ('delete', OLD.id, OLD.title, OLD.description, OLD.content, OLD.author); +END; + +CREATE TRIGGER IF NOT EXISTS items_au AFTER UPDATE ON items BEGIN + INSERT INTO items_fts(items_fts, rowid, title, description, content, author) + VALUES ('delete', OLD.id, OLD.title, OLD.description, OLD.content, OLD.author); + INSERT INTO items_fts(rowid, title, description, content, author) + VALUES (NEW.id, NEW.title, NEW.description, NEW.content, NEW.author); +END; +` + +func OpenDatabase(dbPath string) (*sql.DB, error) { + fmt.Printf("Opening database: %s\n", dbPath) + + // Use pragmas in connection string for consistent application + connStr := dbPath + "?_pragma=busy_timeout(10000)&_pragma=journal_mode(WAL)&_pragma=foreign_keys(ON)" + db, err := sql.Open("sqlite", connStr) + if err != nil { + return nil, fmt.Errorf("failed to open database: %v", err) + } + + // Allow multiple readers (WAL mode supports concurrent reads) + // SQLite is single-writer, but reads can happen concurrently + db.SetMaxOpenConns(4) + + // Verify connection and show journal mode + var journalMode string + if err := db.QueryRow("PRAGMA journal_mode").Scan(&journalMode); err != nil { + fmt.Printf(" Warning: could not query journal_mode: %v\n", err) + } else { + fmt.Printf(" Journal mode: %s\n", journalMode) + } + + // Create schema + if _, err := db.Exec(schema); err != nil { + db.Close() + return nil, fmt.Errorf("failed to create schema: %v", err) + } + fmt.Println(" Schema OK") + + // Run stats and ANALYZE in background to avoid blocking startup with large databases + go func() { + var domainCount, feedCount int + db.QueryRow("SELECT COUNT(*) FROM domains").Scan(&domainCount) + db.QueryRow("SELECT COUNT(*) FROM feeds").Scan(&feedCount) + fmt.Printf(" Existing data: %d domains, %d feeds\n", domainCount, feedCount) + + fmt.Println(" Running ANALYZE...") + if _, err := db.Exec("ANALYZE"); err != nil { + fmt.Printf(" Warning: ANALYZE failed: %v\n", err) + } else { + fmt.Println(" ANALYZE complete") + } + }() + + return db, nil +} diff --git a/domain.go b/domain.go index 177f970..d4197fb 100644 --- a/domain.go +++ b/domain.go @@ -3,20 +3,19 @@ package main import ( "bufio" "compress/gzip" - "encoding/json" + "database/sql" "fmt" "io" "os" "strings" + "sync/atomic" "time" - - "github.com/cockroachdb/pebble" ) // Domain represents a host to be crawled for feeds type Domain struct { - Host string `json:"host"` // Normalized hostname (no scheme, no www.) 
- Status string `json:"status"` // "uncrawled", "crawled", "error" + Host string `json:"host"` + Status string `json:"status"` DiscoveredAt time.Time `json:"discovered_at"` LastCrawledAt time.Time `json:"last_crawled_at,omitempty"` FeedsFound int `json:"feeds_found,omitempty"` @@ -24,130 +23,162 @@ type Domain struct { TLD string `json:"tld,omitempty"` } -// saveDomain stores a domain in PebbleDB +// saveDomain stores a domain in SQLite func (c *Crawler) saveDomain(domain *Domain) error { - data, err := json.Marshal(domain) - if err != nil { - return fmt.Errorf("failed to marshal domain: %v", err) - } - - key := []byte("domain:" + domain.Host) - return c.db.Set(key, data, pebble.Sync) + _, err := c.db.Exec(` + INSERT INTO domains (host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld) + VALUES (?, ?, ?, ?, ?, ?, ?) + ON CONFLICT(host) DO UPDATE SET + status = excluded.status, + lastCrawledAt = excluded.lastCrawledAt, + feedsFound = excluded.feedsFound, + lastError = excluded.lastError, + tld = excluded.tld + `, domain.Host, domain.Status, domain.DiscoveredAt, nullTime(domain.LastCrawledAt), + domain.FeedsFound, nullString(domain.LastError), domain.TLD) + return err } -// getDomain retrieves a domain from PebbleDB -func (c *Crawler) getDomain(host string) (*Domain, error) { - key := []byte("domain:" + normalizeHost(host)) - data, closer, err := c.db.Get(key) - if err != nil { - if err == pebble.ErrNotFound { - return nil, nil - } - return nil, err - } - defer closer.Close() - - var domain Domain - if err := json.Unmarshal(data, &domain); err != nil { - return nil, fmt.Errorf("failed to unmarshal domain: %v", err) - } - return &domain, nil +// saveDomainTx stores a domain using a transaction +func (c *Crawler) saveDomainTx(tx *sql.Tx, domain *Domain) error { + _, err := tx.Exec(` + INSERT INTO domains (host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld) + VALUES (?, ?, ?, ?, ?, ?, ?) + ON CONFLICT(host) DO NOTHING + `, domain.Host, domain.Status, domain.DiscoveredAt, nullTime(domain.LastCrawledAt), + domain.FeedsFound, nullString(domain.LastError), domain.TLD) + return err } // domainExists checks if a domain already exists in the database func (c *Crawler) domainExists(host string) bool { - key := []byte("domain:" + normalizeHost(host)) - _, closer, err := c.db.Get(key) - if err != nil { - return false - } - closer.Close() - return true + var exists bool + err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM domains WHERE host = ?)", normalizeHost(host)).Scan(&exists) + return err == nil && exists } -// GetUncrawledDomains returns all domains with status "uncrawled" -func (c *Crawler) GetUncrawledDomains() ([]*Domain, error) { - var domains []*Domain +// getDomain retrieves a domain from SQLite +func (c *Crawler) getDomain(host string) (*Domain, error) { + domain := &Domain{} + var lastCrawledAt sql.NullTime + var lastError sql.NullString - iter, err := c.db.NewIter(&pebble.IterOptions{ - LowerBound: []byte("domain:"), - UpperBound: []byte("domain:\xff"), - }) + err := c.db.QueryRow(` + SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld + FROM domains WHERE host = ? 
+ `, normalizeHost(host)).Scan( + &domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCrawledAt, + &domain.FeedsFound, &lastError, &domain.TLD, + ) + + if err == sql.ErrNoRows { + return nil, nil + } if err != nil { return nil, err } - defer iter.Close() - for iter.First(); iter.Valid(); iter.Next() { - var domain Domain - if err := json.Unmarshal(iter.Value(), &domain); err != nil { - continue - } - if domain.Status == "uncrawled" { - domains = append(domains, &domain) - } + if lastCrawledAt.Valid { + domain.LastCrawledAt = lastCrawledAt.Time + } + if lastError.Valid { + domain.LastError = lastError.String } - if err := iter.Error(); err != nil { + return domain, nil +} + +// GetUncheckedDomains returns all domains with status "unchecked" +func (c *Crawler) GetUncheckedDomains() ([]*Domain, error) { + rows, err := c.db.Query(` + SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld + FROM domains WHERE status = 'unchecked' + `) + if err != nil { return nil, err } + defer rows.Close() - return domains, nil + return c.scanDomains(rows) +} + +// GetUncheckedDomainsRandom returns up to limit unchecked domains in random order +func (c *Crawler) GetUncheckedDomainsRandom(limit int) ([]*Domain, error) { + rows, err := c.db.Query(` + SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld + FROM domains WHERE status = 'unchecked' + ORDER BY RANDOM() + LIMIT ? + `, limit) + if err != nil { + return nil, err + } + defer rows.Close() + + return c.scanDomains(rows) +} + +// scanDomains is a helper to scan multiple domain rows +func (c *Crawler) scanDomains(rows *sql.Rows) ([]*Domain, error) { + var domains []*Domain + for rows.Next() { + domain := &Domain{} + var lastCrawledAt sql.NullTime + var lastError sql.NullString + + if err := rows.Scan( + &domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCrawledAt, + &domain.FeedsFound, &lastError, &domain.TLD, + ); err != nil { + continue + } + + if lastCrawledAt.Valid { + domain.LastCrawledAt = lastCrawledAt.Time + } + if lastError.Valid { + domain.LastError = lastError.String + } + + domains = append(domains, domain) + } + + return domains, rows.Err() } // markDomainCrawled updates a domain's status after crawling func (c *Crawler) markDomainCrawled(host string, feedsFound int, lastError string) error { - domain, err := c.getDomain(host) - if err != nil { - return err - } - if domain == nil { - return fmt.Errorf("domain not found: %s", host) - } - - domain.LastCrawledAt = time.Now() - domain.FeedsFound = feedsFound + status := "checked" if lastError != "" { - domain.Status = "error" - domain.LastError = lastError - } else { - domain.Status = "crawled" - domain.LastError = "" + status = "error" } - return c.saveDomain(domain) + var err error + if lastError != "" { + _, err = c.db.Exec(` + UPDATE domains SET status = ?, lastCrawledAt = ?, feedsFound = ?, lastError = ? + WHERE host = ? + `, status, time.Now(), feedsFound, lastError, normalizeHost(host)) + } else { + _, err = c.db.Exec(` + UPDATE domains SET status = ?, lastCrawledAt = ?, feedsFound = ?, lastError = NULL + WHERE host = ? 
+ `, status, time.Now(), feedsFound, normalizeHost(host)) + } + return err } // GetDomainCount returns the total number of domains in the database -func (c *Crawler) GetDomainCount() (total int, uncrawled int, err error) { - iter, err := c.db.NewIter(&pebble.IterOptions{ - LowerBound: []byte("domain:"), - UpperBound: []byte("domain:\xff"), - }) +func (c *Crawler) GetDomainCount() (total int, unchecked int, err error) { + err = c.db.QueryRow("SELECT COUNT(*) FROM domains").Scan(&total) if err != nil { return 0, 0, err } - defer iter.Close() - - for iter.First(); iter.Valid(); iter.Next() { - total++ - var domain Domain - if err := json.Unmarshal(iter.Value(), &domain); err != nil { - continue - } - if domain.Status == "uncrawled" { - uncrawled++ - } - } - - if err := iter.Error(); err != nil { - return 0, 0, err - } - - return total, uncrawled, nil + err = c.db.QueryRow("SELECT COUNT(*) FROM domains WHERE status = 'unchecked'").Scan(&unchecked) + return total, unchecked, err } -// ImportDomainsFromFile reads a vertices file and stores new domains as "uncrawled" +// ImportDomainsFromFile reads a vertices file and stores new domains as "unchecked" func (c *Crawler) ImportDomainsFromFile(filename string, limit int) (imported int, skipped int, err error) { file, err := os.Open(filename) if err != nil { @@ -158,6 +189,110 @@ func (c *Crawler) ImportDomainsFromFile(filename string, limit int) (imported in return c.parseAndStoreDomains(file, limit) } +// ImportDomainsInBackground starts domain import in a background goroutine +func (c *Crawler) ImportDomainsInBackground(filename string) { + go func() { + file, err := os.Open(filename) + if err != nil { + fmt.Printf("Failed to open vertices file: %v\n", err) + return + } + defer file.Close() + + var bodyReader io.Reader + + bufReader := bufio.NewReader(file) + peekBytes, err := bufReader.Peek(2) + if err != nil && err != io.EOF { + fmt.Printf("Failed to peek at file: %v\n", err) + return + } + + if len(peekBytes) >= 2 && peekBytes[0] == 0x1f && peekBytes[1] == 0x8b { + gzReader, err := gzip.NewReader(bufReader) + if err != nil { + fmt.Printf("Failed to create gzip reader: %v\n", err) + return + } + defer gzReader.Close() + bodyReader = gzReader + } else { + bodyReader = bufReader + } + + scanner := bufio.NewScanner(bodyReader) + buf := make([]byte, 0, 64*1024) + scanner.Buffer(buf, 1024*1024) + + const batchSize = 10000 + now := time.Now() + nowStr := now.Format("2006-01-02 15:04:05") + totalImported := 0 + batchCount := 0 + + type domainEntry struct { + host string + tld string + } + + for { + // Read and canonicalize batch + var domains []domainEntry + for len(domains) < batchSize && scanner.Scan() { + line := scanner.Text() + parts := strings.Split(line, "\t") + if len(parts) >= 2 { + reverseHostName := strings.TrimSpace(parts[1]) + if reverseHostName != "" { + host := normalizeHost(reverseHost(reverseHostName)) + domains = append(domains, domainEntry{host: host, tld: getTLD(host)}) + } + } + } + + if len(domains) == 0 { + break + } + + // Build bulk INSERT statement + var sb strings.Builder + sb.WriteString("INSERT INTO domains (host, status, discoveredAt, tld) VALUES ") + args := make([]interface{}, 0, len(domains)*4) + for i, d := range domains { + if i > 0 { + sb.WriteString(",") + } + sb.WriteString("(?, 'unchecked', ?, ?)") + args = append(args, d.host, nowStr, d.tld) + } + sb.WriteString(" ON CONFLICT(host) DO NOTHING") + + // Execute bulk insert + result, err := c.db.Exec(sb.String(), args...) 
+ imported := 0 + if err != nil { + fmt.Printf("Bulk insert error: %v\n", err) + } else { + rowsAffected, _ := result.RowsAffected() + imported = int(rowsAffected) + } + + batchCount++ + totalImported += imported + atomic.AddInt32(&c.domainsImported, int32(imported)) + + // Wait 1 second before the next batch + time.Sleep(1 * time.Second) + } + + if err := scanner.Err(); err != nil { + fmt.Printf("Error reading vertices file: %v\n", err) + } + + fmt.Printf("Background import complete: %d domains imported\n", totalImported) + }() +} + func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported int, skipped int, err error) { var bodyReader io.Reader @@ -183,39 +318,63 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in scanner.Buffer(buf, 1024*1024) now := time.Now() + nowStr := now.Format("2006-01-02 15:04:05") count := 0 + const batchSize = 1000 - for scanner.Scan() { - if limit > 0 && count >= limit { + type domainEntry struct { + host string + tld string + } + + for { + // Read and canonicalize batch + var domains []domainEntry + for len(domains) < batchSize && scanner.Scan() { + if limit > 0 && count >= limit { + break + } + line := scanner.Text() + parts := strings.Split(line, "\t") + if len(parts) >= 2 { + reverseHostName := strings.TrimSpace(parts[1]) + if reverseHostName != "" { + host := normalizeHost(reverseHost(reverseHostName)) + domains = append(domains, domainEntry{host: host, tld: getTLD(host)}) + count++ + } + } + } + + if len(domains) == 0 { break } - line := scanner.Text() - parts := strings.Split(line, "\t") - if len(parts) >= 2 { - reverseHostName := strings.TrimSpace(parts[1]) - if reverseHostName != "" { - host := normalizeHost(reverseHost(reverseHostName)) - count++ - - // Skip if domain already exists - if c.domainExists(host) { - skipped++ - continue - } - - // Store new domain as uncrawled - domain := &Domain{ - Host: host, - Status: "uncrawled", - DiscoveredAt: now, - TLD: getTLD(host), - } - if err := c.saveDomain(domain); err != nil { - continue - } - imported++ + // Build bulk INSERT statement + var sb strings.Builder + sb.WriteString("INSERT INTO domains (host, status, discoveredAt, tld) VALUES ") + args := make([]interface{}, 0, len(domains)*4) + for i, d := range domains { + if i > 0 { + sb.WriteString(",") } + sb.WriteString("(?, 'unchecked', ?, ?)") + args = append(args, d.host, nowStr, d.tld) + } + sb.WriteString(" ON CONFLICT(host) DO NOTHING") + + // Execute bulk insert + result, execErr := c.db.Exec(sb.String(), args...) 
+ if execErr != nil { + skipped += len(domains) + continue + } + rowsAffected, _ := result.RowsAffected() + imported += int(rowsAffected) + skipped += len(domains) - int(rowsAffected) + + if limit > 0 && count >= limit { + break } } @@ -225,3 +384,18 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in return imported, skipped, nil } + +// Helper functions for SQL null handling +func nullTime(t time.Time) sql.NullTime { + if t.IsZero() { + return sql.NullTime{} + } + return sql.NullTime{Time: t, Valid: true} +} + +func nullString(s string) sql.NullString { + if s == "" { + return sql.NullString{} + } + return sql.NullString{String: s, Valid: true} +} diff --git a/feed.go b/feed.go index e8949a1..cb83d74 100644 --- a/feed.go +++ b/feed.go @@ -1,15 +1,86 @@ package main import ( - "encoding/json" + "database/sql" "fmt" + "io" "net/http" + "net/url" + "regexp" "strings" + "sync/atomic" "time" - - "github.com/cockroachdb/pebble" ) +// shouldSkipFeed checks if a feed URL should be filtered out +// Returns true (and a reason) if the feed should be skipped +func shouldSkipFeed(feedURL string) (bool, string) { + lower := strings.ToLower(feedURL) + + // Skip explicit comment feeds + if strings.Contains(lower, "/comment") { + return true, "comment feed" + } + + u, err := url.Parse(feedURL) + if err != nil { + return false, "" + } + + path := strings.ToLower(strings.TrimSuffix(u.Path, "/")) + + // Skip category/tag feeds + categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/", "/author/"} + for _, pattern := range categoryPatterns { + if strings.Contains(path, pattern) { + return true, "category/tag feed" + } + } + + // Check for article comment feeds (path ending in /feed with content before it) + if strings.HasSuffix(path, "/feed") { + basePath := strings.TrimSuffix(path, "/feed") + basePath = strings.Trim(basePath, "/") + + if basePath == "" { + return false, "" // Just /feed - legitimate main feed + } + + // Skip if path contains date patterns (likely article) + if matched, _ := regexp.MatchString(`/\d{4}/\d{2}`, basePath); matched { + return true, "article feed (date pattern)" + } + + // Skip if path has multiple segments (likely article or nested content) + segments := strings.Split(basePath, "/") + if len(segments) >= 2 { + return true, "article feed (nested path)" + } + + // Skip if single segment looks like an article slug (contains hyphens, is long) + if len(segments) == 1 && (strings.Contains(segments[0], "-") && len(segments[0]) > 20) { + return true, "article feed (slug pattern)" + } + } + + return false, "" +} + +// Item represents an individual entry/article from a feed +type Item struct { + ID int64 `json:"id,omitempty"` + FeedURL string `json:"feed_url"` + GUID string `json:"guid,omitempty"` + Title string `json:"title,omitempty"` + Link string `json:"link,omitempty"` + Description string `json:"description,omitempty"` + Content string `json:"content,omitempty"` + Author string `json:"author,omitempty"` + PubDate time.Time `json:"pub_date,omitempty"` + DiscoveredAt time.Time `json:"discovered_at"` + UpdatedAt time.Time `json:"updated_at,omitempty"` +} + // Feed represents a discovered RSS/Atom feed with metadata type Feed struct { URL string `json:"url"` @@ -50,99 +121,548 @@ type Feed struct { AvgPostFreqHrs float64 `json:"avg_post_freq_hrs,omitempty"` // Average hours between posts OldestItemDate time.Time `json:"oldest_item_date,omitempty"` NewestItemDate time.Time `json:"newest_item_date,omitempty"` + + 
// Adaptive check interval + NoUpdate int `json:"no_update"` // Consecutive checks with no change } -// saveFeed stores a feed in PebbleDB +// saveFeed stores a feed in SQLite func (c *Crawler) saveFeed(feed *Feed) error { - data, err := json.Marshal(feed) - if err != nil { - return fmt.Errorf("failed to marshal feed: %v", err) - } - - key := []byte("feed:" + feed.URL) - return c.db.Set(key, data, pebble.Sync) + _, err := c.db.Exec(` + INSERT INTO feeds ( + url, type, title, description, language, siteUrl, + discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate, + etag, lastModified, + ttlMinutes, updatePeriod, updateFreq, + status, errorCount, lastError, lastErrorAt, + sourceUrl, sourceHost, tld, + itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate, + noUpdate + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT(url) DO UPDATE SET + type = excluded.type, + title = excluded.title, + description = excluded.description, + language = excluded.language, + siteUrl = excluded.siteUrl, + lastCrawledAt = excluded.lastCrawledAt, + nextCrawlAt = excluded.nextCrawlAt, + lastBuildDate = excluded.lastBuildDate, + etag = excluded.etag, + lastModified = excluded.lastModified, + ttlMinutes = excluded.ttlMinutes, + updatePeriod = excluded.updatePeriod, + updateFreq = excluded.updateFreq, + status = excluded.status, + errorCount = excluded.errorCount, + lastError = excluded.lastError, + lastErrorAt = excluded.lastErrorAt, + itemCount = excluded.itemCount, + avgPostFreqHrs = excluded.avgPostFreqHrs, + oldestItemDate = excluded.oldestItemDate, + newestItemDate = excluded.newestItemDate, + noUpdate = excluded.noUpdate + `, + feed.URL, feed.Type, nullString(feed.Title), nullString(feed.Description), + nullString(feed.Language), nullString(feed.SiteURL), + feed.DiscoveredAt, nullTime(feed.LastCrawledAt), nullTime(feed.NextCrawlAt), nullTime(feed.LastBuildDate), + nullString(feed.ETag), nullString(feed.LastModified), + feed.TTLMinutes, nullString(feed.UpdatePeriod), feed.UpdateFreq, + feed.Status, feed.ErrorCount, nullString(feed.LastError), nullTime(feed.LastErrorAt), + nullString(feed.SourceURL), nullString(feed.SourceHost), nullString(feed.TLD), + feed.ItemCount, feed.AvgPostFreqHrs, nullTime(feed.OldestItemDate), nullTime(feed.NewestItemDate), + feed.NoUpdate, + ) + return err } -// getFeed retrieves a feed from PebbleDB +// getFeed retrieves a feed from SQLite func (c *Crawler) getFeed(feedURL string) (*Feed, error) { - key := []byte("feed:" + normalizeURL(feedURL)) - data, closer, err := c.db.Get(key) + feed := &Feed{} + var title, description, language, siteURL sql.NullString + var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime + var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString + var avgPostFreqHrs sql.NullFloat64 + + err := c.db.QueryRow(` + SELECT url, type, title, description, language, siteUrl, + discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate, + etag, lastModified, + ttlMinutes, updatePeriod, updateFreq, + status, errorCount, lastError, lastErrorAt, + sourceUrl, sourceHost, tld, + itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate, + noUpdate + FROM feeds WHERE url = ? 
+ `, normalizeURL(feedURL)).Scan( + &feed.URL, &feed.Type, &title, &description, &language, &siteURL, + &feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate, + &etag, &lastModified, + &feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq, + &feed.Status, &feed.ErrorCount, &lastError, &lastErrorAt, + &sourceURL, &sourceHost, &tld, + &feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate, + &feed.NoUpdate, + ) + + if err == sql.ErrNoRows { + return nil, nil + } if err != nil { - if err == pebble.ErrNotFound { - return nil, nil - } return nil, err } - defer closer.Close() - var feed Feed - if err := json.Unmarshal(data, &feed); err != nil { - return nil, fmt.Errorf("failed to unmarshal feed: %v", err) + // Handle nullable fields + if title.Valid { + feed.Title = title.String } - return &feed, nil + if description.Valid { + feed.Description = description.String + } + if language.Valid { + feed.Language = language.String + } + if siteURL.Valid { + feed.SiteURL = siteURL.String + } + if lastCrawledAt.Valid { + feed.LastCrawledAt = lastCrawledAt.Time + } + if nextCrawlAt.Valid { + feed.NextCrawlAt = nextCrawlAt.Time + } + if lastBuildDate.Valid { + feed.LastBuildDate = lastBuildDate.Time + } + if etag.Valid { + feed.ETag = etag.String + } + if lastModified.Valid { + feed.LastModified = lastModified.String + } + if updatePeriod.Valid { + feed.UpdatePeriod = updatePeriod.String + } + if lastError.Valid { + feed.LastError = lastError.String + } + if lastErrorAt.Valid { + feed.LastErrorAt = lastErrorAt.Time + } + if sourceURL.Valid { + feed.SourceURL = sourceURL.String + } + if sourceHost.Valid { + feed.SourceHost = sourceHost.String + } + if tld.Valid { + feed.TLD = tld.String + } + if avgPostFreqHrs.Valid { + feed.AvgPostFreqHrs = avgPostFreqHrs.Float64 + } + if oldestItemDate.Valid { + feed.OldestItemDate = oldestItemDate.Time + } + if newestItemDate.Valid { + feed.NewestItemDate = newestItemDate.Time + } + + return feed, nil } // feedExists checks if a feed URL already exists in the database func (c *Crawler) feedExists(feedURL string) bool { - key := []byte("feed:" + normalizeURL(feedURL)) - _, closer, err := c.db.Get(key) - if err != nil { - return false - } - closer.Close() - return true + var exists bool + err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM feeds WHERE url = ?)", normalizeURL(feedURL)).Scan(&exists) + return err == nil && exists } // GetAllFeeds returns all feeds from the database func (c *Crawler) GetAllFeeds() ([]*Feed, error) { - var feeds []*Feed - - iter, err := c.db.NewIter(&pebble.IterOptions{ - LowerBound: []byte("feed:"), - UpperBound: []byte("feed:\xff"), - }) + rows, err := c.db.Query(` + SELECT url, type, title, description, language, siteUrl, + discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate, + etag, lastModified, + ttlMinutes, updatePeriod, updateFreq, + status, errorCount, lastError, lastErrorAt, + sourceUrl, sourceHost, tld, + itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate, + noUpdate + FROM feeds + `) if err != nil { return nil, err } - defer iter.Close() + defer rows.Close() - for iter.First(); iter.Valid(); iter.Next() { - var feed Feed - if err := json.Unmarshal(iter.Value(), &feed); err != nil { - continue - } - feeds = append(feeds, &feed) - } - - if err := iter.Error(); err != nil { - return nil, err - } - - return feeds, nil + return scanFeeds(rows) } // GetFeedCount returns the total number of feeds in the database func (c *Crawler) GetFeedCount() (int, error) { - count := 0 + var count int + err := 
c.db.QueryRow("SELECT COUNT(*) FROM feeds").Scan(&count) + return count, err +} - iter, err := c.db.NewIter(&pebble.IterOptions{ - LowerBound: []byte("feed:"), - UpperBound: []byte("feed:\xff"), - }) +// GetFeedCountByHost returns the number of feeds for a specific host +func (c *Crawler) GetFeedCountByHost(host string) (int, error) { + var count int + err := c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE sourceHost = ?", host).Scan(&count) + return count, err +} + +// GetFeedsDueForCheck returns feeds where nextCrawlAt <= now, ordered randomly, limited to n +func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) { + rows, err := c.db.Query(` + SELECT url, type, title, description, language, siteUrl, + discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate, + etag, lastModified, + ttlMinutes, updatePeriod, updateFreq, + status, errorCount, lastError, lastErrorAt, + sourceUrl, sourceHost, tld, + itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate, + noUpdate + FROM feeds + WHERE nextCrawlAt <= datetime('now') AND status != 'dead' + ORDER BY RANDOM() + LIMIT ? + `, limit) + if err != nil { + return nil, err + } + defer rows.Close() + + return scanFeeds(rows) +} + +// GetFeedsByHost returns all feeds from a specific host +func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) { + rows, err := c.db.Query(` + SELECT url, type, title, description, language, siteUrl, + discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate, + etag, lastModified, + ttlMinutes, updatePeriod, updateFreq, + status, errorCount, lastError, lastErrorAt, + sourceUrl, sourceHost, tld, + itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate, + noUpdate + FROM feeds WHERE sourceHost = ? + `, host) + if err != nil { + return nil, err + } + defer rows.Close() + + return scanFeeds(rows) +} + +// SearchFeeds performs a full-text search on feeds +func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) { + rows, err := c.db.Query(` + SELECT f.url, f.type, f.title, f.description, f.language, f.siteUrl, + f.discoveredAt, f.lastCrawledAt, f.nextCrawlAt, f.lastBuildDate, + f.etag, f.lastModified, + f.ttlMinutes, f.updatePeriod, f.updateFreq, + f.status, f.errorCount, f.lastError, f.lastErrorAt, + f.sourceUrl, f.sourceHost, f.tld, + f.itemCount, f.avgPostFreqHrs, f.oldestItemDate, f.newestItemDate, + f.noUpdate + FROM feeds f + JOIN feeds_fts fts ON f.rowid = fts.rowid + WHERE feeds_fts MATCH ? 
+ `, query) + if err != nil { + return nil, err + } + defer rows.Close() + + return scanFeeds(rows) +} + +// scanFeeds is a helper to scan multiple feed rows +func scanFeeds(rows *sql.Rows) ([]*Feed, error) { + var feeds []*Feed + + for rows.Next() { + feed := &Feed{} + var title, description, language, siteURL sql.NullString + var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime + var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString + var avgPostFreqHrs sql.NullFloat64 + + if err := rows.Scan( + &feed.URL, &feed.Type, &title, &description, &language, &siteURL, + &feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate, + &etag, &lastModified, + &feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq, + &feed.Status, &feed.ErrorCount, &lastError, &lastErrorAt, + &sourceURL, &sourceHost, &tld, + &feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate, + &feed.NoUpdate, + ); err != nil { + continue + } + + // Handle nullable fields + if title.Valid { + feed.Title = title.String + } + if description.Valid { + feed.Description = description.String + } + if language.Valid { + feed.Language = language.String + } + if siteURL.Valid { + feed.SiteURL = siteURL.String + } + if lastCrawledAt.Valid { + feed.LastCrawledAt = lastCrawledAt.Time + } + if nextCrawlAt.Valid { + feed.NextCrawlAt = nextCrawlAt.Time + } + if lastBuildDate.Valid { + feed.LastBuildDate = lastBuildDate.Time + } + if etag.Valid { + feed.ETag = etag.String + } + if lastModified.Valid { + feed.LastModified = lastModified.String + } + if updatePeriod.Valid { + feed.UpdatePeriod = updatePeriod.String + } + if lastError.Valid { + feed.LastError = lastError.String + } + if lastErrorAt.Valid { + feed.LastErrorAt = lastErrorAt.Time + } + if sourceURL.Valid { + feed.SourceURL = sourceURL.String + } + if sourceHost.Valid { + feed.SourceHost = sourceHost.String + } + if tld.Valid { + feed.TLD = tld.String + } + if avgPostFreqHrs.Valid { + feed.AvgPostFreqHrs = avgPostFreqHrs.Float64 + } + if oldestItemDate.Valid { + feed.OldestItemDate = oldestItemDate.Time + } + if newestItemDate.Valid { + feed.NewestItemDate = newestItemDate.Time + } + + feeds = append(feeds, feed) + } + + return feeds, rows.Err() +} + +// saveItem stores an item in SQLite (upsert by feedUrl + guid) +func (c *Crawler) saveItem(item *Item) error { + _, err := c.db.Exec(` + INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT(feedUrl, guid) DO UPDATE SET + title = excluded.title, + link = excluded.link, + description = excluded.description, + content = excluded.content, + author = excluded.author, + pubDate = excluded.pubDate, + updatedAt = excluded.updatedAt + `, + item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link), + nullString(item.Description), nullString(item.Content), nullString(item.Author), + nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt), + ) + return err +} + +// saveItems stores multiple items efficiently +func (c *Crawler) saveItems(items []*Item) error { + if len(items) == 0 { + return nil + } + + tx, err := c.db.Begin() + if err != nil { + return err + } + defer tx.Rollback() + + stmt, err := tx.Prepare(` + INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ ON CONFLICT(feedUrl, guid) DO UPDATE SET + title = excluded.title, + link = excluded.link, + description = excluded.description, + content = excluded.content, + author = excluded.author, + pubDate = excluded.pubDate, + updatedAt = excluded.updatedAt + `) + if err != nil { + return err + } + defer stmt.Close() + + for _, item := range items { + if item == nil || item.GUID == "" { + continue // Skip nil items or items without GUID + } + _, err := stmt.Exec( + item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link), + nullString(item.Description), nullString(item.Content), nullString(item.Author), + nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt), + ) + if err != nil { + continue // Skip failed items + } + } + + return tx.Commit() +} + +// GetItemsByFeed returns all items for a specific feed +func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) { + rows, err := c.db.Query(` + SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt + FROM items + WHERE feedUrl = ? + ORDER BY pubDate DESC + LIMIT ? + `, feedURL, limit) + if err != nil { + return nil, err + } + defer rows.Close() + + var items []*Item + for rows.Next() { + item := &Item{} + var guid, title, link, description, content, author sql.NullString + var pubDate, updatedAt sql.NullTime + + if err := rows.Scan( + &item.ID, &item.FeedURL, &guid, &title, &link, + &description, &content, &author, &pubDate, + &item.DiscoveredAt, &updatedAt, + ); err != nil { + continue + } + + if guid.Valid { + item.GUID = guid.String + } + if title.Valid { + item.Title = title.String + } + if link.Valid { + item.Link = link.String + } + if description.Valid { + item.Description = description.String + } + if content.Valid { + item.Content = content.String + } + if author.Valid { + item.Author = author.String + } + if pubDate.Valid { + item.PubDate = pubDate.Time + } + if updatedAt.Valid { + item.UpdatedAt = updatedAt.Time + } + + items = append(items, item) + } + + return items, rows.Err() +} + +// SearchItems performs a full-text search on items +func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) { + rows, err := c.db.Query(` + SELECT i.id, i.feedUrl, i.guid, i.title, i.link, i.description, i.content, i.author, i.pubDate, i.discoveredAt, i.updatedAt + FROM items i + JOIN items_fts fts ON i.id = fts.rowid + WHERE items_fts MATCH ? + ORDER BY i.pubDate DESC + LIMIT ? 
+ `, query, limit) + if err != nil { + return nil, err + } + defer rows.Close() + + var items []*Item + for rows.Next() { + item := &Item{} + var guid, title, link, description, content, author sql.NullString + var pubDate, updatedAt sql.NullTime + + if err := rows.Scan( + &item.ID, &item.FeedURL, &guid, &title, &link, + &description, &content, &author, &pubDate, + &item.DiscoveredAt, &updatedAt, + ); err != nil { + continue + } + + if guid.Valid { + item.GUID = guid.String + } + if title.Valid { + item.Title = title.String + } + if link.Valid { + item.Link = link.String + } + if description.Valid { + item.Description = description.String + } + if content.Valid { + item.Content = content.String + } + if author.Valid { + item.Author = author.String + } + if pubDate.Valid { + item.PubDate = pubDate.Time + } + if updatedAt.Valid { + item.UpdatedAt = updatedAt.Time + } + + items = append(items, item) + } + + return items, rows.Err() +} + +// CleanupOldItems removes items older than 12 months +func (c *Crawler) CleanupOldItems() (int64, error) { + cutoff := time.Now().AddDate(-1, 0, 0) // 12 months ago + result, err := c.db.Exec(` + DELETE FROM items WHERE pubDate < ? AND pubDate IS NOT NULL + `, cutoff) if err != nil { return 0, err } - defer iter.Close() - - for iter.First(); iter.Valid(); iter.Next() { - count++ - } - - if err := iter.Error(); err != nil { - return 0, err - } - - return count, nil + return result.RowsAffected() } // processFeed parses and stores a feed with full metadata @@ -179,12 +699,13 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea LastModified: headers.Get("Last-Modified"), } - // Parse feed-specific metadata + // Parse feed-specific metadata and items + var items []*Item switch feedType { case "rss": - c.parseRSSMetadata(body, feed) + items = c.parseRSSMetadata(body, feed) case "atom": - c.parseAtomMetadata(body, feed) + items = c.parseAtomMetadata(body, feed) } // Calculate next crawl time @@ -193,11 +714,17 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea if err := c.saveFeed(feed); err != nil { return } + + // Save items + if len(items) > 0 { + c.saveItems(items) + } } // addFeed adds a discovered feed URL (not yet fetched) func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) { - if strings.Contains(feedURL, "/comment") { + // Skip comment, category, and article feeds + if skip, _ := shouldSkipFeed(feedURL); skip { return } @@ -231,3 +758,141 @@ func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) { return } } + +// CheckFeed performs a conditional request to check if a feed has been updated +// Returns: changed (bool), error +func (c *Crawler) CheckFeed(feed *Feed) (bool, error) { + atomic.AddInt32(&c.feedsChecked, 1) + + // Try different scheme/www combinations since we store URLs without scheme + urlVariants := []string{ + "https://" + feed.URL, + "http://" + feed.URL, + "https://www." + feed.URL, + "http://www." 
+ feed.URL, + } + + var resp *http.Response + var err error + var successURL string + + for _, tryURL := range urlVariants { + req, reqErr := http.NewRequest("GET", tryURL, nil) + if reqErr != nil { + continue + } + + req.Header.Set("User-Agent", c.UserAgent) + + // Add conditional headers if we have them + if feed.ETag != "" { + req.Header.Set("If-None-Match", feed.ETag) + } + if feed.LastModified != "" { + req.Header.Set("If-Modified-Since", feed.LastModified) + } + + resp, err = c.client.Do(req) + if err == nil { + successURL = tryURL + break + } + } + + _ = successURL // May be used later for logging/debugging + + // If no request succeeded, resp will be nil + if resp == nil { + if err == nil { + err = fmt.Errorf("all URL variants failed") + } + now := time.Now() + feed.LastCrawledAt = now + feed.ErrorCount++ + feed.NoUpdate++ + feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second) + feed.LastError = err.Error() + feed.LastErrorAt = now + feed.Status = "error" + c.saveFeed(feed) + return false, err + } + defer resp.Body.Close() + + now := time.Now() + feed.LastCrawledAt = now + + // 304 Not Modified - feed hasn't changed + if resp.StatusCode == http.StatusNotModified { + feed.NoUpdate++ + // Adaptive backoff: 100s base + 100s per consecutive no-change + feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second) + feed.ErrorCount = 0 + feed.LastError = "" + feed.Status = "active" + c.saveFeed(feed) + return false, nil + } + + // Non-200 response + if resp.StatusCode != http.StatusOK { + feed.ErrorCount++ + feed.NoUpdate++ + feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second) + feed.LastError = resp.Status + feed.LastErrorAt = now + if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusGone { + feed.Status = "dead" + } else { + feed.Status = "error" + } + c.saveFeed(feed) + return false, nil + } + + // 200 OK - feed has new content + bodyBytes, err := io.ReadAll(resp.Body) + if err != nil { + feed.ErrorCount++ + feed.NoUpdate++ + feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second) + feed.LastError = err.Error() + feed.LastErrorAt = now + feed.Status = "error" + c.saveFeed(feed) + return false, err + } + + body := string(bodyBytes) + + // Update cache headers + feed.ETag = resp.Header.Get("ETag") + feed.LastModified = resp.Header.Get("Last-Modified") + + // Re-detect type and parse metadata + feedType := c.detectFeedType(body) + feed.Type = feedType + + var items []*Item + switch feedType { + case "rss": + items = c.parseRSSMetadata(body, feed) + case "atom": + items = c.parseAtomMetadata(body, feed) + } + + // Content changed - reset backoff + feed.NoUpdate = 0 + feed.NextCrawlAt = now.Add(100 * time.Second) + feed.ErrorCount = 0 + feed.LastError = "" + feed.Status = "active" + c.saveFeed(feed) + + // Save items + if len(items) > 0 { + c.saveItems(items) + } + + return true, nil +} diff --git a/main.go b/main.go index 56f0a62..f552e5f 100644 --- a/main.go +++ b/main.go @@ -6,7 +6,13 @@ import ( ) func main() { - crawler, err := NewCrawler("feeds.db") + // Ensure feeds directory exists + if err := os.MkdirAll("feeds", 0755); err != nil { + fmt.Fprintf(os.Stderr, "Error creating feeds directory: %v\n", err) + os.Exit(1) + } + + crawler, err := NewCrawler("feeds/feeds.db") if err != nil { fmt.Fprintf(os.Stderr, "Error initializing crawler: %v\n", err) os.Exit(1) @@ -20,11 +26,24 @@ func main() { } }() - // Import domains from vertices file (only adds new ones as 
"uncrawled") - crawler.ImportDomainsFromFile("vertices.txt.gz", 0) + // Initialize stats in background (can be slow with large DBs) + go crawler.UpdateStats() - // Crawl all uncrawled domains (runs continuously) - for { - crawler.CrawlUncrawledDomains() - } + // Start all loops independently + fmt.Println("Starting import, crawl, check, and stats loops...") + + // Import loop (background) + go crawler.ImportDomainsInBackground("vertices.txt.gz") + + // Check loop (background) + go crawler.StartCheckLoop() + + // Stats loop (background) - updates once per minute + go crawler.StartStatsLoop() + + // Cleanup loop (background) - removes old items once per hour + go crawler.StartCleanupLoop() + + // Crawl loop (foreground - blocks forever) + crawler.StartCrawlLoop() } diff --git a/parser.go b/parser.go index d8f5b35..9b91798 100644 --- a/parser.go +++ b/parser.go @@ -26,9 +26,14 @@ type RSSChannel struct { } type RSSItem struct { - Title string `xml:"title"` - Link string `xml:"link"` - PubDate string `xml:"pubDate"` + Title string `xml:"title"` + Link string `xml:"link"` + GUID string `xml:"guid"` + Description string `xml:"description"` + Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"` + Author string `xml:"author"` + Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"` + PubDate string `xml:"pubDate"` } // Atom structs for parsing @@ -40,10 +45,23 @@ type AtomFeed struct { } type AtomEntry struct { - Title string `xml:"title"` - Links []AtomLink `xml:"link"` - Updated string `xml:"updated"` - Published string `xml:"published"` + ID string `xml:"id"` + Title string `xml:"title"` + Links []AtomLink `xml:"link"` + Summary string `xml:"summary"` + Content AtomContent `xml:"content"` + Author AtomAuthor `xml:"author"` + Updated string `xml:"updated"` + Published string `xml:"published"` +} + +type AtomContent struct { + Type string `xml:"type,attr"` + Value string `xml:",chardata"` +} + +type AtomAuthor struct { + Name string `xml:"name"` } type AtomLink struct { @@ -52,10 +70,10 @@ type AtomLink struct { Type string `xml:"type,attr"` } -func (c *Crawler) parseRSSMetadata(body string, feed *Feed) { +func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item { var rss RSS if err := xml.Unmarshal([]byte(body), &rss); err != nil { - return + return nil } ch := rss.Channel @@ -75,16 +93,47 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) { } } - // Analyze item dates + // Parse items + now := time.Now() + var items []*Item var dates []time.Time - for _, item := range ch.Items { - if item.PubDate != "" { - if t, err := parseRSSDate(item.PubDate); err == nil { + + for _, rssItem := range ch.Items { + item := &Item{ + FeedURL: feed.URL, + Title: rssItem.Title, + Link: rssItem.Link, + Description: rssItem.Description, + Content: rssItem.Content, + DiscoveredAt: now, + } + + // Use GUID if available, otherwise use link + if rssItem.GUID != "" { + item.GUID = rssItem.GUID + } else if rssItem.Link != "" { + item.GUID = rssItem.Link + } + + // Author: prefer author, fall back to dc:creator + if rssItem.Author != "" { + item.Author = rssItem.Author + } else if rssItem.Creator != "" { + item.Author = rssItem.Creator + } + + // Parse pubDate + if rssItem.PubDate != "" { + if t, err := parseRSSDate(rssItem.PubDate); err == nil { + item.PubDate = t dates = append(dates, t) } } + + items = append(items, item) } + // Calculate date stats if len(dates) > 0 { oldest, newest := dates[0], dates[0] for _, d := range dates { @@ -103,12 +152,14 @@ func (c *Crawler) 
parseRSSMetadata(body string, feed *Feed) { feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1) } } + + return items } -func (c *Crawler) parseAtomMetadata(body string, feed *Feed) { +func (c *Crawler) parseAtomMetadata(body string, feed *Feed) []*Item { var atom AtomFeed if err := xml.Unmarshal([]byte(body), &atom); err != nil { - return + return nil } feed.Title = atom.Title @@ -131,20 +182,60 @@ func (c *Crawler) parseAtomMetadata(body string, feed *Feed) { } } - // Analyze entry dates + // Parse entries + now := time.Now() + var items []*Item var dates []time.Time + for _, entry := range atom.Entries { + item := &Item{ + FeedURL: feed.URL, + Title: entry.Title, + Author: entry.Author.Name, + DiscoveredAt: now, + } + + // Use ID as GUID + if entry.ID != "" { + item.GUID = entry.ID + } + + // Get link (prefer alternate, fall back to first link) + for _, link := range entry.Links { + if link.Rel == "" || link.Rel == "alternate" { + item.Link = link.Href + break + } + } + if item.Link == "" && len(entry.Links) > 0 { + item.Link = entry.Links[0].Href + } + + // Use ID as GUID fallback if not set + if item.GUID == "" && item.Link != "" { + item.GUID = item.Link + } + + // Summary/Content + item.Description = entry.Summary + item.Content = entry.Content.Value + + // Parse dates dateStr := entry.Updated if dateStr == "" { dateStr = entry.Published } if dateStr != "" { if t, err := time.Parse(time.RFC3339, dateStr); err == nil { + item.PubDate = t dates = append(dates, t) } } + + items = append(items, item) } + // Calculate date stats if len(dates) > 0 { oldest, newest := dates[0], dates[0] for _, d := range dates { @@ -163,6 +254,8 @@ func (c *Crawler) parseAtomMetadata(body string, feed *Feed) { feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1) } } + + return items } // parseRSSDate attempts to parse various RSS date formats diff --git a/static/dashboard.css b/static/dashboard.css new file mode 100644 index 0000000..c5668cc --- /dev/null +++ b/static/dashboard.css @@ -0,0 +1,55 @@ +* { box-sizing: border-box; margin: 0; padding: 0; } +body { + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, monospace; + background: #0a0a0a; + color: #ffffff; + padding: 20px; + line-height: 1.6; +} +h1 { color: #ffffff; margin-bottom: 20px; font-size: 24px; } +h2 { color: #ffffff; margin: 20px 0 10px; font-size: 14px; text-transform: uppercase; letter-spacing: 1px; } +.grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin-bottom: 20px; } +.card { + background: #151515; + border: 1px solid #252525; + border-radius: 8px; + padding: 15px; +} +.stat-value { font-size: 32px; font-weight: bold; color: #ffffff; } +.stat-label { font-size: 12px; color: #ffffff; text-transform: uppercase; } +.stat-row { display: flex; justify-content: space-between; padding: 5px 0; border-bottom: 1px solid #202020; color: #ffffff; } +.stat-row:last-child { border-bottom: none; } +.progress-bar { + background: #202020; + border-radius: 4px; + height: 8px; + margin-top: 10px; + overflow: hidden; +} +.progress-fill { + background: linear-gradient(90deg, #00aa55, #00cc66); + height: 100%; + transition: width 0.3s; +} +table { width: 100%; border-collapse: collapse; color: #ffffff; } +th, td { text-align: left; padding: 8px; border-bottom: 1px solid #202020; } +th { color: #ffffff; font-size: 11px; text-transform: uppercase; } +td { font-size: 13px; color: #ffffff; } +.type-rss { color: #f90; } +.type-atom { color: #09f; } +.type-unknown { color: #ffffff; } +.url 
{ + max-width: 400px; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + color: #4a9eff; +} +.time { color: #ffffff; font-size: 12px; } +.updated { color: #ffffff; font-size: 11px; text-align: right; margin-top: 20px; } + +/* Search */ +#searchInput:focus { outline: none; border-color: #0af; } +#searchInput::placeholder { color: #555; } +.search-host { margin-bottom: 10px; } +.search-feed:hover { background: #1a1a1a; } diff --git a/static/dashboard.js b/static/dashboard.js new file mode 100644 index 0000000..796d452 --- /dev/null +++ b/static/dashboard.js @@ -0,0 +1,519 @@ +function initDashboard() { + function commaFormat(n) { + return n.toString().replace(/\B(?=(\d{3})+(?!\d))/g, ','); + } + + function escapeHtml(text) { + if (text == null) return ''; + const div = document.createElement('div'); + div.textContent = text; + return div.innerHTML; + } + + // All domains state + let allDomainsOffset = 0; + let allDomainsLoading = false; + let allDomainsEnd = false; + let expandedDomain = null; + let expandedFeed = null; + const PAGE_SIZE = 100; + const PREFETCH_THRESHOLD = 100; // Prefetch when within 100 domains of bottom + + // Search state + let searchTimeout = null; + let isSearching = false; + + async function loadMoreDomains() { + if (allDomainsLoading || allDomainsEnd) return; + + allDomainsLoading = true; + const loadingEl = document.getElementById('allDomainsLoading'); + loadingEl.style.display = 'block'; + + try { + const response = await fetch('/api/allDomains?offset=' + allDomainsOffset + '&limit=' + PAGE_SIZE); + const domains = await response.json(); + + if (!domains || domains.length === 0) { + allDomainsEnd = true; + loadingEl.style.display = 'none'; + return; + } + + const container = document.getElementById('allDomains'); + domains.forEach(d => { + const row = document.createElement('div'); + row.className = 'domain-row'; + row.innerHTML = + '
' + + '' + escapeHtml(d.host) + '' + + '' + commaFormat(d.feeds_found) + '' + + '
' + + ''; + + row.querySelector('.stat-row').addEventListener('click', () => toggleDomainFeeds(d.host, row)); + container.appendChild(row); + }); + + allDomainsOffset += domains.length; + loadingEl.style.display = 'none'; + + // If we got fewer than PAGE_SIZE, we've reached the end + if (domains.length < PAGE_SIZE) { + allDomainsEnd = true; + } + } catch (err) { + console.error('Failed to load domains:', err); + } finally { + allDomainsLoading = false; + } + } + + async function toggleDomainFeeds(host, rowEl) { + const feedsDiv = rowEl.querySelector('.domain-feeds'); + + // Close previously expanded domain + if (expandedDomain && expandedDomain !== rowEl) { + expandedDomain.querySelector('.domain-feeds').style.display = 'none'; + } + + // Toggle current + if (feedsDiv.style.display === 'none') { + feedsDiv.style.display = 'block'; + feedsDiv.innerHTML = '
Loading feeds...
'; + expandedDomain = rowEl; + + try { + const response = await fetch('/api/domainFeeds?host=' + encodeURIComponent(host)); + const feeds = await response.json(); + + if (!feeds || feeds.length === 0) { + feedsDiv.innerHTML = '
No feeds found
'; + } else { + feedsDiv.innerHTML = ''; + feeds.forEach(f => { + const feedItem = document.createElement('div'); + feedItem.className = 'feed-item'; + feedItem.style.cssText = 'padding: 5px 10px; border-top: 1px solid #333; cursor: pointer;'; + feedItem.innerHTML = + '
' + + '
' + escapeHtml(f.url) + '
' + + (f.title ? '
' + escapeHtml(f.title) + '
' : '') + + '
' + (f.type || 'unknown') + '
' + + '
' + + ''; + + feedItem.querySelector('.feed-header').addEventListener('click', (e) => { + e.stopPropagation(); + toggleFeedInfo(f.url, feedItem); + }); + feedsDiv.appendChild(feedItem); + }); + } + } catch (err) { + feedsDiv.innerHTML = '
Error loading feeds
'; + } + } else { + feedsDiv.style.display = 'none'; + expandedDomain = null; + } + } + + async function toggleFeedInfo(feedUrl, feedItemEl) { + const detailsDiv = feedItemEl.querySelector('.feed-details'); + + // Close previously expanded feed + if (expandedFeed && expandedFeed !== feedItemEl) { + expandedFeed.querySelector('.feed-details').style.display = 'none'; + } + + // Toggle current + if (detailsDiv.style.display === 'none') { + detailsDiv.style.display = 'block'; + detailsDiv.innerHTML = '
Loading feed info...
'; + expandedFeed = feedItemEl; + + // Scroll the feed item to the top of the viewport + feedItemEl.scrollIntoView({ behavior: 'smooth', block: 'start' }); + + try { + // Fetch feed info and items in parallel + const [infoResponse, itemsResponse] = await Promise.all([ + fetch('/api/feedInfo?url=' + encodeURIComponent(feedUrl)), + fetch('/api/feedItems?url=' + encodeURIComponent(feedUrl) + '&limit=50') + ]); + const info = await infoResponse.json(); + const items = await itemsResponse.json(); + + let html = '
'; + + if (info.description) { + html += '
' + escapeHtml(info.description) + '
'; + } + + html += ''; + + if (info.siteUrl) { + html += ''; + } + if (info.language) { + html += ''; + } + if (info.status) { + html += ''; + } + if (info.itemCount) { + html += ''; + } + if (info.avgPostFreqHrs) { + html += ''; + } + if (info.ttlMinutes) { + html += ''; + } + if (info.updatePeriod) { + let updateStr = info.updatePeriod; + if (info.updateFreq) updateStr += ' (' + info.updateFreq + ')'; + html += ''; + } + if (info.lastBuildDate) { + html += ''; + } + if (info.newestItemDate) { + html += ''; + } + if (info.oldestItemDate) { + html += ''; + } + if (info.discoveredAt) { + html += ''; + } + if (info.lastCrawledAt) { + html += ''; + } + if (info.errorCount > 0) { + html += ''; + } + if (info.lastError) { + html += ''; + } + + html += '
Site' + escapeHtml(info.siteUrl) + '
Language' + escapeHtml(info.language) + '
Status' + escapeHtml(info.status) + '
Items' + commaFormat(info.itemCount) + '
Avg Post Freq' + info.avgPostFreqHrs.toFixed(1) + ' hrs
TTL' + info.ttlMinutes + ' min
Update' + escapeHtml(updateStr) + '
Last Build' + escapeHtml(info.lastBuildDate) + '
Newest Item' + escapeHtml(info.newestItemDate) + '
Oldest Item' + escapeHtml(info.oldestItemDate) + '
Discovered' + escapeHtml(info.discoveredAt) + '
Last Crawled' + escapeHtml(info.lastCrawledAt) + '
Errors' + info.errorCount + '
Last Error' + escapeHtml(info.lastError) + '
'; + + // Display items + if (items && items.length > 0) { + html += '
'; + html += '
Recent Items (' + items.length + ')
'; + + items.forEach(item => { + html += '
'; + + // Title with link + if (item.title) { + if (item.link) { + html += '
' + escapeHtml(item.title) + '
'; + } else { + html += '
' + escapeHtml(item.title) + '
'; + } + } else if (item.link) { + html += '
' + escapeHtml(item.link) + '
'; + } + + // Metadata line (date, author) + let meta = []; + if (item.pub_date) { + const date = new Date(item.pub_date); + meta.push(date.toLocaleDateString() + ' ' + date.toLocaleTimeString()); + } + if (item.author) { + meta.push(escapeHtml(item.author)); + } + if (meta.length > 0) { + html += '
' + meta.join(' • ') + '
'; + } + + html += '
'; + }); + + html += '
'; + } + + html += '
'; + + detailsDiv.innerHTML = html; + } catch (err) { + detailsDiv.innerHTML = '
Error loading feed info
'; + } + } else { + detailsDiv.style.display = 'none'; + expandedFeed = null; + } + } + + // Infinite scroll handler with prefetch (uses window scroll) + function setupInfiniteScroll() { + window.addEventListener('scroll', () => { + // Check if we're near the bottom of the page + const scrollBottom = window.scrollY + window.innerHeight; + const docHeight = document.documentElement.scrollHeight; + const remainingPixels = docHeight - scrollBottom; + + // Prefetch when within 500px of the bottom + if (remainingPixels < 500) { + loadMoreDomains(); + } + }); + } + + // Search functionality + function setupSearch() { + const searchInput = document.getElementById('searchInput'); + const searchResults = document.getElementById('searchResults'); + const domainsContainer = document.getElementById('allDomainsContainer'); + + if (!searchInput || !searchResults || !domainsContainer) { + console.error('Search elements not found'); + return; + } + + searchInput.addEventListener('input', (e) => { + const query = e.target.value.trim(); + + // Clear previous timeout + if (searchTimeout) { + clearTimeout(searchTimeout); + } + + // If empty, show domains list + if (!query) { + searchResults.style.display = 'none'; + domainsContainer.style.display = 'block'; + isSearching = false; + return; + } + + // Debounce search + searchTimeout = setTimeout(() => performSearch(query), 300); + }); + + // Handle Enter key + searchInput.addEventListener('keydown', (e) => { + if (e.key === 'Enter') { + const query = e.target.value.trim(); + if (query) { + if (searchTimeout) clearTimeout(searchTimeout); + performSearch(query); + } + } + }); + } + + async function performSearch(query) { + const searchResults = document.getElementById('searchResults'); + const domainsContainer = document.getElementById('allDomainsContainer'); + + isSearching = true; + domainsContainer.style.display = 'none'; + searchResults.style.display = 'block'; + searchResults.innerHTML = '
Searching...
'; + + try { + const response = await fetch('/api/search?q=' + encodeURIComponent(query) + '&limit=200'); + const results = await response.json(); + + if (!results || results.length === 0) { + searchResults.innerHTML = '
No results found
'; + return; + } + + // Group results by host + const byHost = {}; + results.forEach(r => { + const host = r.feed.source_host || 'unknown'; + if (!byHost[host]) { + byHost[host] = []; + } + byHost[host].push(r); + }); + + // Render results + searchResults.innerHTML = ''; + + Object.keys(byHost).sort().forEach(host => { + const hostDiv = document.createElement('div'); + hostDiv.className = 'search-host'; + + // Host header + const hostHeader = document.createElement('div'); + hostHeader.className = 'stat-row'; + hostHeader.style.cssText = 'cursor: pointer; background: #1a1a1a; padding: 8px; margin-bottom: 2px;'; + hostHeader.innerHTML = '' + escapeHtml(host) + '' + byHost[host].length + ' feed(s)'; + + const feedsContainer = document.createElement('div'); + feedsContainer.style.display = 'block'; + + byHost[host].forEach(result => { + const feedDiv = document.createElement('div'); + feedDiv.className = 'search-feed'; + feedDiv.style.cssText = 'padding: 8px 8px 8px 20px; border-bottom: 1px solid #222;'; + + // Feed header + let feedHtml = '
' + escapeHtml(result.feed.url) + '
'; + if (result.feed.title) { + feedHtml += '
' + escapeHtml(result.feed.title) + '
'; + } + if (result.feed.description) { + feedHtml += '
' + escapeHtml(result.feed.description.substring(0, 200)) + '
'; + } + + // Items + if (result.items && result.items.length > 0) { + feedHtml += '
'; + result.items.forEach(item => { + feedHtml += '
'; + if (item.title) { + if (item.link) { + feedHtml += '' + escapeHtml(item.title) + ''; + } else { + feedHtml += '' + escapeHtml(item.title) + ''; + } + } + let meta = []; + if (item.pub_date) { + meta.push(item.pub_date.substring(0, 10)); + } + if (item.author) { + meta.push(escapeHtml(item.author)); + } + if (meta.length > 0) { + feedHtml += '
' + meta.join(' • ') + '
'; + } + feedHtml += '
'; + }); + feedHtml += '
'; + } + + feedDiv.innerHTML = feedHtml; + + // Click on feed URL to toggle full feed info + feedDiv.querySelector('.feed-url').addEventListener('click', () => { + toggleSearchFeedInfo(result.feed.url, feedDiv); + }); + + feedsContainer.appendChild(feedDiv); + }); + + hostHeader.addEventListener('click', () => { + feedsContainer.style.display = feedsContainer.style.display === 'none' ? 'block' : 'none'; + }); + + hostDiv.appendChild(hostHeader); + hostDiv.appendChild(feedsContainer); + searchResults.appendChild(hostDiv); + }); + + } catch (err) { + console.error('Search failed:', err); + searchResults.innerHTML = '
Search failed: ' + escapeHtml(err.message) + '
'; + } + } + + async function toggleSearchFeedInfo(feedUrl, feedDiv) { + let detailsDiv = feedDiv.querySelector('.feed-details-expanded'); + + if (detailsDiv) { + detailsDiv.remove(); + return; + } + + detailsDiv = document.createElement('div'); + detailsDiv.className = 'feed-details-expanded'; + detailsDiv.style.cssText = 'padding: 10px; background: #111; margin-top: 8px; border-radius: 4px;'; + detailsDiv.innerHTML = '
Loading feed info...
'; + feedDiv.appendChild(detailsDiv); + + try { + const [infoResponse, itemsResponse] = await Promise.all([ + fetch('/api/feedInfo?url=' + encodeURIComponent(feedUrl)), + fetch('/api/feedItems?url=' + encodeURIComponent(feedUrl) + '&limit=20') + ]); + const info = await infoResponse.json(); + const items = await itemsResponse.json(); + + let html = ''; + if (info.siteUrl) html += ''; + if (info.language) html += ''; + if (info.status) html += ''; + if (info.itemCount) html += ''; + if (info.avgPostFreqHrs) html += ''; + if (info.newestItemDate) html += ''; + html += '
Site' + escapeHtml(info.siteUrl) + '
Language' + escapeHtml(info.language) + '
Status' + escapeHtml(info.status) + '
Items' + commaFormat(info.itemCount) + '
Avg Freq' + info.avgPostFreqHrs.toFixed(1) + ' hrs
Newest' + escapeHtml(info.newestItemDate) + '
'; + + if (items && items.length > 0) { + html += '
'; + html += '
Recent Items (' + items.length + ')
'; + items.forEach(item => { + html += '
'; + if (item.title && item.link) { + html += '' + escapeHtml(item.title) + ''; + } else if (item.title) { + html += '' + escapeHtml(item.title) + ''; + } + html += '
'; + }); + html += '
'; + } + + detailsDiv.innerHTML = html; + } catch (err) { + detailsDiv.innerHTML = '
Failed to load feed info
'; + } + } + + async function updateStats() { + try { + const response = await fetch('/api/stats'); + const stats = await response.json(); + + // Update domain stats + document.getElementById('totalDomains').textContent = commaFormat(stats.total_domains); + document.getElementById('checkedDomains').textContent = commaFormat(stats.checked_domains); + document.getElementById('uncheckedDomains').textContent = commaFormat(stats.unchecked_domains); + document.getElementById('crawlRate').textContent = commaFormat(stats.crawl_rate); + document.getElementById('checkRate').textContent = commaFormat(stats.check_rate); + + // Update progress bar + const progress = stats.total_domains > 0 + ? (stats.checked_domains * 100 / stats.total_domains).toFixed(1) + : 0; + document.getElementById('crawlProgress').style.width = progress + '%'; + + // Update feed stats + document.getElementById('totalFeeds').textContent = commaFormat(stats.total_feeds); + document.getElementById('rssFeeds').textContent = commaFormat(stats.rss_feeds); + document.getElementById('atomFeeds').textContent = commaFormat(stats.atom_feeds); + document.getElementById('unknownFeeds').textContent = commaFormat(stats.unknown_feeds); + + // Update timestamp + const updatedAt = new Date(stats.updated_at); + document.getElementById('updatedAt').textContent = 'Last updated: ' + + updatedAt.toISOString().replace('T', ' ').substring(0, 19); + + } catch (err) { + console.error('Failed to update stats:', err); + } + } + + // Initialize + try { + setupSearch(); + } catch (e) { + console.error('setupSearch failed:', e); + } + setupInfiniteScroll(); + loadMoreDomains(); + updateStats(); + setInterval(updateStats, 1000); +} + +window.onload = initDashboard;
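
Note on the check scheduler: CheckFeed backs off linearly rather than exponentially. Every check that finds no change (or fails) bumps NoUpdate and pushes the next check out by an extra 100 seconds per consecutive miss on top of a 100 second base, and any fetch of changed content resets the counter back to the 100 second floor. A minimal standalone sketch of that arithmetic follows; the helper and its names are illustrative, not part of the patch.

package main

import (
	"fmt"
	"time"
)

// nextCheck mirrors the backoff in CheckFeed: 100s base plus 100s for every
// consecutive check that found no change; a change resets the counter.
// (Standalone sketch; the real code sets feed.NextCrawlAt in place.)
func nextCheck(now time.Time, noUpdate int, changed bool) (time.Time, int) {
	if changed {
		return now.Add(100 * time.Second), 0
	}
	noUpdate++
	return now.Add(time.Duration(100+100*noUpdate) * time.Second), noUpdate
}

func main() {
	now := time.Now()
	n := 0
	for i := 0; i < 5; i++ {
		var next time.Time
		next, n = nextCheck(now, n, false)
		fmt.Printf("miss %d -> next check in %v\n", n, next.Sub(now))
	}
}

Five consecutive misses stretch the interval to 600 seconds; the patch sets no upper bound, so a feed that never changes keeps drifting toward longer and longer intervals.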
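
The conditional request itself uses only standard net/http headers: the stored ETag goes back as If-None-Match, the stored Last-Modified string goes back verbatim as If-Modified-Since, and a 304 answer means no body was transferred. A hedged sketch of that exchange outside the crawler; the URL and ETag value in main are placeholders.

package main

import (
	"fmt"
	"net/http"
	"time"
)

// conditionalGet issues the same kind of request CheckFeed builds: it sends
// the cached validators back to the server and reports whether the server
// answered 304 Not Modified. etag and lastModified may be empty; feedURL
// must include a scheme in this sketch.
func conditionalGet(client *http.Client, feedURL, etag, lastModified string) (bool, error) {
	req, err := http.NewRequest("GET", feedURL, nil)
	if err != nil {
		return false, err
	}
	req.Header.Set("User-Agent", "FeedCrawler/1.0")
	if etag != "" {
		req.Header.Set("If-None-Match", etag)
	}
	if lastModified != "" {
		req.Header.Set("If-Modified-Since", lastModified)
	}

	resp, err := client.Do(req)
	if err != nil {
		return false, err
	}
	defer resp.Body.Close()

	return resp.StatusCode == http.StatusNotModified, nil
}

func main() {
	client := &http.Client{Timeout: 10 * time.Second}
	// Placeholder URL and ETag, for illustration only.
	notModified, err := conditionalGet(client, "https://example.com/feed.xml", `"abc123"`, "")
	fmt.Println(notModified, err)
}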
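
SearchFeeds and SearchItems both assume FTS5 shadow tables (feeds_fts, items_fts) that share rowids with the base tables, so full-text search is a plain join plus MATCH. Below is a sketch of the same query shape against the items table; the sqlite3 driver import and the feeds/feeds.db path are assumptions here, since OpenDatabase and the schema are defined outside this diff.

package main

import (
	"database/sql"
	"fmt"

	_ "github.com/mattn/go-sqlite3" // driver choice is an assumption
)

// searchItemTitles follows the SearchItems query shape: join the FTS table
// back to the base table by rowid and let MATCH do the filtering.
func searchItemTitles(db *sql.DB, query string, limit int) ([]string, error) {
	rows, err := db.Query(`
		SELECT i.title
		FROM items i
		JOIN items_fts fts ON i.id = fts.rowid
		WHERE items_fts MATCH ?
		ORDER BY i.pubDate DESC
		LIMIT ?`, query, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var titles []string
	for rows.Next() {
		var title sql.NullString
		if err := rows.Scan(&title); err != nil {
			return nil, err
		}
		titles = append(titles, title.String)
	}
	return titles, rows.Err()
}

func main() {
	db, err := sql.Open("sqlite3", "feeds/feeds.db")
	if err != nil {
		panic(err)
	}
	defer db.Close()

	titles, err := searchItemTitles(db, "climate", 10)
	fmt.Println(titles, err)
}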