diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..2283752 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,8 @@ +1440.news +1440.db +feeds/ +*.gz +.git +.gitignore +.claude +CLAUDE.md diff --git a/.gitignore b/.gitignore index 460c35d..ef46060 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ go.* *.gz feeds/ feeds.db/ +1440.db diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..c6d849c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,37 @@ +FROM golang:1.24-alpine AS builder + +WORKDIR /app + +# Install build dependencies +RUN apk add --no-cache gcc musl-dev + +# Copy go mod files first for layer caching +COPY go.mod go.sum ./ +RUN go mod download + +# Copy source code +COPY *.go ./ +COPY static/ ./static/ + +# Build the binary +RUN CGO_ENABLED=1 go build -o 1440.news . + +# Runtime stage +FROM alpine:latest + +WORKDIR /app + +# Install runtime dependencies +RUN apk add --no-cache ca-certificates tzdata + +# Copy binary from builder +COPY --from=builder /app/1440.news . +COPY --from=builder /app/static ./static + +# Create feeds directory +RUN mkdir -p feeds + +# Expose dashboard port +EXPOSE 4321 + +CMD ["./1440.news"] diff --git a/crawler.go b/crawler.go index 215179c..1b54c09 100644 --- a/crawler.go +++ b/crawler.go @@ -1,9 +1,9 @@ package main import ( + "database/sql" "fmt" "io" - "math/rand" "net/http" "runtime" "strings" @@ -11,26 +11,33 @@ import ( "sync/atomic" "time" - "github.com/cockroachdb/pebble" "golang.org/x/net/html" ) type Crawler struct { - MaxDepth int - MaxPagesPerHost int - Timeout time.Duration - UserAgent string - visited sync.Map - feedsMu sync.Mutex - client *http.Client - hostsProcessed int32 - db *pebble.DB + MaxDepth int + MaxPagesPerHost int + Timeout time.Duration + UserAgent string + visited sync.Map + feedsMu sync.Mutex + client *http.Client + hostsProcessed int32 + feedsChecked int32 + startTime time.Time + db *sql.DB + displayedCrawlRate int + displayedCheckRate int + domainsImported int32 + cachedStats *DashboardStats + cachedAllDomains []DomainStat + statsMu sync.RWMutex } func NewCrawler(dbPath string) (*Crawler, error) { - db, err := pebble.Open(dbPath, &pebble.Options{}) + db, err := OpenDatabase(dbPath) if err != nil { - return nil, fmt.Errorf("failed to open pebble db: %v", err) + return nil, fmt.Errorf("failed to open database: %v", err) } return &Crawler{ @@ -38,6 +45,7 @@ func NewCrawler(dbPath string) (*Crawler, error) { MaxPagesPerHost: 10, Timeout: 10 * time.Second, UserAgent: "FeedCrawler/1.0", + startTime: time.Now(), db: db, client: &http.Client{ Timeout: 10 * time.Second, @@ -58,87 +66,121 @@ func (c *Crawler) Close() error { return nil } -// CrawlUncrawledDomains fetches uncrawled domains and crawls them -func (c *Crawler) CrawlUncrawledDomains() error { - domains, err := c.GetUncrawledDomains() - if err != nil { - return fmt.Errorf("failed to get uncrawled domains: %v", err) +// StartStatsLoop updates cached stats once per minute +func (c *Crawler) StartStatsLoop() { + for { + c.UpdateStats() + time.Sleep(1 * time.Minute) } +} - if len(domains) == 0 { - return nil +// StartCleanupLoop runs item cleanup once per week +func (c *Crawler) StartCleanupLoop() { + for { + deleted, err := c.CleanupOldItems() + if err != nil { + fmt.Printf("Cleanup error: %v\n", err) + } else if deleted > 0 { + fmt.Printf("Cleanup: removed %d old items\n", deleted) + } + time.Sleep(7 * 24 * time.Hour) } +} - // Shuffle for randomized crawling - rand.Shuffle(len(domains), func(i, j int) { - domains[i], domains[j] = 
domains[j], domains[i] - }) - - numWorkers := runtime.NumCPU() - 1 +// StartCrawlLoop runs the domain crawling loop independently +func (c *Crawler) StartCrawlLoop() { + numWorkers := runtime.NumCPU() if numWorkers < 1 { numWorkers = 1 } - type crawlResult struct { - host string - feedsFound int - lastError string - } - - domainChan := make(chan *Domain, numWorkers*2) - resultChan := make(chan crawlResult, numWorkers*2) - var wg sync.WaitGroup + // Buffered channel for domain work + workChan := make(chan *Domain, 256) // Start workers for i := 0; i < numWorkers; i++ { - wg.Add(1) go func() { - defer wg.Done() - for domain := range domainChan { + for domain := range workChan { feedsFound, crawlErr := c.crawlHost(domain.Host) errStr := "" if crawlErr != nil { errStr = crawlErr.Error() } - resultChan <- crawlResult{ - host: domain.Host, - feedsFound: feedsFound, - lastError: errStr, + if err := c.markDomainCrawled(domain.Host, feedsFound, errStr); err != nil { + fmt.Printf("Error marking domain %s as crawled: %v\n", domain.Host, err) } } }() } - // Start result processor - done := make(chan bool) - go func() { - for result := range resultChan { - if err := c.markDomainCrawled(result.host, result.feedsFound, result.lastError); err != nil { - fmt.Printf("Error marking domain %s as crawled: %v\n", result.host, err) - } + const fetchSize = 100 + for { + domains, err := c.GetUncheckedDomainsRandom(fetchSize) + if err != nil { + fmt.Printf("Error fetching domains: %v\n", err) } - done <- true - }() - // Send domains to workers - for _, domain := range domains { - domainChan <- domain + if len(domains) == 0 { + c.displayedCrawlRate = 0 + time.Sleep(1 * time.Second) + continue + } + + fmt.Printf("%s crawl: %d domains to check\n", time.Now().Format("15:04:05"), len(domains)) + + for _, domain := range domains { + workChan <- domain + } + + time.Sleep(1 * time.Second) + } +} + +// StartCheckLoop runs the feed checking loop independently +func (c *Crawler) StartCheckLoop() { + numWorkers := runtime.NumCPU() + if numWorkers < 1 { + numWorkers = 1 } - close(domainChan) - wg.Wait() - close(resultChan) - <-done + // Buffered channel for feed work + workChan := make(chan *Feed, 256) - return nil + // Start workers + for i := 0; i < numWorkers; i++ { + go func() { + for feed := range workChan { + c.CheckFeed(feed) + } + }() + } + + const fetchSize = 100 + for { + feeds, err := c.GetFeedsDueForCheck(fetchSize) + if err != nil { + fmt.Printf("Error fetching feeds: %v\n", err) + } + + if len(feeds) == 0 { + c.displayedCheckRate = 0 + time.Sleep(1 * time.Second) + continue + } + + fmt.Printf("%s check: %d feeds to check\n", time.Now().Format("15:04:05"), len(feeds)) + + for _, feed := range feeds { + workChan <- feed + } + + time.Sleep(1 * time.Second) + } } func (c *Crawler) crawlHost(host string) (feedsFound int, err error) { atomic.AddInt32(&c.hostsProcessed, 1) - // Count feeds before crawling - initialCount, _ := c.GetFeedCount() - localVisited := make(map[string]bool) pagesVisited := 0 @@ -148,9 +190,8 @@ func (c *Crawler) crawlHost(host string) (feedsFound int, err error) { c.crawlPage("http://"+host, host, 0, localVisited, &pagesVisited) } - // Count feeds after crawling - finalCount, _ := c.GetFeedCount() - feedsFound = finalCount - initialCount + // Count feeds found for this specific host + feedsFound, _ = c.GetFeedCountByHost(host) if pagesVisited == 0 { return feedsFound, fmt.Errorf("could not connect") diff --git a/dashboard.go b/dashboard.go index a362b2d..99d1187 100644 --- a/dashboard.go +++ 
b/dashboard.go @@ -1,21 +1,20 @@ package main import ( + "database/sql" "encoding/json" "fmt" "html/template" "net/http" - "sort" "time" ) // DashboardStats holds all statistics for the dashboard type DashboardStats struct { // Domain stats - TotalDomains int `json:"total_domains"` - CrawledDomains int `json:"crawled_domains"` - UncrawledDomains int `json:"uncrawled_domains"` - ErrorDomains int `json:"error_domains"` + TotalDomains int `json:"total_domains"` + CheckedDomains int `json:"checked_domains"` + UncheckedDomains int `json:"unchecked_domains"` // Feed stats TotalFeeds int `json:"total_feeds"` @@ -25,16 +24,8 @@ type DashboardStats struct { // Crawl progress HostsProcessed int32 `json:"hosts_processed"` - CrawlRate float64 `json:"crawl_rate"` // domains per minute - - // Top TLDs by feed count - TopTLDs []TLDStat `json:"top_tlds"` - - // Recent feeds - RecentFeeds []RecentFeed `json:"recent_feeds"` - - // Top domains by feed count - TopDomains []DomainStat `json:"top_domains"` + CrawlRate int `json:"crawl_rate"` // crawls per minute + CheckRate int `json:"check_rate"` // feed checks per minute // Timing UpdatedAt time.Time `json:"updated_at"` @@ -57,13 +48,107 @@ type DomainStat struct { FeedsFound int `json:"feeds_found"` } -// GetDashboardStats collects all statistics for the dashboard +// commaFormat formats an integer with comma separators +func commaFormat(n int) string { + s := fmt.Sprintf("%d", n) + if len(s) <= 3 { + return s + } + var result []byte + for i, c := range s { + if i > 0 && (len(s)-i)%3 == 0 { + result = append(result, ',') + } + result = append(result, byte(c)) + } + return string(result) +} + +// UpdateStats recalculates and caches dashboard statistics +func (c *Crawler) UpdateStats() { + fmt.Println("UpdateStats: calculating stats...") + stats, err := c.calculateStats() + if err != nil { + fmt.Printf("UpdateStats: error calculating stats: %v\n", err) + return + } + // Cache all domains with feeds (runs in background, so slow query is OK) + fmt.Println("UpdateStats: fetching all domains...") + allDomains := c.fetchAllDomainsFromDB() + fmt.Printf("UpdateStats: got %d domains\n", len(allDomains)) + + c.statsMu.Lock() + c.cachedStats = stats + c.cachedAllDomains = allDomains + c.statsMu.Unlock() + fmt.Println("UpdateStats: complete") +} + +func (c *Crawler) fetchAllDomainsFromDB() []DomainStat { + rows, err := c.db.Query(` + SELECT tld, sourceHost, COUNT(*) as cnt FROM feeds + GROUP BY tld, sourceHost + ORDER BY tld, sourceHost + `) + if err != nil { + fmt.Printf("fetchAllDomainsFromDB error: %v\n", err) + return nil + } + defer rows.Close() + + var domains []DomainStat + for rows.Next() { + var ds DomainStat + var tld string + if err := rows.Scan(&tld, &ds.Host, &ds.FeedsFound); err != nil { + continue + } + domains = append(domains, ds) + } + return domains +} + +// GetDashboardStats returns cached statistics (returns empty stats if not yet cached) func (c *Crawler) GetDashboardStats() (*DashboardStats, error) { + c.statsMu.RLock() + stats := c.cachedStats + c.statsMu.RUnlock() + + if stats != nil { + return stats, nil + } + // Return empty stats while background calculation runs (don't block HTTP requests) + return &DashboardStats{UpdatedAt: time.Now()}, nil +} + +// calculateStats collects all statistics for the dashboard +func (c *Crawler) calculateStats() (*DashboardStats, error) { stats := &DashboardStats{ - UpdatedAt: time.Now(), + UpdatedAt: time.Now(), HostsProcessed: c.hostsProcessed, } + // Calculate crawl rate (crawls per minute), smoothed by +/-1 
per update + elapsed := time.Since(c.startTime).Minutes() + if elapsed > 0 { + actualRate := int(float64(c.hostsProcessed) / elapsed) + if actualRate > c.displayedCrawlRate { + c.displayedCrawlRate++ + } else if actualRate < c.displayedCrawlRate { + c.displayedCrawlRate-- + } + stats.CrawlRate = c.displayedCrawlRate + + // Calculate check rate (feed checks per minute), smoothed by +/-1 per update + actualCheckRate := int(float64(c.feedsChecked) / elapsed) + if actualCheckRate > c.displayedCheckRate { + c.displayedCheckRate++ + } else if actualCheckRate < c.displayedCheckRate { + c.displayedCheckRate-- + } + stats.CheckRate = c.displayedCheckRate + } + // Get domain stats if err := c.collectDomainStats(stats); err != nil { return nil, err @@ -78,148 +163,455 @@ func (c *Crawler) GetDashboardStats() (*DashboardStats, error) { } func (c *Crawler) collectDomainStats(stats *DashboardStats) error { - iter, err := c.db.NewIter(nil) + // Use MAX(rowid) for fast approximate total count + err := c.db.QueryRow("SELECT COALESCE(MAX(rowid), 0) FROM domains").Scan(&stats.TotalDomains) if err != nil { return err } - defer iter.Close() - domainFeeds := make(map[string]int) + // Single query to get all status counts (one index scan instead of three) + rows, err := c.db.Query("SELECT status, COUNT(*) FROM domains GROUP BY status") + if err != nil { + return err + } + defer rows.Close() - for iter.SeekGE([]byte("domain:")); iter.Valid(); iter.Next() { - key := string(iter.Key()) - if len(key) < 7 || key[:7] != "domain:" { - break - } - - var domain Domain - if err := json.Unmarshal(iter.Value(), &domain); err != nil { + for rows.Next() { + var status string + var count int + if err := rows.Scan(&status, &count); err != nil { continue } - - stats.TotalDomains++ - switch domain.Status { - case "crawled": - stats.CrawledDomains++ - if domain.FeedsFound > 0 { - domainFeeds[domain.Host] = domain.FeedsFound - } - case "uncrawled": - stats.UncrawledDomains++ - case "error": - stats.ErrorDomains++ + switch status { + case "checked": + stats.CheckedDomains = count + case "unchecked": + stats.UncheckedDomains = count } } - - // Top domains by feed count - type kv struct { - Host string - Count int - } - var sorted []kv - for h, c := range domainFeeds { - sorted = append(sorted, kv{h, c}) - } - sort.Slice(sorted, func(i, j int) bool { - return sorted[i].Count > sorted[j].Count - }) - for i := 0; i < len(sorted) && i < 10; i++ { - stats.TopDomains = append(stats.TopDomains, DomainStat{ - Host: sorted[i].Host, - FeedsFound: sorted[i].Count, - }) + if err := rows.Err(); err != nil { + return err } - return iter.Error() + return rows.Err() } func (c *Crawler) collectFeedStats(stats *DashboardStats) error { - iter, err := c.db.NewIter(nil) + // Use MAX(rowid) for fast approximate total count + err := c.db.QueryRow("SELECT COALESCE(MAX(rowid), 0) FROM feeds").Scan(&stats.TotalFeeds) if err != nil { return err } - defer iter.Close() - tldCounts := make(map[string]int) - var recentFeeds []RecentFeed + // Single query to get all type counts (one index scan instead of three) + rows, err := c.db.Query("SELECT type, COUNT(*) FROM feeds GROUP BY type") + if err != nil { + return err + } + defer rows.Close() - for iter.SeekGE([]byte("feed:")); iter.Valid(); iter.Next() { - key := string(iter.Key()) - if len(key) < 5 || key[:5] != "feed:" { - break - } - - var feed Feed - if err := json.Unmarshal(iter.Value(), &feed); err != nil { + for rows.Next() { + var feedType sql.NullString + var count int + if err := rows.Scan(&feedType, 
&count); err != nil { continue } - - stats.TotalFeeds++ - switch feed.Type { + switch feedType.String { case "rss": - stats.RSSFeeds++ + stats.RSSFeeds = count case "atom": - stats.AtomFeeds++ + stats.AtomFeeds = count default: - stats.UnknownFeeds++ + stats.UnknownFeeds += count } - - if feed.TLD != "" { - tldCounts[feed.TLD]++ - } - - recentFeeds = append(recentFeeds, RecentFeed{ - URL: feed.URL, - Title: feed.Title, - Type: feed.Type, - DiscoveredAt: feed.DiscoveredAt, - }) } - - // Top TLDs - type kv struct { - TLD string - Count int - } - var sortedTLDs []kv - for t, c := range tldCounts { - sortedTLDs = append(sortedTLDs, kv{t, c}) - } - sort.Slice(sortedTLDs, func(i, j int) bool { - return sortedTLDs[i].Count > sortedTLDs[j].Count - }) - for i := 0; i < len(sortedTLDs) && i < 10; i++ { - stats.TopTLDs = append(stats.TopTLDs, TLDStat{ - TLD: sortedTLDs[i].TLD, - Count: sortedTLDs[i].Count, - }) - } - - // Recent feeds (last 20, sorted by discovery time) - sort.Slice(recentFeeds, func(i, j int) bool { - return recentFeeds[i].DiscoveredAt.After(recentFeeds[j].DiscoveredAt) - }) - if len(recentFeeds) > 20 { - recentFeeds = recentFeeds[:20] - } - stats.RecentFeeds = recentFeeds - - return iter.Error() + return rows.Err() } // StartDashboard starts the web dashboard server func (c *Crawler) StartDashboard(addr string) error { - http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + http.HandleFunc("/dashboard", func(w http.ResponseWriter, r *http.Request) { c.handleDashboard(w, r) }) http.HandleFunc("/api/stats", func(w http.ResponseWriter, r *http.Request) { c.handleAPIStats(w, r) }) + http.HandleFunc("/api/allDomains", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIAllDomains(w, r) + }) + http.HandleFunc("/api/domainFeeds", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIDomainFeeds(w, r) + }) + http.HandleFunc("/api/feedInfo", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIFeedInfo(w, r) + }) + http.HandleFunc("/api/feedItems", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIFeedItems(w, r) + }) + http.HandleFunc("/api/search", func(w http.ResponseWriter, r *http.Request) { + c.handleAPISearch(w, r) + }) + http.HandleFunc("/static/", func(w http.ResponseWriter, r *http.Request) { + http.StripPrefix("/static/", http.FileServer(http.Dir("static"))).ServeHTTP(w, r) + }) fmt.Printf("Dashboard running at http://%s\n", addr) return http.ListenAndServe(addr, nil) } +func (c *Crawler) handleAPIAllDomains(w http.ResponseWriter, r *http.Request) { + offset := 0 + limit := 100 + if o := r.URL.Query().Get("offset"); o != "" { + fmt.Sscanf(o, "%d", &offset) + } + if l := r.URL.Query().Get("limit"); l != "" { + fmt.Sscanf(l, "%d", &limit) + if limit > 100 { + limit = 100 + } + } + + // Serve from cache (updated once per minute in background) + c.statsMu.RLock() + cached := c.cachedAllDomains + c.statsMu.RUnlock() + + var domains []DomainStat + if cached != nil && offset < len(cached) { + end := offset + limit + if end > len(cached) { + end = len(cached) + } + domains = cached[offset:end] + } + if domains == nil { + domains = []DomainStat{} + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(domains) +} + +func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) { + host := r.URL.Query().Get("host") + if host == "" { + http.Error(w, "host parameter required", http.StatusBadRequest) + return + } + + rows, err := c.db.Query(` + SELECT url, title, type FROM feeds + WHERE sourceHost = 
? + ORDER BY url ASC + LIMIT 1000 + `, host) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + defer rows.Close() + + type FeedInfo struct { + URL string `json:"url"` + Title string `json:"title"` + Type string `json:"type"` + } + + var feeds []FeedInfo + for rows.Next() { + var f FeedInfo + var title sql.NullString + if err := rows.Scan(&f.URL, &title, &f.Type); err != nil { + continue + } + if title.Valid { + f.Title = title.String + } + feeds = append(feeds, f) + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(feeds) +} + +func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) { + feedURL := r.URL.Query().Get("url") + if feedURL == "" { + http.Error(w, "url parameter required", http.StatusBadRequest) + return + } + + type FeedDetails struct { + URL string `json:"url"` + Type string `json:"type,omitempty"` + Title string `json:"title,omitempty"` + Description string `json:"description,omitempty"` + Language string `json:"language,omitempty"` + SiteURL string `json:"siteUrl,omitempty"` + DiscoveredAt string `json:"discoveredAt,omitempty"` + LastCrawledAt string `json:"lastCrawledAt,omitempty"` + LastBuildDate string `json:"lastBuildDate,omitempty"` + TTLMinutes int `json:"ttlMinutes,omitempty"` + UpdatePeriod string `json:"updatePeriod,omitempty"` + UpdateFreq int `json:"updateFreq,omitempty"` + Status string `json:"status,omitempty"` + ErrorCount int `json:"errorCount,omitempty"` + LastError string `json:"lastError,omitempty"` + ItemCount int `json:"itemCount,omitempty"` + AvgPostFreqHrs float64 `json:"avgPostFreqHrs,omitempty"` + OldestItemDate string `json:"oldestItemDate,omitempty"` + NewestItemDate string `json:"newestItemDate,omitempty"` + } + + var f FeedDetails + var title, description, language, siteUrl, lastCrawledAt, lastBuildDate sql.NullString + var updatePeriod, status, lastError, oldestItemDate, newestItemDate sql.NullString + var ttlMinutes, updateFreq, errorCount, itemCount sql.NullInt64 + var avgPostFreqHrs sql.NullFloat64 + + err := c.db.QueryRow(` + SELECT url, type, title, description, language, siteUrl, + discoveredAt, lastCrawledAt, lastBuildDate, + ttlMinutes, updatePeriod, updateFreq, + status, errorCount, lastError, + itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate + FROM feeds WHERE url = ? 
+ `, feedURL).Scan( + &f.URL, &f.Type, &title, &description, &language, &siteUrl, + &f.DiscoveredAt, &lastCrawledAt, &lastBuildDate, + &ttlMinutes, &updatePeriod, &updateFreq, + &status, &errorCount, &lastError, + &itemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate, + ) + + if err == sql.ErrNoRows { + http.Error(w, "feed not found", http.StatusNotFound) + return + } + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + if title.Valid { + f.Title = title.String + } + if description.Valid { + f.Description = description.String + } + if language.Valid { + f.Language = language.String + } + if siteUrl.Valid { + f.SiteURL = siteUrl.String + } + if lastCrawledAt.Valid { + f.LastCrawledAt = lastCrawledAt.String + } + if lastBuildDate.Valid { + f.LastBuildDate = lastBuildDate.String + } + if ttlMinutes.Valid { + f.TTLMinutes = int(ttlMinutes.Int64) + } + if updatePeriod.Valid { + f.UpdatePeriod = updatePeriod.String + } + if updateFreq.Valid { + f.UpdateFreq = int(updateFreq.Int64) + } + if status.Valid { + f.Status = status.String + } + if errorCount.Valid { + f.ErrorCount = int(errorCount.Int64) + } + if lastError.Valid { + f.LastError = lastError.String + } + if itemCount.Valid { + f.ItemCount = int(itemCount.Int64) + } + if avgPostFreqHrs.Valid { + f.AvgPostFreqHrs = avgPostFreqHrs.Float64 + } + if oldestItemDate.Valid { + f.OldestItemDate = oldestItemDate.String + } + if newestItemDate.Valid { + f.NewestItemDate = newestItemDate.String + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(f) +} + +func (c *Crawler) handleAPIFeedItems(w http.ResponseWriter, r *http.Request) { + feedURL := r.URL.Query().Get("url") + if feedURL == "" { + http.Error(w, "url parameter required", http.StatusBadRequest) + return + } + + limit := 50 + if l := r.URL.Query().Get("limit"); l != "" { + fmt.Sscanf(l, "%d", &limit) + if limit > 100 { + limit = 100 + } + } + + items, err := c.GetItemsByFeed(feedURL, limit) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + if items == nil { + items = []*Item{} + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(items) +} + +// SearchResult represents a search result with feed and matching items +type SearchResult struct { + Feed SearchFeed `json:"feed"` + Items []SearchItem `json:"items"` +} + +type SearchFeed struct { + URL string `json:"url"` + Title string `json:"title"` + Description string `json:"description"` + Type string `json:"type"` + SourceHost string `json:"source_host"` + Status string `json:"status"` +} + +type SearchItem struct { + ID int64 `json:"id"` + Title string `json:"title"` + Link string `json:"link"` + Description string `json:"description"` + Author string `json:"author"` + PubDate string `json:"pub_date"` +} + +func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) { + query := r.URL.Query().Get("q") + if query == "" { + http.Error(w, "q parameter required", http.StatusBadRequest) + return + } + + limit := 100 + if l := r.URL.Query().Get("limit"); l != "" { + fmt.Sscanf(l, "%d", &limit) + if limit > 500 { + limit = 500 + } + } + + // Results map: feedURL -> SearchResult + results := make(map[string]*SearchResult) + + // Search feeds + feedRows, err := c.db.Query(` + SELECT f.url, f.title, f.description, f.type, f.sourceHost, f.status + FROM feeds f + JOIN feeds_fts fts ON f.rowid = fts.rowid + WHERE feeds_fts MATCH ? + LIMIT ? 
+ `, query, limit) + if err == nil { + defer feedRows.Close() + for feedRows.Next() { + var url string + var title, description, feedType, sourceHost, status sql.NullString + if err := feedRows.Scan(&url, &title, &description, &feedType, &sourceHost, &status); err != nil { + continue + } + results[url] = &SearchResult{ + Feed: SearchFeed{ + URL: url, + Title: title.String, + Description: description.String, + Type: feedType.String, + SourceHost: sourceHost.String, + Status: status.String, + }, + Items: []SearchItem{}, + } + } + } + + // Search items + itemRows, err := c.db.Query(` + SELECT i.id, i.feedUrl, i.title, i.link, i.description, i.author, i.pubDate + FROM items i + JOIN items_fts fts ON i.id = fts.rowid + WHERE items_fts MATCH ? + ORDER BY i.pubDate DESC + LIMIT ? + `, query, limit) + if err == nil { + defer itemRows.Close() + for itemRows.Next() { + var id int64 + var feedUrl string + var title, link, description, author, pubDate sql.NullString + if err := itemRows.Scan(&id, &feedUrl, &title, &link, &description, &author, &pubDate); err != nil { + continue + } + + item := SearchItem{ + ID: id, + Title: title.String, + Link: link.String, + Description: description.String, + Author: author.String, + PubDate: pubDate.String, + } + + // Add to existing result or create new one + if result, exists := results[feedUrl]; exists { + result.Items = append(result.Items, item) + } else { + // Fetch feed info for this item's feed + var fTitle, fDesc, fType, fHost, fStatus sql.NullString + c.db.QueryRow(` + SELECT title, description, type, sourceHost, status + FROM feeds WHERE url = ? + `, feedUrl).Scan(&fTitle, &fDesc, &fType, &fHost, &fStatus) + + results[feedUrl] = &SearchResult{ + Feed: SearchFeed{ + URL: feedUrl, + Title: fTitle.String, + Description: fDesc.String, + Type: fType.String, + SourceHost: fHost.String, + Status: fStatus.String, + }, + Items: []SearchItem{item}, + } + } + } + } + + // Convert map to slice + var resultList []SearchResult + for _, r := range results { + resultList = append(resultList, *r) + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(resultList) +} + func (c *Crawler) handleDashboard(w http.ResponseWriter, r *http.Request) { stats, err := c.GetDashboardStats() if err != nil { @@ -228,14 +620,28 @@ func (c *Crawler) handleDashboard(w http.ResponseWriter, r *http.Request) { } funcMap := template.FuncMap{ - "divf": func(a, b int) float64 { + "pct": func(a, b int) float64 { if b == 0 { return 0 } - return float64(a) / float64(b) + return float64(a) * 100.0 / float64(b) }, - "mulf": func(a int, b float64) float64 { - return float64(a) * b + "comma": func(n interface{}) string { + var val int + switch v := n.(type) { + case int: + val = v + case int32: + val = int(v) + case int64: + val = int(v) + default: + return "0" + } + if val < 0 { + return "-" + commaFormat(-val) + } + return commaFormat(val) }, } @@ -265,58 +671,8 @@ const dashboardHTML = `
-            <tr>
-                <th>URL</th>
-                <th>Title</th>
-                <th>Type</th>
-                <th>Discovered</th>
-            </tr>
-            {{range .RecentFeeds}}
-            <tr>
-                <td>{{.URL}}</td>
-                <td>{{if .Title}}{{.Title}}{{else}}-{{end}}</td>
-                <td>{{.Type}}</td>
-                <td>{{.DiscoveredAt.Format "15:04:05"}}</td>
-            </tr>
-            {{end}}
-            {{if not .RecentFeeds}}
-            <tr><td colspan="4">No feeds discovered yet</td></tr>
-            {{end}}
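For reviewers, a minimal sketch of how the new background loops and dashboard could be wired together at startup. main.go is not part of this diff, so treat this as an assumption: the db path comes from the `1440.db` entries in .gitignore/.dockerignore and the port from the Dockerfile's `EXPOSE 4321`.

```go
package main

import "log"

func main() {
	// Assumed wiring; the real entrypoint is not shown in this diff.
	crawler, err := NewCrawler("1440.db")
	if err != nil {
		log.Fatalf("failed to open crawler: %v", err)
	}
	defer crawler.Close()

	// The loops added in this change run independently of one another.
	go crawler.StartStatsLoop()   // refresh cached dashboard stats once per minute
	go crawler.StartCleanupLoop() // purge old items once per week
	go crawler.StartCrawlLoop()   // discover feeds on unchecked domains
	go crawler.StartCheckLoop()   // poll known feeds that are due for a check

	// Serve the dashboard and JSON API on the port exposed by the Dockerfile.
	log.Fatal(crawler.StartDashboard("0.0.0.0:4321"))
}
```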