crawler/dashboard.go
primal 7ec4207173 Migrate to normalized FK schema (domain_host, domain_tld)
Replace source_host column with proper FK to domains table using
composite key (domain_host, domain_tld). This enables JOIN queries
instead of string concatenation for domain lookups.

Changes:
- Update Feed struct: SourceHost/TLD → DomainHost/DomainTLD
- Update all SQL queries to use domain_host/domain_tld columns
- Add column aliases (as source_host) for API backwards compatibility
- Update trigram index from source_host to domain_host
- Add getDomainHost() helper for extracting host from domain

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 22:36:25 -05:00
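
For context, the alias pattern described above can be sketched with a hypothetical lookup against the new composite key. The domains table's column names (host, tld) are assumptions here, since that schema is not shown on this page:

	rows, err := c.db.Query(`
		SELECT f.url, f.domain_host || '.' || f.domain_tld AS source_host
		FROM feeds f
		JOIN domains d ON d.host = f.domain_host AND d.tld = f.domain_tld
	`)

The alias keeps existing API consumers reading source_host while storage moves to the normalized (domain_host, domain_tld) pair.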

266 lines
6.7 KiB
Go

package main

import (
	"fmt"
	"time"
)
// DashboardStats holds all statistics for the dashboard
type DashboardStats struct {
	// Domain stats
	TotalDomains int `json:"total_domains"`
	HoldDomains  int `json:"hold_domains"`
	PassDomains  int `json:"pass_domains"`
	SkipDomains  int `json:"skip_domains"`
	DeadDomains  int `json:"dead_domains"`

	// Feed stats
	TotalFeeds   int `json:"total_feeds"`
	AliveFeeds   int `json:"alive_feeds"`   // status='pass' (healthy feeds)
	PublishFeeds int `json:"publish_feeds"` // publish_status='pass' (approved for publishing)
	SkipFeeds    int `json:"skip_feeds"`
	HoldFeeds    int `json:"hold_feeds"`
	DeadFeeds    int `json:"dead_feeds"`
	EmptyFeeds   int `json:"empty_feeds"`
	RSSFeeds     int `json:"rss_feeds"`
	AtomFeeds    int `json:"atom_feeds"`
	JSONFeeds    int `json:"json_feeds"`
	UnknownFeeds int `json:"unknown_feeds"`

	// Processing counters and rates
	DomainsCrawled  int32 `json:"domains_crawled"`   // total feed_crawl operations (a count, not a rate)
	DomainCheckRate int   `json:"domain_check_rate"` // domain_check per minute
	FeedCrawlRate   int   `json:"feed_crawl_rate"`   // feed_crawl per minute
	FeedCheckRate   int   `json:"feed_check_rate"`   // feed_check per minute

	// Timing
	UpdatedAt time.Time `json:"updated_at"`
}
type TLDStat struct {
	TLD   string `json:"tld"`
	Count int    `json:"count"`
}

type RecentFeed struct {
	URL          string    `json:"url"`
	Title        string    `json:"title"`
	Type         string    `json:"type"`
	DiscoveredAt time.Time `json:"discovered_at"`
}

type DomainStat struct {
	Host       string `json:"host"`
	FeedsFound int    `json:"feeds_found"`
}
// commaFormat formats an integer with comma separators
func commaFormat(n int) string {
	if n < 0 {
		// Format the magnitude and re-attach the sign, so the comma
		// placement never counts the '-' as a digit.
		return "-" + commaFormat(-n)
	}
	s := fmt.Sprintf("%d", n)
	if len(s) <= 3 {
		return s
	}
	var result []byte
	for i, c := range s {
		if i > 0 && (len(s)-i)%3 == 0 {
			result = append(result, ',')
		}
		result = append(result, byte(c))
	}
	return string(result)
}
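
// For example (illustrative values, not from the running system):
//
//	commaFormat(42)      // "42"
//	commaFormat(1234567) // "1,234,567"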
// UpdateStats recalculates and caches dashboard statistics
func (c *Crawler) UpdateStats() {
	fmt.Println("UpdateStats: calculating stats...")
	stats, err := c.calculateStats()
	if err != nil {
		fmt.Printf("UpdateStats: error calculating stats: %v\n", err)
		return
	}
	// Cache all domains with feeds (runs in background, so slow query is OK)
	fmt.Println("UpdateStats: fetching all domains...")
	allDomains := c.fetchAllDomainsFromDB()
	fmt.Printf("UpdateStats: got %d domains\n", len(allDomains))
	c.statsMu.Lock()
	c.cachedStats = stats
	c.cachedAllDomains = allDomains
	c.statsMu.Unlock()
	fmt.Println("UpdateStats: complete")
}
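
// How UpdateStats might be driven; the scheduling is not shown in this
// file, so the goroutine and interval below are assumptions:
//
//	go func() {
//		for range time.Tick(time.Minute) {
//			c.UpdateStats()
//		}
//	}()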
func (c *Crawler) fetchAllDomainsFromDB() []DomainStat {
	// The concatenation rebuilds the legacy "host.tld" form from the
	// normalized (domain_host, domain_tld) columns, mirroring the
	// source_host alias used for API backwards compatibility.
	rows, err := c.db.Query(`
		SELECT domain_tld AS tld, domain_host || '.' || domain_tld AS source_host, COUNT(*) AS cnt
		FROM feeds
		GROUP BY domain_tld, domain_host
		ORDER BY domain_tld, domain_host
	`)
	if err != nil {
		fmt.Printf("fetchAllDomainsFromDB error: %v\n", err)
		return nil
	}
	defer rows.Close()
	var domains []DomainStat
	for rows.Next() {
		var ds DomainStat
		var tld string
		if err := rows.Scan(&tld, &ds.Host, &ds.FeedsFound); err != nil {
			continue
		}
		domains = append(domains, ds)
	}
	return domains
}
// GetDashboardStats returns cached statistics (returns empty stats if not yet cached)
func (c *Crawler) GetDashboardStats() (*DashboardStats, error) {
	c.statsMu.RLock()
	stats := c.cachedStats
	c.statsMu.RUnlock()
	if stats != nil {
		return stats, nil
	}
	// Return empty stats while background calculation runs (don't block HTTP requests)
	return &DashboardStats{UpdatedAt: time.Now()}, nil
}
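
// A hypothetical HTTP handler built on the cached read path; the actual
// route wiring is not part of this file, so the names below are
// assumptions:
//
//	func (c *Crawler) handleStats(w http.ResponseWriter, r *http.Request) {
//		stats, _ := c.GetDashboardStats()
//		_ = json.NewEncoder(w).Encode(stats)
//	}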
// calculateStats collects all statistics for the dashboard
func (c *Crawler) calculateStats() (*DashboardStats, error) {
	stats := &DashboardStats{
		UpdatedAt:      time.Now(),
		DomainsCrawled: c.domainsCrawled,
	}
	// Calculate rates (per minute)
	elapsed := time.Since(c.startTime).Minutes()
	if elapsed > 0 {
		stats.DomainCheckRate = int(float64(c.domainsChecked) / elapsed)
		stats.FeedCrawlRate = int(float64(c.domainsCrawled) / elapsed)
		stats.FeedCheckRate = int(float64(c.feedsChecked) / elapsed)
	}
	// Get domain stats
	if err := c.collectDomainStats(stats); err != nil {
		return nil, err
	}
	// Get feed stats
	if err := c.collectFeedStats(stats); err != nil {
		return nil, err
	}
	return stats, nil
}
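
// Worked example of the rate math (illustrative numbers only): with
// c.feedsChecked = 9000 and 45 minutes since startup,
// FeedCheckRate = int(9000 / 45.0) = 200 checks per minute.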
func (c *Crawler) collectDomainStats(stats *DashboardStats) error {
	// Use COUNT(*) for total count
	err := c.db.QueryRow("SELECT COUNT(*) FROM domains").Scan(&stats.TotalDomains)
	if err != nil {
		return err
	}
	// Single query to get all status counts (one index scan instead of one per status)
	rows, err := c.db.Query("SELECT status, COUNT(*) FROM domains GROUP BY status")
	if err != nil {
		return err
	}
	defer rows.Close()
	for rows.Next() {
		var status string
		var count int
		if err := rows.Scan(&status, &count); err != nil {
			continue
		}
		switch status {
		case "hold":
			stats.HoldDomains = count
		case "pass":
			stats.PassDomains = count
		case "skip":
			stats.SkipDomains = count
		case "dead":
			stats.DeadDomains = count
		}
	}
	return rows.Err()
}
func (c *Crawler) collectFeedStats(stats *DashboardStats) error {
	// Use COUNT(*) for total count
	err := c.db.QueryRow("SELECT COUNT(*) FROM feeds").Scan(&stats.TotalFeeds)
	if err != nil {
		return err
	}
	// Get status counts
	statusRows, err := c.db.Query("SELECT status, COUNT(*) FROM feeds GROUP BY status")
	if err != nil {
		return err
	}
	defer statusRows.Close()
	for statusRows.Next() {
		var status *string // pointer so NULL statuses scan cleanly
		var count int
		if err := statusRows.Scan(&status, &count); err != nil {
			continue
		}
		if status != nil {
			switch *status {
			case "pass":
				stats.AliveFeeds = count
			case "skip":
				stats.SkipFeeds = count
			case "hold":
				stats.HoldFeeds = count
			case "dead":
				stats.DeadFeeds = count
			}
		}
	}
	if err := statusRows.Err(); err != nil {
		return err
	}
	// Count feeds approved for publishing (publish_status='pass').
	// Best-effort: on error the count simply stays at zero.
	_ = c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE publish_status = 'pass'").Scan(&stats.PublishFeeds)
	// Count empty feeds (item_count = 0 or NULL), also best-effort
	_ = c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE item_count IS NULL OR item_count = 0").Scan(&stats.EmptyFeeds)
	// Single query to get all type counts (one index scan instead of three)
	rows, err := c.db.Query("SELECT type, COUNT(*) FROM feeds GROUP BY type")
	if err != nil {
		return err
	}
	defer rows.Close()
	for rows.Next() {
		var feedType *string
		var count int
		if err := rows.Scan(&feedType, &count); err != nil {
			continue
		}
		if feedType == nil {
			stats.UnknownFeeds += count
		} else {
			switch *feedType {
			case "rss":
				stats.RSSFeeds = count
			case "atom":
				stats.AtomFeeds = count
			case "json":
				stats.JSONFeeds = count
			default:
				stats.UnknownFeeds += count
			}
		}
	}
	return rows.Err()
}
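
// The commit message mentions a getDomainHost() helper for extracting the
// host part of a domain, but it is not shown on this page. A minimal
// sketch of what it might look like (signature and behavior are
// assumptions, and it would need "strings" imported):
//
//	func getDomainHost(domain, tld string) string {
//		// "example.com" with tld "com" -> "example"
//		return strings.TrimSuffix(domain, "."+tld)
//	}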