Files
crawler/dashboard.go
2026-01-26 16:02:05 -05:00

743 lines
20 KiB
Go

package main
import (
"database/sql"
"encoding/json"
"fmt"
"html/template"
"net/http"
"time"
)
// DashboardStats holds all statistics for the dashboard
// DashboardStats holds all statistics for the dashboard. It is built by
// calculateStats, cached on the Crawler under statsMu, and serialized as
// JSON by the /api/stats handler.
type DashboardStats struct {
// Domain stats
TotalDomains int `json:"total_domains"` // approximate: MAX(rowid), see collectDomainStats
CheckedDomains int `json:"checked_domains"`
UncheckedDomains int `json:"unchecked_domains"`
// Feed stats
TotalFeeds int `json:"total_feeds"` // approximate: MAX(rowid), see collectFeedStats
RSSFeeds int `json:"rss_feeds"`
AtomFeeds int `json:"atom_feeds"`
UnknownFeeds int `json:"unknown_feeds"` // feeds whose type column is NULL or unrecognized
// Crawl progress
HostsProcessed int32 `json:"hosts_processed"`
CrawlRate int `json:"crawl_rate"` // crawls per minute (smoothed +/-1 per update, see calculateStats)
CheckRate int `json:"check_rate"` // feed checks per minute (smoothed likewise)
// Timing
UpdatedAt time.Time `json:"updated_at"` // when these stats were computed
}
// TLDStat pairs a top-level domain with a count for JSON output.
// NOTE(review): not referenced anywhere in this file — presumably used
// elsewhere in the package; confirm before removing.
type TLDStat struct {
TLD string `json:"tld"`
Count int `json:"count"`
}
// RecentFeed describes a recently discovered feed for JSON output.
// NOTE(review): not referenced anywhere in this file — presumably used
// elsewhere in the package; confirm before removing.
type RecentFeed struct {
URL string `json:"url"`
Title string `json:"title"`
Type string `json:"type"`
DiscoveredAt time.Time `json:"discovered_at"`
}
// DomainStat is one row of the cached per-domain listing: a source host
// and the number of feeds discovered on it (see fetchAllDomainsFromDB
// and the /api/allDomains handler).
type DomainStat struct {
Host string `json:"host"`
FeedsFound int `json:"feeds_found"`
}
// commaFormat formats an integer with comma separators
func commaFormat(n int) string {
s := fmt.Sprintf("%d", n)
if len(s) <= 3 {
return s
}
var result []byte
for i, c := range s {
if i > 0 && (len(s)-i)%3 == 0 {
result = append(result, ',')
}
result = append(result, byte(c))
}
return string(result)
}
// UpdateStats recalculates and caches dashboard statistics
func (c *Crawler) UpdateStats() {
fmt.Println("UpdateStats: calculating stats...")
stats, err := c.calculateStats()
if err != nil {
fmt.Printf("UpdateStats: error calculating stats: %v\n", err)
return
}
// Cache all domains with feeds (runs in background, so slow query is OK)
fmt.Println("UpdateStats: fetching all domains...")
allDomains := c.fetchAllDomainsFromDB()
fmt.Printf("UpdateStats: got %d domains\n", len(allDomains))
c.statsMu.Lock()
c.cachedStats = stats
c.cachedAllDomains = allDomains
c.statsMu.Unlock()
fmt.Println("UpdateStats: complete")
}
// fetchAllDomainsFromDB loads every (sourceHost, feed count) pair from the
// feeds table, ordered by TLD then host. Errors are logged rather than
// propagated because this populates a best-effort background cache; a nil
// slice is returned when the query itself fails.
func (c *Crawler) fetchAllDomainsFromDB() []DomainStat {
	rows, err := c.db.Query(`
SELECT tld, sourceHost, COUNT(*) as cnt FROM feeds
GROUP BY tld, sourceHost
ORDER BY tld, sourceHost
`)
	if err != nil {
		fmt.Printf("fetchAllDomainsFromDB error: %v\n", err)
		return nil
	}
	defer rows.Close()
	var domains []DomainStat
	for rows.Next() {
		var ds DomainStat
		var tld string // selected only for grouping/ordering; not returned
		if err := rows.Scan(&tld, &ds.Host, &ds.FeedsFound); err != nil {
			continue // skip malformed rows rather than abort the whole cache
		}
		domains = append(domains, ds)
	}
	// Fix: the original dropped rows.Err(), so a mid-iteration failure
	// silently truncated the cached listing. Log it so truncation is visible.
	if err := rows.Err(); err != nil {
		fmt.Printf("fetchAllDomainsFromDB iteration error: %v\n", err)
	}
	return domains
}
// GetDashboardStats returns cached statistics (returns empty stats if not yet cached)
func (c *Crawler) GetDashboardStats() (*DashboardStats, error) {
c.statsMu.RLock()
stats := c.cachedStats
c.statsMu.RUnlock()
if stats != nil {
return stats, nil
}
// Return empty stats while background calculation runs (don't block HTTP requests)
return &DashboardStats{UpdatedAt: time.Now()}, nil
}
// calculateStats collects all statistics for the dashboard
func (c *Crawler) calculateStats() (*DashboardStats, error) {
stats := &DashboardStats{
UpdatedAt: time.Now(),
HostsProcessed: c.hostsProcessed,
}
// Calculate crawl rate (crawls per minute), smoothed by +/-1 per update
elapsed := time.Since(c.startTime).Minutes()
if elapsed > 0 {
actualRate := int(float64(c.hostsProcessed) / elapsed)
if actualRate > c.displayedCrawlRate {
c.displayedCrawlRate++
} else if actualRate < c.displayedCrawlRate {
c.displayedCrawlRate--
}
stats.CrawlRate = c.displayedCrawlRate
// Calculate check rate (feed checks per minute), smoothed by +/-1 per update
actualCheckRate := int(float64(c.feedsChecked) / elapsed)
if actualCheckRate > c.displayedCheckRate {
c.displayedCheckRate++
} else if actualCheckRate < c.displayedCheckRate {
c.displayedCheckRate--
}
stats.CheckRate = c.displayedCheckRate
}
// Get domain stats
if err := c.collectDomainStats(stats); err != nil {
return nil, err
}
// Get feed stats
if err := c.collectFeedStats(stats); err != nil {
return nil, err
}
return stats, nil
}
// collectDomainStats fills in the domain-related fields of stats: the
// approximate total plus per-status ("checked"/"unchecked") counts.
func (c *Crawler) collectDomainStats(stats *DashboardStats) error {
	// MAX(rowid) is an O(1) approximation of COUNT(*), which would scan the
	// table. Assumes domains rows are never deleted — TODO confirm.
	err := c.db.QueryRow("SELECT COALESCE(MAX(rowid), 0) FROM domains").Scan(&stats.TotalDomains)
	if err != nil {
		return err
	}
	// Single GROUP BY query gets all status counts in one index scan
	// instead of one COUNT query per status.
	rows, err := c.db.Query("SELECT status, COUNT(*) FROM domains GROUP BY status")
	if err != nil {
		return err
	}
	defer rows.Close()
	for rows.Next() {
		var status string
		var count int
		if err := rows.Scan(&status, &count); err != nil {
			continue // skip unreadable rows; other statuses are ignored below anyway
		}
		switch status {
		case "checked":
			stats.CheckedDomains = count
		case "unchecked":
			stats.UncheckedDomains = count
		}
	}
	// Fix: the original checked rows.Err() twice back-to-back; once suffices.
	return rows.Err()
}
// collectFeedStats fills in the feed-related fields of stats: the
// approximate total plus per-type ("rss"/"atom"/other) counts.
func (c *Crawler) collectFeedStats(stats *DashboardStats) error {
	// Use MAX(rowid) for a fast approximate total count.
	if err := c.db.QueryRow("SELECT COALESCE(MAX(rowid), 0) FROM feeds").Scan(&stats.TotalFeeds); err != nil {
		return err
	}
	// Single GROUP BY query gets all type counts in one index scan.
	rows, err := c.db.Query("SELECT type, COUNT(*) FROM feeds GROUP BY type")
	if err != nil {
		return err
	}
	defer rows.Close()
	for rows.Next() {
		var (
			feedType sql.NullString
			count    int
		)
		if err := rows.Scan(&feedType, &count); err != nil {
			continue
		}
		// NULL types yield feedType.String == "" and land in the default
		// bucket together with any unrecognized type values.
		switch feedType.String {
		case "rss":
			stats.RSSFeeds = count
		case "atom":
			stats.AtomFeeds = count
		default:
			stats.UnknownFeeds += count
		}
	}
	return rows.Err()
}
// StartDashboard starts the web dashboard server
func (c *Crawler) StartDashboard(addr string) error {
http.HandleFunc("/dashboard", func(w http.ResponseWriter, r *http.Request) {
c.handleDashboard(w, r)
})
http.HandleFunc("/api/stats", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIStats(w, r)
})
http.HandleFunc("/api/allDomains", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIAllDomains(w, r)
})
http.HandleFunc("/api/domainFeeds", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIDomainFeeds(w, r)
})
http.HandleFunc("/api/feedInfo", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIFeedInfo(w, r)
})
http.HandleFunc("/api/feedItems", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIFeedItems(w, r)
})
http.HandleFunc("/api/search", func(w http.ResponseWriter, r *http.Request) {
c.handleAPISearch(w, r)
})
http.HandleFunc("/static/", func(w http.ResponseWriter, r *http.Request) {
http.StripPrefix("/static/", http.FileServer(http.Dir("static"))).ServeHTTP(w, r)
})
fmt.Printf("Dashboard running at http://%s\n", addr)
return http.ListenAndServe(addr, nil)
}
// handleAPIAllDomains serves a page of the cached per-domain feed counts
// as JSON. Query params: offset (default 0) and limit (default 100,
// capped at 100). Pages past the end of the cache yield an empty array.
func (c *Crawler) handleAPIAllDomains(w http.ResponseWriter, r *http.Request) {
	offset := 0
	limit := 100
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 100 {
			limit = 100
		}
	}
	// Fix: a negative offset or limit from the query string previously
	// produced an out-of-range slice expression and panicked the handler.
	if offset < 0 {
		offset = 0
	}
	if limit < 0 {
		limit = 0
	}
	// Serve from the cache refreshed periodically by UpdateStats.
	c.statsMu.RLock()
	cached := c.cachedAllDomains
	c.statsMu.RUnlock()
	domains := []DomainStat{} // non-nil so JSON encodes [] rather than null
	if offset < len(cached) {
		end := offset + limit
		if end > len(cached) {
			end = len(cached)
		}
		domains = cached[offset:end]
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}
// handleAPIDomainFeeds lists up to 1000 feeds discovered on the host
// given by the "host" query param, as a JSON array of {url,title,type}.
func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	rows, err := c.db.Query(`
SELECT url, title, type FROM feeds
WHERE sourceHost = ?
ORDER BY url ASC
LIMIT 1000
`, host)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()
	type FeedInfo struct {
		URL   string `json:"url"`
		Title string `json:"title"`
		Type  string `json:"type"`
	}
	feeds := []FeedInfo{} // non-nil so JSON encodes [] rather than null
	for rows.Next() {
		var f FeedInfo
		var title, feedType sql.NullString
		// Fix: scan type as NullString. collectFeedStats shows the column
		// can be NULL; scanning it into a plain string failed and silently
		// dropped the row. NullString.String is "" when NULL.
		if err := rows.Scan(&f.URL, &title, &feedType); err != nil {
			continue
		}
		f.Title = title.String
		f.Type = feedType.String
		feeds = append(feeds, f)
	}
	// Fix: surface mid-iteration failures instead of silently truncating.
	if err := rows.Err(); err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(feeds)
}
// handleAPIFeedInfo returns the full metadata row for a single feed
// (query param "url") as JSON. Responds 404 if the URL is unknown,
// 400 if the parameter is missing, 500 on other database errors.
func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) {
feedURL := r.URL.Query().Get("url")
if feedURL == "" {
http.Error(w, "url parameter required", http.StatusBadRequest)
return
}
// Response shape; omitempty keeps the JSON compact when columns are NULL
// (zero values are omitted).
type FeedDetails struct {
URL string `json:"url"`
Type string `json:"type,omitempty"`
Title string `json:"title,omitempty"`
Description string `json:"description,omitempty"`
Language string `json:"language,omitempty"`
SiteURL string `json:"siteUrl,omitempty"`
DiscoveredAt string `json:"discoveredAt,omitempty"`
LastCrawledAt string `json:"lastCrawledAt,omitempty"`
LastBuildDate string `json:"lastBuildDate,omitempty"`
TTLMinutes int `json:"ttlMinutes,omitempty"`
UpdatePeriod string `json:"updatePeriod,omitempty"`
UpdateFreq int `json:"updateFreq,omitempty"`
Status string `json:"status,omitempty"`
ErrorCount int `json:"errorCount,omitempty"`
LastError string `json:"lastError,omitempty"`
ItemCount int `json:"itemCount,omitempty"`
AvgPostFreqHrs float64 `json:"avgPostFreqHrs,omitempty"`
OldestItemDate string `json:"oldestItemDate,omitempty"`
NewestItemDate string `json:"newestItemDate,omitempty"`
}
var f FeedDetails
// Nullable columns are scanned into sql.Null* holders and copied into f
// below. The Scan destinations must stay in the same order as the SELECT
// column list.
// NOTE(review): url, type and discoveredAt are scanned directly into
// string fields — a NULL in any of them fails the Scan and returns 500.
// Presumably those columns are NOT NULL; confirm against the schema.
var title, description, language, siteUrl, lastCrawledAt, lastBuildDate sql.NullString
var updatePeriod, status, lastError, oldestItemDate, newestItemDate sql.NullString
var ttlMinutes, updateFreq, errorCount, itemCount sql.NullInt64
var avgPostFreqHrs sql.NullFloat64
err := c.db.QueryRow(`
SELECT url, type, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, lastBuildDate,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate
FROM feeds WHERE url = ?
`, feedURL).Scan(
&f.URL, &f.Type, &title, &description, &language, &siteUrl,
&f.DiscoveredAt, &lastCrawledAt, &lastBuildDate,
&ttlMinutes, &updatePeriod, &updateFreq,
&status, &errorCount, &lastError,
&itemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
)
if err == sql.ErrNoRows {
http.Error(w, "feed not found", http.StatusNotFound)
return
}
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
// Copy each valid nullable column into the response struct; invalid
// (NULL) columns are left at their zero value and omitted from the JSON.
if title.Valid {
f.Title = title.String
}
if description.Valid {
f.Description = description.String
}
if language.Valid {
f.Language = language.String
}
if siteUrl.Valid {
f.SiteURL = siteUrl.String
}
if lastCrawledAt.Valid {
f.LastCrawledAt = lastCrawledAt.String
}
if lastBuildDate.Valid {
f.LastBuildDate = lastBuildDate.String
}
if ttlMinutes.Valid {
f.TTLMinutes = int(ttlMinutes.Int64)
}
if updatePeriod.Valid {
f.UpdatePeriod = updatePeriod.String
}
if updateFreq.Valid {
f.UpdateFreq = int(updateFreq.Int64)
}
if status.Valid {
f.Status = status.String
}
if errorCount.Valid {
f.ErrorCount = int(errorCount.Int64)
}
if lastError.Valid {
f.LastError = lastError.String
}
if itemCount.Valid {
f.ItemCount = int(itemCount.Int64)
}
if avgPostFreqHrs.Valid {
f.AvgPostFreqHrs = avgPostFreqHrs.Float64
}
if oldestItemDate.Valid {
f.OldestItemDate = oldestItemDate.String
}
if newestItemDate.Valid {
f.NewestItemDate = newestItemDate.String
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(f)
}
// handleAPIFeedItems returns recent items for the feed given by the "url"
// query param as a JSON array. "limit" defaults to 50 and is capped at 100.
func (c *Crawler) handleAPIFeedItems(w http.ResponseWriter, r *http.Request) {
	feedURL := r.URL.Query().Get("url")
	if feedURL == "" {
		http.Error(w, "url parameter required", http.StatusBadRequest)
		return
	}
	limit := 50
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 100 {
			limit = 100
		}
		// Fix: don't forward negative limits to GetItemsByFeed — in SQLite a
		// negative LIMIT means "unbounded", which would dump the whole table.
		if limit < 0 {
			limit = 50
		}
	}
	items, err := c.GetItemsByFeed(feedURL, limit)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	if items == nil {
		items = []*Item{} // encode as [] rather than null
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(items)
}
// SearchResult represents a search result with feed and matching items
type SearchResult struct {
Feed SearchFeed `json:"feed"`
Items []SearchItem `json:"items"`
}
type SearchFeed struct {
URL string `json:"url"`
Title string `json:"title"`
Description string `json:"description"`
Type string `json:"type"`
SourceHost string `json:"source_host"`
Status string `json:"status"`
}
type SearchItem struct {
ID int64 `json:"id"`
Title string `json:"title"`
Link string `json:"link"`
Description string `json:"description"`
Author string `json:"author"`
PubDate string `json:"pub_date"`
}
// handleAPISearch runs a full-text search (query param "q") against the
// feeds and items FTS tables and returns results grouped per feed.
// "limit" caps each of the two searches (default 100, max 500).
//
// Both searches are best-effort: a failed query simply contributes no
// results rather than returning an HTTP error (matching the original
// behavior, e.g. when an FTS table is absent).
// NOTE(review): q is passed straight to MATCH, so FTS query-syntax errors
// from the user surface as empty results — confirm that's intended.
func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
	query := r.URL.Query().Get("q")
	if query == "" {
		http.Error(w, "q parameter required", http.StatusBadRequest)
		return
	}
	limit := 100
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	// feed URL -> accumulated result
	results := make(map[string]*SearchResult)
	c.searchFeedsInto(results, query, limit)
	c.searchItemsInto(results, query, limit)
	// Fix: pre-allocate a non-nil slice so an empty result set encodes as
	// [] instead of null. Order remains unspecified (map iteration).
	resultList := make([]SearchResult, 0, len(results))
	for _, res := range results {
		resultList = append(resultList, *res)
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(resultList)
}

// searchFeedsInto adds feeds matching the FTS query to results, keyed by URL.
func (c *Crawler) searchFeedsInto(results map[string]*SearchResult, query string, limit int) {
	rows, err := c.db.Query(`
SELECT f.url, f.title, f.description, f.type, f.sourceHost, f.status
FROM feeds f
JOIN feeds_fts fts ON f.rowid = fts.rowid
WHERE feeds_fts MATCH ?
LIMIT ?
`, query, limit)
	if err != nil {
		return // best-effort: no feed matches on query failure
	}
	defer rows.Close()
	for rows.Next() {
		var url string
		var title, description, feedType, sourceHost, status sql.NullString
		if err := rows.Scan(&url, &title, &description, &feedType, &sourceHost, &status); err != nil {
			continue
		}
		results[url] = &SearchResult{
			Feed: SearchFeed{
				URL:         url,
				Title:       title.String,
				Description: description.String,
				Type:        feedType.String,
				SourceHost:  sourceHost.String,
				Status:      status.String,
			},
			Items: []SearchItem{},
		}
	}
}

// searchItemsInto adds items matching the FTS query to results, attaching
// each item to its feed's entry and creating the entry (with metadata
// looked up from the feeds table) when the feed itself didn't match.
func (c *Crawler) searchItemsInto(results map[string]*SearchResult, query string, limit int) {
	rows, err := c.db.Query(`
SELECT i.id, i.feedUrl, i.title, i.link, i.description, i.author, i.pubDate
FROM items i
JOIN items_fts fts ON i.id = fts.rowid
WHERE items_fts MATCH ?
ORDER BY i.pubDate DESC
LIMIT ?
`, query, limit)
	if err != nil {
		return // best-effort: no item matches on query failure
	}
	defer rows.Close()
	for rows.Next() {
		var id int64
		var feedUrl string
		var title, link, description, author, pubDate sql.NullString
		if err := rows.Scan(&id, &feedUrl, &title, &link, &description, &author, &pubDate); err != nil {
			continue
		}
		item := SearchItem{
			ID:          id,
			Title:       title.String,
			Link:        link.String,
			Description: description.String,
			Author:      author.String,
			PubDate:     pubDate.String,
		}
		if result, exists := results[feedUrl]; exists {
			result.Items = append(result.Items, item)
			continue
		}
		// Feed didn't match directly: fetch its metadata for the grouping
		// header. The Scan error is deliberately discarded — a missing feed
		// row still yields a result with zero-value feed fields.
		var fTitle, fDesc, fType, fHost, fStatus sql.NullString
		_ = c.db.QueryRow(`
SELECT title, description, type, sourceHost, status
FROM feeds WHERE url = ?
`, feedUrl).Scan(&fTitle, &fDesc, &fType, &fHost, &fStatus)
		results[feedUrl] = &SearchResult{
			Feed: SearchFeed{
				URL:         feedUrl,
				Title:       fTitle.String,
				Description: fDesc.String,
				Type:        fType.String,
				SourceHost:  fHost.String,
				Status:      fStatus.String,
			},
			Items: []SearchItem{item},
		}
	}
}
// handleDashboard renders the HTML dashboard page using the cached stats.
func (c *Crawler) handleDashboard(w http.ResponseWriter, r *http.Request) {
	stats, err := c.GetDashboardStats()
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	funcMap := template.FuncMap{
		// pct returns a*100/b as a float, guarding divide-by-zero
		// (used for the progress-bar width).
		"pct": func(a, b int) float64 {
			if b == 0 {
				return 0
			}
			return float64(a) * 100.0 / float64(b)
		},
		// comma renders int/int32/int64 with thousands separators;
		// any other type renders as "0".
		"comma": func(n interface{}) string {
			var val int
			switch v := n.(type) {
			case int:
				val = v
			case int32:
				val = int(v)
			case int64:
				val = int(v)
			default:
				return "0"
			}
			if val < 0 {
				return "-" + commaFormat(-val)
			}
			return commaFormat(val)
		},
	}
	// NOTE: the template is re-parsed on every request; cheap enough for a
	// dashboard and avoids package-level init ordering concerns.
	tmpl, err := template.New("dashboard").Funcs(funcMap).Parse(dashboardHTML)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "text/html")
	// Fix: the original dropped Execute's error. The response is likely
	// partially written by then, so log rather than http.Error.
	if err := tmpl.Execute(w, stats); err != nil {
		fmt.Printf("handleDashboard: template execute error: %v\n", err)
	}
}
// handleAPIStats serves the cached dashboard statistics as JSON.
func (c *Crawler) handleAPIStats(w http.ResponseWriter, r *http.Request) {
	s, err := c.GetDashboardStats()
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(s)
}
// dashboardHTML is the html/template source for the dashboard page,
// rendered by handleDashboard with a *DashboardStats as its data and the
// custom "comma" and "pct" template funcs. The element IDs (totalDomains,
// crawlRate, searchInput, ...) are targeted by /static/dashboard.js,
// which refreshes the values via the JSON API endpoints.
const dashboardHTML = `<!DOCTYPE html>
<html>
<head>
<title>1440.news Feed Crawler</title>
<meta charset="utf-8">
<link rel="stylesheet" href="/static/dashboard.css">
<script src="/static/dashboard.js"></script>
</head>
<body>
<h1>1440.news Feed Crawler</h1>
<h2>Crawl Progress</h2>
<div class="grid">
<div class="card">
<div class="stat-value" id="totalDomains">{{comma .TotalDomains}}</div>
<div class="stat-label">Domains</div>
</div>
<div class="card">
<div class="stat-value" id="checkedDomains">{{comma .CheckedDomains}}</div>
<div class="stat-label">Checked</div>
<div class="progress-bar">
<div class="progress-fill" id="crawlProgress" style="width: {{printf "%.1f" (pct .CheckedDomains .TotalDomains)}}%"></div>
</div>
</div>
<div class="card">
<div class="stat-value" id="uncheckedDomains">{{comma .UncheckedDomains}}</div>
<div class="stat-label">Unchecked</div>
</div>
<div class="card">
<div class="stat-value" id="crawlRate">{{comma .CrawlRate}}</div>
<div class="stat-label">crawls per min</div>
</div>
<div class="card">
<div class="stat-value" id="checkRate">{{comma .CheckRate}}</div>
<div class="stat-label">checks per min</div>
</div>
</div>
<h2>Feeds Discovered</h2>
<div class="grid">
<div class="card">
<div class="stat-value" id="totalFeeds">{{comma .TotalFeeds}}</div>
<div class="stat-label">Total Feeds</div>
</div>
<div class="card">
<div class="stat-value" style="color: #f90" id="rssFeeds">{{comma .RSSFeeds}}</div>
<div class="stat-label">RSS Feeds</div>
</div>
<div class="card">
<div class="stat-value" style="color: #09f" id="atomFeeds">{{comma .AtomFeeds}}</div>
<div class="stat-label">Atom Feeds</div>
</div>
<div class="card">
<div class="stat-value" style="color: #666" id="unknownFeeds">{{comma .UnknownFeeds}}</div>
<div class="stat-label">Unknown Type</div>
</div>
</div>
<div class="card">
<h2 style="margin-top: 0;">Feeds</h2>
<div style="margin-bottom: 15px;">
<input type="text" id="searchInput" placeholder="Search feeds and items..."
style="width: 100%; padding: 10px; background: #0a0a0a; border: 1px solid #333; border-radius: 4px; color: #fff; font-size: 14px;">
</div>
<div id="searchResults" style="display: none;"></div>
<div id="allDomainsContainer">
<div id="allDomains"></div>
<div id="allDomainsLoading" style="text-align: center; padding: 10px; color: #666;">Loading...</div>
</div>
</div>
<div class="updated" id="updatedAt">Last updated: {{.UpdatedAt.Format "2006-01-02 15:04:05"}}</div>
</body>
</html>`