Add Docker support and refactor data layer

This commit is contained in:
primal
2026-01-26 16:02:05 -05:00
parent 398e7b3969
commit 143807378f
12 changed files with 2642 additions and 518 deletions
+8
@@ -0,0 +1,8 @@
1440.news
1440.db
feeds/
*.gz
.git
.gitignore
.claude
CLAUDE.md
+1
@@ -3,3 +3,4 @@ go.*
*.gz
feeds/
feeds.db/
1440.db
+37
@@ -0,0 +1,37 @@
FROM golang:1.24-alpine AS builder
WORKDIR /app
# Install build dependencies
RUN apk add --no-cache gcc musl-dev
# Copy go mod files first for layer caching
COPY go.mod go.sum ./
RUN go mod download
# Copy source code
COPY *.go ./
COPY static/ ./static/
# Build the binary
RUN CGO_ENABLED=1 go build -o 1440.news .
# Runtime stage
FROM alpine:latest
WORKDIR /app
# Install runtime dependencies
RUN apk add --no-cache ca-certificates tzdata
# Copy binary from builder
COPY --from=builder /app/1440.news .
COPY --from=builder /app/static ./static
# Create feeds directory
RUN mkdir -p feeds
# Expose dashboard port
EXPOSE 4321
CMD ["./1440.news"]
+106 -65
@@ -1,9 +1,9 @@
package main package main
import ( import (
"database/sql"
"fmt" "fmt"
"io" "io"
"math/rand"
"net/http" "net/http"
"runtime" "runtime"
"strings" "strings"
@@ -11,26 +11,33 @@ import (
"sync/atomic" "sync/atomic"
"time" "time"
"github.com/cockroachdb/pebble"
"golang.org/x/net/html" "golang.org/x/net/html"
) )
type Crawler struct { type Crawler struct {
MaxDepth int MaxDepth int
MaxPagesPerHost int MaxPagesPerHost int
Timeout time.Duration Timeout time.Duration
UserAgent string UserAgent string
visited sync.Map visited sync.Map
feedsMu sync.Mutex feedsMu sync.Mutex
client *http.Client client *http.Client
hostsProcessed int32 hostsProcessed int32
db *pebble.DB feedsChecked int32
startTime time.Time
db *sql.DB
displayedCrawlRate int
displayedCheckRate int
domainsImported int32
cachedStats *DashboardStats
cachedAllDomains []DomainStat
statsMu sync.RWMutex
} }
func NewCrawler(dbPath string) (*Crawler, error) { func NewCrawler(dbPath string) (*Crawler, error) {
db, err := pebble.Open(dbPath, &pebble.Options{}) db, err := OpenDatabase(dbPath)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to open pebble db: %v", err) return nil, fmt.Errorf("failed to open database: %v", err)
} }
return &Crawler{ return &Crawler{
@@ -38,6 +45,7 @@ func NewCrawler(dbPath string) (*Crawler, error) {
MaxPagesPerHost: 10, MaxPagesPerHost: 10,
Timeout: 10 * time.Second, Timeout: 10 * time.Second,
UserAgent: "FeedCrawler/1.0", UserAgent: "FeedCrawler/1.0",
startTime: time.Now(),
db: db, db: db,
client: &http.Client{ client: &http.Client{
Timeout: 10 * time.Second, Timeout: 10 * time.Second,
@@ -58,87 +66,121 @@ func (c *Crawler) Close() error {
return nil return nil
} }
// CrawlUncrawledDomains fetches uncrawled domains and crawls them // StartStatsLoop updates cached stats once per minute
func (c *Crawler) CrawlUncrawledDomains() error { func (c *Crawler) StartStatsLoop() {
domains, err := c.GetUncrawledDomains() for {
if err != nil { c.UpdateStats()
return fmt.Errorf("failed to get uncrawled domains: %v", err) time.Sleep(1 * time.Minute)
} }
}
if len(domains) == 0 { // StartCleanupLoop runs item cleanup once per week
return nil func (c *Crawler) StartCleanupLoop() {
for {
deleted, err := c.CleanupOldItems()
if err != nil {
fmt.Printf("Cleanup error: %v\n", err)
} else if deleted > 0 {
fmt.Printf("Cleanup: removed %d old items\n", deleted)
}
time.Sleep(7 * 24 * time.Hour)
} }
}
// Shuffle for randomized crawling // StartCrawlLoop runs the domain crawling loop independently
rand.Shuffle(len(domains), func(i, j int) { func (c *Crawler) StartCrawlLoop() {
domains[i], domains[j] = domains[j], domains[i] numWorkers := runtime.NumCPU()
})
numWorkers := runtime.NumCPU() - 1
if numWorkers < 1 { if numWorkers < 1 {
numWorkers = 1 numWorkers = 1
} }
type crawlResult struct { // Buffered channel for domain work
host string workChan := make(chan *Domain, 256)
feedsFound int
lastError string
}
domainChan := make(chan *Domain, numWorkers*2)
resultChan := make(chan crawlResult, numWorkers*2)
var wg sync.WaitGroup
// Start workers // Start workers
for i := 0; i < numWorkers; i++ { for i := 0; i < numWorkers; i++ {
wg.Add(1)
go func() { go func() {
defer wg.Done() for domain := range workChan {
for domain := range domainChan {
feedsFound, crawlErr := c.crawlHost(domain.Host) feedsFound, crawlErr := c.crawlHost(domain.Host)
errStr := "" errStr := ""
if crawlErr != nil { if crawlErr != nil {
errStr = crawlErr.Error() errStr = crawlErr.Error()
} }
resultChan <- crawlResult{ if err := c.markDomainCrawled(domain.Host, feedsFound, errStr); err != nil {
host: domain.Host, fmt.Printf("Error marking domain %s as crawled: %v\n", domain.Host, err)
feedsFound: feedsFound,
lastError: errStr,
} }
} }
}() }()
} }
// Start result processor const fetchSize = 100
done := make(chan bool) for {
go func() { domains, err := c.GetUncheckedDomainsRandom(fetchSize)
for result := range resultChan { if err != nil {
if err := c.markDomainCrawled(result.host, result.feedsFound, result.lastError); err != nil { fmt.Printf("Error fetching domains: %v\n", err)
fmt.Printf("Error marking domain %s as crawled: %v\n", result.host, err)
}
} }
done <- true
}()
// Send domains to workers if len(domains) == 0 {
for _, domain := range domains { c.displayedCrawlRate = 0
domainChan <- domain time.Sleep(1 * time.Second)
continue
}
fmt.Printf("%s crawl: %d domains to check\n", time.Now().Format("15:04:05"), len(domains))
for _, domain := range domains {
workChan <- domain
}
time.Sleep(1 * time.Second)
}
}
// StartCheckLoop runs the feed checking loop independently
func (c *Crawler) StartCheckLoop() {
numWorkers := runtime.NumCPU()
if numWorkers < 1 {
numWorkers = 1
} }
close(domainChan) // Buffered channel for feed work
wg.Wait() workChan := make(chan *Feed, 256)
close(resultChan)
<-done
return nil // Start workers
for i := 0; i < numWorkers; i++ {
go func() {
for feed := range workChan {
c.CheckFeed(feed)
}
}()
}
const fetchSize = 100
for {
feeds, err := c.GetFeedsDueForCheck(fetchSize)
if err != nil {
fmt.Printf("Error fetching feeds: %v\n", err)
}
if len(feeds) == 0 {
c.displayedCheckRate = 0
time.Sleep(1 * time.Second)
continue
}
fmt.Printf("%s check: %d feeds to check\n", time.Now().Format("15:04:05"), len(feeds))
for _, feed := range feeds {
workChan <- feed
}
time.Sleep(1 * time.Second)
}
} }
func (c *Crawler) crawlHost(host string) (feedsFound int, err error) { func (c *Crawler) crawlHost(host string) (feedsFound int, err error) {
atomic.AddInt32(&c.hostsProcessed, 1) atomic.AddInt32(&c.hostsProcessed, 1)
// Count feeds before crawling
initialCount, _ := c.GetFeedCount()
localVisited := make(map[string]bool) localVisited := make(map[string]bool)
pagesVisited := 0 pagesVisited := 0
@@ -148,9 +190,8 @@ func (c *Crawler) crawlHost(host string) (feedsFound int, err error) {
c.crawlPage("http://"+host, host, 0, localVisited, &pagesVisited) c.crawlPage("http://"+host, host, 0, localVisited, &pagesVisited)
} }
// Count feeds after crawling // Count feeds found for this specific host
finalCount, _ := c.GetFeedCount() feedsFound, _ = c.GetFeedCountByHost(host)
feedsFound = finalCount - initialCount
if pagesVisited == 0 { if pagesVisited == 0 {
return feedsFound, fmt.Errorf("could not connect") return feedsFound, fmt.Errorf("could not connect")
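The refactor replaces the old one-shot CrawlUncrawledDomains pass with independent, long-running loops. A minimal sketch of how they might be wired together at startup; main.go is not part of this diff, so the entry point, database path, and listen address below are assumptions rather than the project's actual code:

package main

import "log"

func main() {
	c, err := NewCrawler("1440.db") // db path is an assumption
	if err != nil {
		log.Fatal(err)
	}
	defer c.Close()

	go c.StartStatsLoop()   // refresh cached dashboard stats once per minute
	go c.StartCleanupLoop() // purge old items once per week
	go c.StartCrawlLoop()   // discover feeds on unchecked domains
	go c.StartCheckLoop()   // re-fetch feeds that are due for a check

	// Blocks; the Dockerfile exposes 4321, but the real address lives in main.go.
	log.Fatal(c.StartDashboard(":4321"))
}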
+559 -239
@@ -1,21 +1,20 @@
package main package main
import ( import (
"database/sql"
"encoding/json" "encoding/json"
"fmt" "fmt"
"html/template" "html/template"
"net/http" "net/http"
"sort"
"time" "time"
) )
// DashboardStats holds all statistics for the dashboard // DashboardStats holds all statistics for the dashboard
type DashboardStats struct { type DashboardStats struct {
// Domain stats // Domain stats
TotalDomains int `json:"total_domains"` TotalDomains int `json:"total_domains"`
CrawledDomains int `json:"crawled_domains"` CheckedDomains int `json:"checked_domains"`
UncrawledDomains int `json:"uncrawled_domains"` UncheckedDomains int `json:"unchecked_domains"`
ErrorDomains int `json:"error_domains"`
// Feed stats // Feed stats
TotalFeeds int `json:"total_feeds"` TotalFeeds int `json:"total_feeds"`
@@ -25,16 +24,8 @@ type DashboardStats struct {
// Crawl progress // Crawl progress
HostsProcessed int32 `json:"hosts_processed"` HostsProcessed int32 `json:"hosts_processed"`
CrawlRate float64 `json:"crawl_rate"` // domains per minute CrawlRate int `json:"crawl_rate"` // crawls per minute
CheckRate int `json:"check_rate"` // feed checks per minute
// Top TLDs by feed count
TopTLDs []TLDStat `json:"top_tlds"`
// Recent feeds
RecentFeeds []RecentFeed `json:"recent_feeds"`
// Top domains by feed count
TopDomains []DomainStat `json:"top_domains"`
// Timing // Timing
UpdatedAt time.Time `json:"updated_at"` UpdatedAt time.Time `json:"updated_at"`
@@ -57,13 +48,107 @@ type DomainStat struct {
FeedsFound int `json:"feeds_found"` FeedsFound int `json:"feeds_found"`
} }
// GetDashboardStats collects all statistics for the dashboard // commaFormat formats an integer with comma separators
func commaFormat(n int) string {
s := fmt.Sprintf("%d", n)
if len(s) <= 3 {
return s
}
var result []byte
for i, c := range s {
if i > 0 && (len(s)-i)%3 == 0 {
result = append(result, ',')
}
result = append(result, byte(c))
}
return string(result)
}
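For illustration only, the formatting produced by commaFormat (not part of the diff):

func exampleCommaFormat() {
	fmt.Println(commaFormat(7))       // 7
	fmt.Println(commaFormat(1440))    // 1,440
	fmt.Println(commaFormat(2642518)) // 2,642,518
}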
// UpdateStats recalculates and caches dashboard statistics
func (c *Crawler) UpdateStats() {
fmt.Println("UpdateStats: calculating stats...")
stats, err := c.calculateStats()
if err != nil {
fmt.Printf("UpdateStats: error calculating stats: %v\n", err)
return
}
// Cache all domains with feeds (runs in background, so slow query is OK)
fmt.Println("UpdateStats: fetching all domains...")
allDomains := c.fetchAllDomainsFromDB()
fmt.Printf("UpdateStats: got %d domains\n", len(allDomains))
c.statsMu.Lock()
c.cachedStats = stats
c.cachedAllDomains = allDomains
c.statsMu.Unlock()
fmt.Println("UpdateStats: complete")
}
func (c *Crawler) fetchAllDomainsFromDB() []DomainStat {
rows, err := c.db.Query(`
SELECT tld, sourceHost, COUNT(*) as cnt FROM feeds
GROUP BY tld, sourceHost
ORDER BY tld, sourceHost
`)
if err != nil {
fmt.Printf("fetchAllDomainsFromDB error: %v\n", err)
return nil
}
defer rows.Close()
var domains []DomainStat
for rows.Next() {
var ds DomainStat
var tld string
if err := rows.Scan(&tld, &ds.Host, &ds.FeedsFound); err != nil {
continue
}
domains = append(domains, ds)
}
return domains
}
// GetDashboardStats returns cached statistics (returns empty stats if not yet cached)
func (c *Crawler) GetDashboardStats() (*DashboardStats, error) { func (c *Crawler) GetDashboardStats() (*DashboardStats, error) {
c.statsMu.RLock()
stats := c.cachedStats
c.statsMu.RUnlock()
if stats != nil {
return stats, nil
}
// Return empty stats while background calculation runs (don't block HTTP requests)
return &DashboardStats{UpdatedAt: time.Now()}, nil
}
// calculateStats collects all statistics for the dashboard
func (c *Crawler) calculateStats() (*DashboardStats, error) {
stats := &DashboardStats{ stats := &DashboardStats{
UpdatedAt: time.Now(), UpdatedAt: time.Now(),
HostsProcessed: c.hostsProcessed, HostsProcessed: c.hostsProcessed,
} }
// Calculate crawl rate (crawls per minute), smoothed by +/-1 per update
elapsed := time.Since(c.startTime).Minutes()
if elapsed > 0 {
actualRate := int(float64(c.hostsProcessed) / elapsed)
if actualRate > c.displayedCrawlRate {
c.displayedCrawlRate++
} else if actualRate < c.displayedCrawlRate {
c.displayedCrawlRate--
}
stats.CrawlRate = c.displayedCrawlRate
// Calculate check rate (feed checks per minute), smoothed by +/-1 per update
actualCheckRate := int(float64(c.feedsChecked) / elapsed)
if actualCheckRate > c.displayedCheckRate {
c.displayedCheckRate++
} else if actualCheckRate < c.displayedCheckRate {
c.displayedCheckRate--
}
stats.CheckRate = c.displayedCheckRate
}
// Get domain stats // Get domain stats
if err := c.collectDomainStats(stats); err != nil { if err := c.collectDomainStats(stats); err != nil {
return nil, err return nil, err
@@ -78,148 +163,455 @@ func (c *Crawler) GetDashboardStats() (*DashboardStats, error) {
} }
func (c *Crawler) collectDomainStats(stats *DashboardStats) error { func (c *Crawler) collectDomainStats(stats *DashboardStats) error {
iter, err := c.db.NewIter(nil) // Use MAX(rowid) for fast approximate total count
err := c.db.QueryRow("SELECT COALESCE(MAX(rowid), 0) FROM domains").Scan(&stats.TotalDomains)
if err != nil { if err != nil {
return err return err
} }
defer iter.Close()
domainFeeds := make(map[string]int) // Single query to get all status counts (one index scan instead of three)
rows, err := c.db.Query("SELECT status, COUNT(*) FROM domains GROUP BY status")
if err != nil {
return err
}
defer rows.Close()
for iter.SeekGE([]byte("domain:")); iter.Valid(); iter.Next() { for rows.Next() {
key := string(iter.Key()) var status string
if len(key) < 7 || key[:7] != "domain:" { var count int
break if err := rows.Scan(&status, &count); err != nil {
}
var domain Domain
if err := json.Unmarshal(iter.Value(), &domain); err != nil {
continue continue
} }
switch status {
stats.TotalDomains++ case "checked":
switch domain.Status { stats.CheckedDomains = count
case "crawled": case "unchecked":
stats.CrawledDomains++ stats.UncheckedDomains = count
if domain.FeedsFound > 0 {
domainFeeds[domain.Host] = domain.FeedsFound
}
case "uncrawled":
stats.UncrawledDomains++
case "error":
stats.ErrorDomains++
} }
} }
if err := rows.Err(); err != nil {
// Top domains by feed count return err
type kv struct {
Host string
Count int
}
var sorted []kv
for h, c := range domainFeeds {
sorted = append(sorted, kv{h, c})
}
sort.Slice(sorted, func(i, j int) bool {
return sorted[i].Count > sorted[j].Count
})
for i := 0; i < len(sorted) && i < 10; i++ {
stats.TopDomains = append(stats.TopDomains, DomainStat{
Host: sorted[i].Host,
FeedsFound: sorted[i].Count,
})
} }
return iter.Error() return rows.Err()
} }
func (c *Crawler) collectFeedStats(stats *DashboardStats) error { func (c *Crawler) collectFeedStats(stats *DashboardStats) error {
iter, err := c.db.NewIter(nil) // Use MAX(rowid) for fast approximate total count
err := c.db.QueryRow("SELECT COALESCE(MAX(rowid), 0) FROM feeds").Scan(&stats.TotalFeeds)
if err != nil { if err != nil {
return err return err
} }
defer iter.Close()
tldCounts := make(map[string]int) // Single query to get all type counts (one index scan instead of three)
var recentFeeds []RecentFeed rows, err := c.db.Query("SELECT type, COUNT(*) FROM feeds GROUP BY type")
if err != nil {
return err
}
defer rows.Close()
for iter.SeekGE([]byte("feed:")); iter.Valid(); iter.Next() { for rows.Next() {
key := string(iter.Key()) var feedType sql.NullString
if len(key) < 5 || key[:5] != "feed:" { var count int
break if err := rows.Scan(&feedType, &count); err != nil {
}
var feed Feed
if err := json.Unmarshal(iter.Value(), &feed); err != nil {
continue continue
} }
switch feedType.String {
stats.TotalFeeds++
switch feed.Type {
case "rss": case "rss":
stats.RSSFeeds++ stats.RSSFeeds = count
case "atom": case "atom":
stats.AtomFeeds++ stats.AtomFeeds = count
default: default:
stats.UnknownFeeds++ stats.UnknownFeeds += count
} }
if feed.TLD != "" {
tldCounts[feed.TLD]++
}
recentFeeds = append(recentFeeds, RecentFeed{
URL: feed.URL,
Title: feed.Title,
Type: feed.Type,
DiscoveredAt: feed.DiscoveredAt,
})
} }
return rows.Err()
// Top TLDs
type kv struct {
TLD string
Count int
}
var sortedTLDs []kv
for t, c := range tldCounts {
sortedTLDs = append(sortedTLDs, kv{t, c})
}
sort.Slice(sortedTLDs, func(i, j int) bool {
return sortedTLDs[i].Count > sortedTLDs[j].Count
})
for i := 0; i < len(sortedTLDs) && i < 10; i++ {
stats.TopTLDs = append(stats.TopTLDs, TLDStat{
TLD: sortedTLDs[i].TLD,
Count: sortedTLDs[i].Count,
})
}
// Recent feeds (last 20, sorted by discovery time)
sort.Slice(recentFeeds, func(i, j int) bool {
return recentFeeds[i].DiscoveredAt.After(recentFeeds[j].DiscoveredAt)
})
if len(recentFeeds) > 20 {
recentFeeds = recentFeeds[:20]
}
stats.RecentFeeds = recentFeeds
return iter.Error()
} }
// StartDashboard starts the web dashboard server // StartDashboard starts the web dashboard server
func (c *Crawler) StartDashboard(addr string) error { func (c *Crawler) StartDashboard(addr string) error {
http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { http.HandleFunc("/dashboard", func(w http.ResponseWriter, r *http.Request) {
c.handleDashboard(w, r) c.handleDashboard(w, r)
}) })
http.HandleFunc("/api/stats", func(w http.ResponseWriter, r *http.Request) { http.HandleFunc("/api/stats", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIStats(w, r) c.handleAPIStats(w, r)
}) })
http.HandleFunc("/api/allDomains", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIAllDomains(w, r)
})
http.HandleFunc("/api/domainFeeds", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIDomainFeeds(w, r)
})
http.HandleFunc("/api/feedInfo", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIFeedInfo(w, r)
})
http.HandleFunc("/api/feedItems", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIFeedItems(w, r)
})
http.HandleFunc("/api/search", func(w http.ResponseWriter, r *http.Request) {
c.handleAPISearch(w, r)
})
http.HandleFunc("/static/", func(w http.ResponseWriter, r *http.Request) {
http.StripPrefix("/static/", http.FileServer(http.Dir("static"))).ServeHTTP(w, r)
})
fmt.Printf("Dashboard running at http://%s\n", addr) fmt.Printf("Dashboard running at http://%s\n", addr)
return http.ListenAndServe(addr, nil) return http.ListenAndServe(addr, nil)
} }
func (c *Crawler) handleAPIAllDomains(w http.ResponseWriter, r *http.Request) {
offset := 0
limit := 100
if o := r.URL.Query().Get("offset"); o != "" {
fmt.Sscanf(o, "%d", &offset)
}
if l := r.URL.Query().Get("limit"); l != "" {
fmt.Sscanf(l, "%d", &limit)
if limit > 100 {
limit = 100
}
}
// Serve from cache (updated once per minute in background)
c.statsMu.RLock()
cached := c.cachedAllDomains
c.statsMu.RUnlock()
var domains []DomainStat
if cached != nil && offset < len(cached) {
end := offset + limit
if end > len(cached) {
end = len(cached)
}
domains = cached[offset:end]
}
if domains == nil {
domains = []DomainStat{}
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(domains)
}
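A hedged example of paging through this endpoint from a Go client; the host and port are assumptions (the Dockerfile exposes 4321), and the server caps limit at 100:

func fetchDomainsPage(offset, limit int) ([]DomainStat, error) {
	u := fmt.Sprintf("http://localhost:4321/api/allDomains?offset=%d&limit=%d", offset, limit)
	resp, err := http.Get(u)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	var page []DomainStat
	if err := json.NewDecoder(resp.Body).Decode(&page); err != nil {
		return nil, err
	}
	return page, nil // empty once offset runs past the cached list
}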
func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) {
host := r.URL.Query().Get("host")
if host == "" {
http.Error(w, "host parameter required", http.StatusBadRequest)
return
}
rows, err := c.db.Query(`
SELECT url, title, type FROM feeds
WHERE sourceHost = ?
ORDER BY url ASC
LIMIT 1000
`, host)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
defer rows.Close()
type FeedInfo struct {
URL string `json:"url"`
Title string `json:"title"`
Type string `json:"type"`
}
var feeds []FeedInfo
for rows.Next() {
var f FeedInfo
var title sql.NullString
if err := rows.Scan(&f.URL, &title, &f.Type); err != nil {
continue
}
if title.Valid {
f.Title = title.String
}
feeds = append(feeds, f)
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(feeds)
}
func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) {
feedURL := r.URL.Query().Get("url")
if feedURL == "" {
http.Error(w, "url parameter required", http.StatusBadRequest)
return
}
type FeedDetails struct {
URL string `json:"url"`
Type string `json:"type,omitempty"`
Title string `json:"title,omitempty"`
Description string `json:"description,omitempty"`
Language string `json:"language,omitempty"`
SiteURL string `json:"siteUrl,omitempty"`
DiscoveredAt string `json:"discoveredAt,omitempty"`
LastCrawledAt string `json:"lastCrawledAt,omitempty"`
LastBuildDate string `json:"lastBuildDate,omitempty"`
TTLMinutes int `json:"ttlMinutes,omitempty"`
UpdatePeriod string `json:"updatePeriod,omitempty"`
UpdateFreq int `json:"updateFreq,omitempty"`
Status string `json:"status,omitempty"`
ErrorCount int `json:"errorCount,omitempty"`
LastError string `json:"lastError,omitempty"`
ItemCount int `json:"itemCount,omitempty"`
AvgPostFreqHrs float64 `json:"avgPostFreqHrs,omitempty"`
OldestItemDate string `json:"oldestItemDate,omitempty"`
NewestItemDate string `json:"newestItemDate,omitempty"`
}
var f FeedDetails
var title, description, language, siteUrl, lastCrawledAt, lastBuildDate sql.NullString
var updatePeriod, status, lastError, oldestItemDate, newestItemDate sql.NullString
var ttlMinutes, updateFreq, errorCount, itemCount sql.NullInt64
var avgPostFreqHrs sql.NullFloat64
err := c.db.QueryRow(`
SELECT url, type, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, lastBuildDate,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate
FROM feeds WHERE url = ?
`, feedURL).Scan(
&f.URL, &f.Type, &title, &description, &language, &siteUrl,
&f.DiscoveredAt, &lastCrawledAt, &lastBuildDate,
&ttlMinutes, &updatePeriod, &updateFreq,
&status, &errorCount, &lastError,
&itemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
)
if err == sql.ErrNoRows {
http.Error(w, "feed not found", http.StatusNotFound)
return
}
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
if title.Valid {
f.Title = title.String
}
if description.Valid {
f.Description = description.String
}
if language.Valid {
f.Language = language.String
}
if siteUrl.Valid {
f.SiteURL = siteUrl.String
}
if lastCrawledAt.Valid {
f.LastCrawledAt = lastCrawledAt.String
}
if lastBuildDate.Valid {
f.LastBuildDate = lastBuildDate.String
}
if ttlMinutes.Valid {
f.TTLMinutes = int(ttlMinutes.Int64)
}
if updatePeriod.Valid {
f.UpdatePeriod = updatePeriod.String
}
if updateFreq.Valid {
f.UpdateFreq = int(updateFreq.Int64)
}
if status.Valid {
f.Status = status.String
}
if errorCount.Valid {
f.ErrorCount = int(errorCount.Int64)
}
if lastError.Valid {
f.LastError = lastError.String
}
if itemCount.Valid {
f.ItemCount = int(itemCount.Int64)
}
if avgPostFreqHrs.Valid {
f.AvgPostFreqHrs = avgPostFreqHrs.Float64
}
if oldestItemDate.Valid {
f.OldestItemDate = oldestItemDate.String
}
if newestItemDate.Valid {
f.NewestItemDate = newestItemDate.String
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(f)
}
func (c *Crawler) handleAPIFeedItems(w http.ResponseWriter, r *http.Request) {
feedURL := r.URL.Query().Get("url")
if feedURL == "" {
http.Error(w, "url parameter required", http.StatusBadRequest)
return
}
limit := 50
if l := r.URL.Query().Get("limit"); l != "" {
fmt.Sscanf(l, "%d", &limit)
if limit > 100 {
limit = 100
}
}
items, err := c.GetItemsByFeed(feedURL, limit)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
if items == nil {
items = []*Item{}
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(items)
}
// SearchResult represents a search result with feed and matching items
type SearchResult struct {
Feed SearchFeed `json:"feed"`
Items []SearchItem `json:"items"`
}
type SearchFeed struct {
URL string `json:"url"`
Title string `json:"title"`
Description string `json:"description"`
Type string `json:"type"`
SourceHost string `json:"source_host"`
Status string `json:"status"`
}
type SearchItem struct {
ID int64 `json:"id"`
Title string `json:"title"`
Link string `json:"link"`
Description string `json:"description"`
Author string `json:"author"`
PubDate string `json:"pub_date"`
}
func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
query := r.URL.Query().Get("q")
if query == "" {
http.Error(w, "q parameter required", http.StatusBadRequest)
return
}
limit := 100
if l := r.URL.Query().Get("limit"); l != "" {
fmt.Sscanf(l, "%d", &limit)
if limit > 500 {
limit = 500
}
}
// Results map: feedURL -> SearchResult
results := make(map[string]*SearchResult)
// Search feeds
feedRows, err := c.db.Query(`
SELECT f.url, f.title, f.description, f.type, f.sourceHost, f.status
FROM feeds f
JOIN feeds_fts fts ON f.rowid = fts.rowid
WHERE feeds_fts MATCH ?
LIMIT ?
`, query, limit)
if err == nil {
defer feedRows.Close()
for feedRows.Next() {
var url string
var title, description, feedType, sourceHost, status sql.NullString
if err := feedRows.Scan(&url, &title, &description, &feedType, &sourceHost, &status); err != nil {
continue
}
results[url] = &SearchResult{
Feed: SearchFeed{
URL: url,
Title: title.String,
Description: description.String,
Type: feedType.String,
SourceHost: sourceHost.String,
Status: status.String,
},
Items: []SearchItem{},
}
}
}
// Search items
itemRows, err := c.db.Query(`
SELECT i.id, i.feedUrl, i.title, i.link, i.description, i.author, i.pubDate
FROM items i
JOIN items_fts fts ON i.id = fts.rowid
WHERE items_fts MATCH ?
ORDER BY i.pubDate DESC
LIMIT ?
`, query, limit)
if err == nil {
defer itemRows.Close()
for itemRows.Next() {
var id int64
var feedUrl string
var title, link, description, author, pubDate sql.NullString
if err := itemRows.Scan(&id, &feedUrl, &title, &link, &description, &author, &pubDate); err != nil {
continue
}
item := SearchItem{
ID: id,
Title: title.String,
Link: link.String,
Description: description.String,
Author: author.String,
PubDate: pubDate.String,
}
// Add to existing result or create new one
if result, exists := results[feedUrl]; exists {
result.Items = append(result.Items, item)
} else {
// Fetch feed info for this item's feed
var fTitle, fDesc, fType, fHost, fStatus sql.NullString
c.db.QueryRow(`
SELECT title, description, type, sourceHost, status
FROM feeds WHERE url = ?
`, feedUrl).Scan(&fTitle, &fDesc, &fType, &fHost, &fStatus)
results[feedUrl] = &SearchResult{
Feed: SearchFeed{
URL: feedUrl,
Title: fTitle.String,
Description: fDesc.String,
Type: fType.String,
SourceHost: fHost.String,
Status: fStatus.String,
},
Items: []SearchItem{item},
}
}
}
}
// Convert map to slice
var resultList []SearchResult
for _, r := range results {
resultList = append(resultList, *r)
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(resultList)
}
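The q parameter is passed straight to FTS5 MATCH, so standard FTS5 query syntax applies (quoted phrases, OR, trailing * for prefix matches). A hedged client-side sketch; the host and port are assumptions:

func searchFeeds(query string, limit int) ([]SearchResult, error) {
	params := url.Values{} // needs "net/url" in the import list
	params.Set("q", query) // e.g. `"local news" OR weather*`
	params.Set("limit", fmt.Sprintf("%d", limit))

	resp, err := http.Get("http://localhost:4321/api/search?" + params.Encode())
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	var results []SearchResult
	if err := json.NewDecoder(resp.Body).Decode(&results); err != nil {
		return nil, err
	}
	return results, nil
}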
func (c *Crawler) handleDashboard(w http.ResponseWriter, r *http.Request) { func (c *Crawler) handleDashboard(w http.ResponseWriter, r *http.Request) {
stats, err := c.GetDashboardStats() stats, err := c.GetDashboardStats()
if err != nil { if err != nil {
@@ -228,14 +620,28 @@ func (c *Crawler) handleDashboard(w http.ResponseWriter, r *http.Request) {
} }
funcMap := template.FuncMap{ funcMap := template.FuncMap{
"divf": func(a, b int) float64 { "pct": func(a, b int) float64 {
if b == 0 { if b == 0 {
return 0 return 0
} }
return float64(a) / float64(b) return float64(a) * 100.0 / float64(b)
}, },
"mulf": func(a int, b float64) float64 { "comma": func(n interface{}) string {
return float64(a) * b var val int
switch v := n.(type) {
case int:
val = v
case int32:
val = int(v)
case int64:
val = int(v)
default:
return "0"
}
if val < 0 {
return "-" + commaFormat(-val)
}
return commaFormat(val)
}, },
} }
@@ -265,58 +671,8 @@ const dashboardHTML = `<!DOCTYPE html>
<head> <head>
<title>1440.news Feed Crawler</title> <title>1440.news Feed Crawler</title>
<meta charset="utf-8"> <meta charset="utf-8">
<meta http-equiv="refresh" content="5"> <link rel="stylesheet" href="/static/dashboard.css">
<style> <script src="/static/dashboard.js"></script>
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, monospace;
background: #0a0a0a;
color: #e0e0e0;
padding: 20px;
line-height: 1.6;
}
h1 { color: #fff; margin-bottom: 20px; font-size: 24px; }
h2 { color: #888; margin: 20px 0 10px; font-size: 14px; text-transform: uppercase; letter-spacing: 1px; }
.grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin-bottom: 20px; }
.card {
background: #151515;
border: 1px solid #252525;
border-radius: 8px;
padding: 15px;
}
.stat-value { font-size: 32px; font-weight: bold; color: #fff; }
.stat-label { font-size: 12px; color: #666; text-transform: uppercase; }
.stat-row { display: flex; justify-content: space-between; padding: 5px 0; border-bottom: 1px solid #202020; }
.stat-row:last-child { border-bottom: none; }
.progress-bar {
background: #202020;
border-radius: 4px;
height: 8px;
margin-top: 10px;
overflow: hidden;
}
.progress-fill {
background: linear-gradient(90deg, #00aa55, #00cc66);
height: 100%;
transition: width 0.3s;
}
table { width: 100%; border-collapse: collapse; }
th, td { text-align: left; padding: 8px; border-bottom: 1px solid #202020; }
th { color: #666; font-size: 11px; text-transform: uppercase; }
td { font-size: 13px; }
.type-rss { color: #f90; }
.type-atom { color: #09f; }
.type-unknown { color: #666; }
.url {
max-width: 400px;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
color: #4a9eff;
}
.time { color: #666; font-size: 12px; }
.updated { color: #444; font-size: 11px; text-align: right; margin-top: 20px; }
</style>
</head> </head>
<body> <body>
<h1>1440.news Feed Crawler</h1> <h1>1440.news Feed Crawler</h1>
@@ -324,99 +680,63 @@ const dashboardHTML = `<!DOCTYPE html>
<h2>Crawl Progress</h2> <h2>Crawl Progress</h2>
<div class="grid"> <div class="grid">
<div class="card"> <div class="card">
<div class="stat-value">{{.TotalDomains}}</div> <div class="stat-value" id="totalDomains">{{comma .TotalDomains}}</div>
<div class="stat-label">Total Domains</div> <div class="stat-label">Domains</div>
</div> </div>
<div class="card"> <div class="card">
<div class="stat-value">{{.CrawledDomains}}</div> <div class="stat-value" id="checkedDomains">{{comma .CheckedDomains}}</div>
<div class="stat-label">Crawled</div> <div class="stat-label">Checked</div>
{{if .TotalDomains}}
<div class="progress-bar"> <div class="progress-bar">
<div class="progress-fill" style="width: {{printf "%.1f" (divf (mulf .CrawledDomains 100.0) .TotalDomains)}}%"></div> <div class="progress-fill" id="crawlProgress" style="width: {{printf "%.1f" (pct .CheckedDomains .TotalDomains)}}%"></div>
</div> </div>
{{end}}
</div> </div>
<div class="card"> <div class="card">
<div class="stat-value">{{.UncrawledDomains}}</div> <div class="stat-value" id="uncheckedDomains">{{comma .UncheckedDomains}}</div>
<div class="stat-label">Uncrawled</div> <div class="stat-label">Unchecked</div>
</div> </div>
<div class="card"> <div class="card">
<div class="stat-value">{{.ErrorDomains}}</div> <div class="stat-value" id="crawlRate">{{comma .CrawlRate}}</div>
<div class="stat-label">Errors</div> <div class="stat-label">crawls per min</div>
</div>
<div class="card">
<div class="stat-value" id="checkRate">{{comma .CheckRate}}</div>
<div class="stat-label">checks per min</div>
</div> </div>
</div> </div>
<h2>Feeds Discovered</h2> <h2>Feeds Discovered</h2>
<div class="grid"> <div class="grid">
<div class="card"> <div class="card">
<div class="stat-value">{{.TotalFeeds}}</div> <div class="stat-value" id="totalFeeds">{{comma .TotalFeeds}}</div>
<div class="stat-label">Total Feeds</div> <div class="stat-label">Total Feeds</div>
</div> </div>
<div class="card"> <div class="card">
<div class="stat-value" style="color: #f90">{{.RSSFeeds}}</div> <div class="stat-value" style="color: #f90" id="rssFeeds">{{comma .RSSFeeds}}</div>
<div class="stat-label">RSS Feeds</div> <div class="stat-label">RSS Feeds</div>
</div> </div>
<div class="card"> <div class="card">
<div class="stat-value" style="color: #09f">{{.AtomFeeds}}</div> <div class="stat-value" style="color: #09f" id="atomFeeds">{{comma .AtomFeeds}}</div>
<div class="stat-label">Atom Feeds</div> <div class="stat-label">Atom Feeds</div>
</div> </div>
<div class="card"> <div class="card">
<div class="stat-value" style="color: #666">{{.UnknownFeeds}}</div> <div class="stat-value" style="color: #666" id="unknownFeeds">{{comma .UnknownFeeds}}</div>
<div class="stat-label">Unknown Type</div> <div class="stat-label">Unknown Type</div>
</div> </div>
</div> </div>
<div class="grid" style="grid-template-columns: 1fr 1fr;">
<div class="card">
<h2 style="margin-top: 0;">Top TLDs</h2>
{{range .TopTLDs}}
<div class="stat-row">
<span>.{{.TLD}}</span>
<span>{{.Count}}</span>
</div>
{{else}}
<div style="color: #444;">No data yet</div>
{{end}}
</div>
<div class="card">
<h2 style="margin-top: 0;">Top Domains</h2>
{{range .TopDomains}}
<div class="stat-row">
<span>{{.Host}}</span>
<span>{{.FeedsFound}}</span>
</div>
{{else}}
<div style="color: #444;">No data yet</div>
{{end}}
</div>
</div>
<h2>Recent Feeds</h2>
<div class="card"> <div class="card">
<table> <h2 style="margin-top: 0;">Feeds</h2>
<thead> <div style="margin-bottom: 15px;">
<tr> <input type="text" id="searchInput" placeholder="Search feeds and items..."
<th>URL</th> style="width: 100%; padding: 10px; background: #0a0a0a; border: 1px solid #333; border-radius: 4px; color: #fff; font-size: 14px;">
<th>Title</th> </div>
<th>Type</th> <div id="searchResults" style="display: none;"></div>
<th>Discovered</th> <div id="allDomainsContainer">
</tr> <div id="allDomains"></div>
</thead> <div id="allDomainsLoading" style="text-align: center; padding: 10px; color: #666;">Loading...</div>
<tbody> </div>
{{range .RecentFeeds}}
<tr>
<td class="url">{{.URL}}</td>
<td>{{if .Title}}{{.Title}}{{else}}-{{end}}</td>
<td class="type-{{.Type}}">{{.Type}}</td>
<td class="time">{{.DiscoveredAt.Format "15:04:05"}}</td>
</tr>
{{else}}
<tr><td colspan="4" style="color: #444;">No feeds discovered yet</td></tr>
{{end}}
</tbody>
</table>
</div> </div>
<div class="updated">Last updated: {{.UpdatedAt.Format "2006-01-02 15:04:05"}}</div> <div class="updated" id="updatedAt">Last updated: {{.UpdatedAt.Format "2006-01-02 15:04:05"}}</div>
</body> </body>
</html>` </html>`
+192
@@ -0,0 +1,192 @@
package main
import (
"database/sql"
"fmt"
_ "modernc.org/sqlite"
)
const schema = `
CREATE TABLE IF NOT EXISTS domains (
host TEXT PRIMARY KEY,
status TEXT NOT NULL DEFAULT 'unchecked',
discoveredAt DATETIME NOT NULL,
lastCrawledAt DATETIME,
feedsFound INTEGER DEFAULT 0,
lastError TEXT,
tld TEXT
);
CREATE INDEX IF NOT EXISTS idx_domains_status ON domains(status);
CREATE INDEX IF NOT EXISTS idx_domains_tld ON domains(tld);
CREATE INDEX IF NOT EXISTS idx_domains_feedsFound ON domains(feedsFound DESC) WHERE feedsFound > 0;
CREATE TABLE IF NOT EXISTS feeds (
url TEXT PRIMARY KEY,
type TEXT,
title TEXT,
description TEXT,
language TEXT,
siteUrl TEXT,
discoveredAt DATETIME NOT NULL,
lastCrawledAt DATETIME,
nextCrawlAt DATETIME,
lastBuildDate DATETIME,
etag TEXT,
lastModified TEXT,
ttlMinutes INTEGER,
updatePeriod TEXT,
updateFreq INTEGER,
status TEXT DEFAULT 'active',
errorCount INTEGER DEFAULT 0,
lastError TEXT,
lastErrorAt DATETIME,
sourceUrl TEXT,
sourceHost TEXT,
tld TEXT,
itemCount INTEGER,
avgPostFreqHrs REAL,
oldestItemDate DATETIME,
newestItemDate DATETIME,
noUpdate INTEGER DEFAULT 0
);
CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost ON feeds(sourceHost);
CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost_url ON feeds(sourceHost, url);
CREATE INDEX IF NOT EXISTS idx_feeds_tld ON feeds(tld);
CREATE INDEX IF NOT EXISTS idx_feeds_tld_sourceHost ON feeds(tld, sourceHost);
CREATE INDEX IF NOT EXISTS idx_feeds_type ON feeds(type);
CREATE INDEX IF NOT EXISTS idx_feeds_status ON feeds(status);
CREATE INDEX IF NOT EXISTS idx_feeds_discoveredAt ON feeds(discoveredAt);
CREATE INDEX IF NOT EXISTS idx_feeds_title ON feeds(title);
CREATE TABLE IF NOT EXISTS items (
id INTEGER PRIMARY KEY AUTOINCREMENT,
feedUrl TEXT NOT NULL,
guid TEXT,
title TEXT,
link TEXT,
description TEXT,
content TEXT,
author TEXT,
pubDate DATETIME,
discoveredAt DATETIME NOT NULL,
updatedAt DATETIME,
UNIQUE(feedUrl, guid)
);
CREATE INDEX IF NOT EXISTS idx_items_feedUrl ON items(feedUrl);
CREATE INDEX IF NOT EXISTS idx_items_pubDate ON items(pubDate DESC);
CREATE INDEX IF NOT EXISTS idx_items_link ON items(link);
CREATE INDEX IF NOT EXISTS idx_items_feedUrl_pubDate ON items(feedUrl, pubDate DESC);
-- Full-text search for feeds
CREATE VIRTUAL TABLE IF NOT EXISTS feeds_fts USING fts5(
url,
title,
description,
content='feeds',
content_rowid='rowid'
);
-- Triggers to keep FTS in sync
CREATE TRIGGER IF NOT EXISTS feeds_ai AFTER INSERT ON feeds BEGIN
INSERT INTO feeds_fts(rowid, url, title, description)
VALUES (NEW.rowid, NEW.url, NEW.title, NEW.description);
END;
CREATE TRIGGER IF NOT EXISTS feeds_ad AFTER DELETE ON feeds BEGIN
INSERT INTO feeds_fts(feeds_fts, rowid, url, title, description)
VALUES ('delete', OLD.rowid, OLD.url, OLD.title, OLD.description);
END;
CREATE TRIGGER IF NOT EXISTS feeds_au AFTER UPDATE ON feeds BEGIN
INSERT INTO feeds_fts(feeds_fts, rowid, url, title, description)
VALUES ('delete', OLD.rowid, OLD.url, OLD.title, OLD.description);
INSERT INTO feeds_fts(rowid, url, title, description)
VALUES (NEW.rowid, NEW.url, NEW.title, NEW.description);
END;
-- Full-text search for items
CREATE VIRTUAL TABLE IF NOT EXISTS items_fts USING fts5(
title,
description,
content,
author,
content='items',
content_rowid='id'
);
-- Triggers to keep items FTS in sync
CREATE TRIGGER IF NOT EXISTS items_ai AFTER INSERT ON items BEGIN
INSERT INTO items_fts(rowid, title, description, content, author)
VALUES (NEW.id, NEW.title, NEW.description, NEW.content, NEW.author);
END;
CREATE TRIGGER IF NOT EXISTS items_ad AFTER DELETE ON items BEGIN
INSERT INTO items_fts(items_fts, rowid, title, description, content, author)
VALUES ('delete', OLD.id, OLD.title, OLD.description, OLD.content, OLD.author);
END;
CREATE TRIGGER IF NOT EXISTS items_au AFTER UPDATE ON items BEGIN
INSERT INTO items_fts(items_fts, rowid, title, description, content, author)
VALUES ('delete', OLD.id, OLD.title, OLD.description, OLD.content, OLD.author);
INSERT INTO items_fts(rowid, title, description, content, author)
VALUES (NEW.id, NEW.title, NEW.description, NEW.content, NEW.author);
END;
`
func OpenDatabase(dbPath string) (*sql.DB, error) {
fmt.Printf("Opening database: %s\n", dbPath)
// Use pragmas in connection string for consistent application
connStr := dbPath + "?_pragma=busy_timeout(10000)&_pragma=journal_mode(WAL)&_pragma=foreign_keys(ON)"
db, err := sql.Open("sqlite", connStr)
if err != nil {
return nil, fmt.Errorf("failed to open database: %v", err)
}
// Allow multiple readers (WAL mode supports concurrent reads)
// SQLite is single-writer, but reads can happen concurrently
db.SetMaxOpenConns(4)
// Verify connection and show journal mode
var journalMode string
if err := db.QueryRow("PRAGMA journal_mode").Scan(&journalMode); err != nil {
fmt.Printf(" Warning: could not query journal_mode: %v\n", err)
} else {
fmt.Printf(" Journal mode: %s\n", journalMode)
}
// Create schema
if _, err := db.Exec(schema); err != nil {
db.Close()
return nil, fmt.Errorf("failed to create schema: %v", err)
}
fmt.Println(" Schema OK")
// Run stats and ANALYZE in background to avoid blocking startup with large databases
go func() {
var domainCount, feedCount int
db.QueryRow("SELECT COUNT(*) FROM domains").Scan(&domainCount)
db.QueryRow("SELECT COUNT(*) FROM feeds").Scan(&feedCount)
fmt.Printf(" Existing data: %d domains, %d feeds\n", domainCount, feedCount)
fmt.Println(" Running ANALYZE...")
if _, err := db.Exec("ANALYZE"); err != nil {
fmt.Printf(" Warning: ANALYZE failed: %v\n", err)
} else {
fmt.Println(" ANALYZE complete")
}
}()
return db, nil
}
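feeds_fts and items_fts are external-content FTS5 tables kept in sync by the triggers above. If an index ever drifts from its content table (for example after bulk changes made with the triggers dropped), FTS5's built-in 'rebuild' command regenerates it. A sketch, assuming a *sql.DB returned by OpenDatabase:

// rebuildFTS repopulates both full-text indexes from their content tables.
func rebuildFTS(db *sql.DB) error {
	for _, stmt := range []string{
		`INSERT INTO feeds_fts(feeds_fts) VALUES('rebuild')`,
		`INSERT INTO items_fts(items_fts) VALUES('rebuild')`,
	} {
		if _, err := db.Exec(stmt); err != nil {
			return err
		}
	}
	return nil
}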
+296 -122
@@ -3,20 +3,19 @@ package main
import ( import (
"bufio" "bufio"
"compress/gzip" "compress/gzip"
"encoding/json" "database/sql"
"fmt" "fmt"
"io" "io"
"os" "os"
"strings" "strings"
"sync/atomic"
"time" "time"
"github.com/cockroachdb/pebble"
) )
// Domain represents a host to be crawled for feeds // Domain represents a host to be crawled for feeds
type Domain struct { type Domain struct {
Host string `json:"host"` // Normalized hostname (no scheme, no www.) Host string `json:"host"`
Status string `json:"status"` // "uncrawled", "crawled", "error" Status string `json:"status"`
DiscoveredAt time.Time `json:"discovered_at"` DiscoveredAt time.Time `json:"discovered_at"`
LastCrawledAt time.Time `json:"last_crawled_at,omitempty"` LastCrawledAt time.Time `json:"last_crawled_at,omitempty"`
FeedsFound int `json:"feeds_found,omitempty"` FeedsFound int `json:"feeds_found,omitempty"`
@@ -24,130 +23,162 @@ type Domain struct {
TLD string `json:"tld,omitempty"` TLD string `json:"tld,omitempty"`
} }
// saveDomain stores a domain in PebbleDB // saveDomain stores a domain in SQLite
func (c *Crawler) saveDomain(domain *Domain) error { func (c *Crawler) saveDomain(domain *Domain) error {
data, err := json.Marshal(domain) _, err := c.db.Exec(`
if err != nil { INSERT INTO domains (host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld)
return fmt.Errorf("failed to marshal domain: %v", err) VALUES (?, ?, ?, ?, ?, ?, ?)
} ON CONFLICT(host) DO UPDATE SET
status = excluded.status,
key := []byte("domain:" + domain.Host) lastCrawledAt = excluded.lastCrawledAt,
return c.db.Set(key, data, pebble.Sync) feedsFound = excluded.feedsFound,
lastError = excluded.lastError,
tld = excluded.tld
`, domain.Host, domain.Status, domain.DiscoveredAt, nullTime(domain.LastCrawledAt),
domain.FeedsFound, nullString(domain.LastError), domain.TLD)
return err
} }
// getDomain retrieves a domain from PebbleDB // saveDomainTx stores a domain using a transaction
func (c *Crawler) getDomain(host string) (*Domain, error) { func (c *Crawler) saveDomainTx(tx *sql.Tx, domain *Domain) error {
key := []byte("domain:" + normalizeHost(host)) _, err := tx.Exec(`
data, closer, err := c.db.Get(key) INSERT INTO domains (host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld)
if err != nil { VALUES (?, ?, ?, ?, ?, ?, ?)
if err == pebble.ErrNotFound { ON CONFLICT(host) DO NOTHING
return nil, nil `, domain.Host, domain.Status, domain.DiscoveredAt, nullTime(domain.LastCrawledAt),
} domain.FeedsFound, nullString(domain.LastError), domain.TLD)
return nil, err return err
}
defer closer.Close()
var domain Domain
if err := json.Unmarshal(data, &domain); err != nil {
return nil, fmt.Errorf("failed to unmarshal domain: %v", err)
}
return &domain, nil
} }
// domainExists checks if a domain already exists in the database // domainExists checks if a domain already exists in the database
func (c *Crawler) domainExists(host string) bool { func (c *Crawler) domainExists(host string) bool {
key := []byte("domain:" + normalizeHost(host)) var exists bool
_, closer, err := c.db.Get(key) err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM domains WHERE host = ?)", normalizeHost(host)).Scan(&exists)
if err != nil { return err == nil && exists
return false
}
closer.Close()
return true
} }
// GetUncrawledDomains returns all domains with status "uncrawled" // getDomain retrieves a domain from SQLite
func (c *Crawler) GetUncrawledDomains() ([]*Domain, error) { func (c *Crawler) getDomain(host string) (*Domain, error) {
var domains []*Domain domain := &Domain{}
var lastCrawledAt sql.NullTime
var lastError sql.NullString
iter, err := c.db.NewIter(&pebble.IterOptions{ err := c.db.QueryRow(`
LowerBound: []byte("domain:"), SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld
UpperBound: []byte("domain:\xff"), FROM domains WHERE host = ?
}) `, normalizeHost(host)).Scan(
&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCrawledAt,
&domain.FeedsFound, &lastError, &domain.TLD,
)
if err == sql.ErrNoRows {
return nil, nil
}
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer iter.Close()
for iter.First(); iter.Valid(); iter.Next() { if lastCrawledAt.Valid {
var domain Domain domain.LastCrawledAt = lastCrawledAt.Time
if err := json.Unmarshal(iter.Value(), &domain); err != nil { }
continue if lastError.Valid {
} domain.LastError = lastError.String
if domain.Status == "uncrawled" {
domains = append(domains, &domain)
}
} }
if err := iter.Error(); err != nil { return domain, nil
}
// GetUncheckedDomains returns all domains with status "unchecked"
func (c *Crawler) GetUncheckedDomains() ([]*Domain, error) {
rows, err := c.db.Query(`
SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld
FROM domains WHERE status = 'unchecked'
`)
if err != nil {
return nil, err return nil, err
} }
defer rows.Close()
return domains, nil return c.scanDomains(rows)
}
// GetUncheckedDomainsRandom returns up to limit unchecked domains in random order
func (c *Crawler) GetUncheckedDomainsRandom(limit int) ([]*Domain, error) {
rows, err := c.db.Query(`
SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld
FROM domains WHERE status = 'unchecked'
ORDER BY RANDOM()
LIMIT ?
`, limit)
if err != nil {
return nil, err
}
defer rows.Close()
return c.scanDomains(rows)
}
// scanDomains is a helper to scan multiple domain rows
func (c *Crawler) scanDomains(rows *sql.Rows) ([]*Domain, error) {
var domains []*Domain
for rows.Next() {
domain := &Domain{}
var lastCrawledAt sql.NullTime
var lastError sql.NullString
if err := rows.Scan(
&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCrawledAt,
&domain.FeedsFound, &lastError, &domain.TLD,
); err != nil {
continue
}
if lastCrawledAt.Valid {
domain.LastCrawledAt = lastCrawledAt.Time
}
if lastError.Valid {
domain.LastError = lastError.String
}
domains = append(domains, domain)
}
return domains, rows.Err()
} }
// markDomainCrawled updates a domain's status after crawling // markDomainCrawled updates a domain's status after crawling
func (c *Crawler) markDomainCrawled(host string, feedsFound int, lastError string) error { func (c *Crawler) markDomainCrawled(host string, feedsFound int, lastError string) error {
domain, err := c.getDomain(host) status := "checked"
if err != nil {
return err
}
if domain == nil {
return fmt.Errorf("domain not found: %s", host)
}
domain.LastCrawledAt = time.Now()
domain.FeedsFound = feedsFound
if lastError != "" { if lastError != "" {
domain.Status = "error" status = "error"
domain.LastError = lastError
} else {
domain.Status = "crawled"
domain.LastError = ""
} }
return c.saveDomain(domain) var err error
if lastError != "" {
_, err = c.db.Exec(`
UPDATE domains SET status = ?, lastCrawledAt = ?, feedsFound = ?, lastError = ?
WHERE host = ?
`, status, time.Now(), feedsFound, lastError, normalizeHost(host))
} else {
_, err = c.db.Exec(`
UPDATE domains SET status = ?, lastCrawledAt = ?, feedsFound = ?, lastError = NULL
WHERE host = ?
`, status, time.Now(), feedsFound, normalizeHost(host))
}
return err
} }
// GetDomainCount returns the total number of domains in the database // GetDomainCount returns the total number of domains in the database
func (c *Crawler) GetDomainCount() (total int, uncrawled int, err error) { func (c *Crawler) GetDomainCount() (total int, unchecked int, err error) {
iter, err := c.db.NewIter(&pebble.IterOptions{ err = c.db.QueryRow("SELECT COUNT(*) FROM domains").Scan(&total)
LowerBound: []byte("domain:"),
UpperBound: []byte("domain:\xff"),
})
if err != nil { if err != nil {
return 0, 0, err return 0, 0, err
} }
defer iter.Close() err = c.db.QueryRow("SELECT COUNT(*) FROM domains WHERE status = 'unchecked'").Scan(&unchecked)
return total, unchecked, err
for iter.First(); iter.Valid(); iter.Next() {
total++
var domain Domain
if err := json.Unmarshal(iter.Value(), &domain); err != nil {
continue
}
if domain.Status == "uncrawled" {
uncrawled++
}
}
if err := iter.Error(); err != nil {
return 0, 0, err
}
return total, uncrawled, nil
} }
// ImportDomainsFromFile reads a vertices file and stores new domains as "uncrawled" // ImportDomainsFromFile reads a vertices file and stores new domains as "unchecked"
func (c *Crawler) ImportDomainsFromFile(filename string, limit int) (imported int, skipped int, err error) { func (c *Crawler) ImportDomainsFromFile(filename string, limit int) (imported int, skipped int, err error) {
file, err := os.Open(filename) file, err := os.Open(filename)
if err != nil { if err != nil {
@@ -158,6 +189,110 @@ func (c *Crawler) ImportDomainsFromFile(filename string, limit int) (imported in
return c.parseAndStoreDomains(file, limit) return c.parseAndStoreDomains(file, limit)
} }
// ImportDomainsInBackground starts domain import in a background goroutine
func (c *Crawler) ImportDomainsInBackground(filename string) {
go func() {
file, err := os.Open(filename)
if err != nil {
fmt.Printf("Failed to open vertices file: %v\n", err)
return
}
defer file.Close()
var bodyReader io.Reader
bufReader := bufio.NewReader(file)
peekBytes, err := bufReader.Peek(2)
if err != nil && err != io.EOF {
fmt.Printf("Failed to peek at file: %v\n", err)
return
}
if len(peekBytes) >= 2 && peekBytes[0] == 0x1f && peekBytes[1] == 0x8b {
gzReader, err := gzip.NewReader(bufReader)
if err != nil {
fmt.Printf("Failed to create gzip reader: %v\n", err)
return
}
defer gzReader.Close()
bodyReader = gzReader
} else {
bodyReader = bufReader
}
scanner := bufio.NewScanner(bodyReader)
buf := make([]byte, 0, 64*1024)
scanner.Buffer(buf, 1024*1024)
const batchSize = 10000
now := time.Now()
nowStr := now.Format("2006-01-02 15:04:05")
totalImported := 0
batchCount := 0
type domainEntry struct {
host string
tld string
}
for {
// Read and canonicalize batch
var domains []domainEntry
for len(domains) < batchSize && scanner.Scan() {
line := scanner.Text()
parts := strings.Split(line, "\t")
if len(parts) >= 2 {
reverseHostName := strings.TrimSpace(parts[1])
if reverseHostName != "" {
host := normalizeHost(reverseHost(reverseHostName))
domains = append(domains, domainEntry{host: host, tld: getTLD(host)})
}
}
}
if len(domains) == 0 {
break
}
// Build bulk INSERT statement
var sb strings.Builder
sb.WriteString("INSERT INTO domains (host, status, discoveredAt, tld) VALUES ")
args := make([]interface{}, 0, len(domains)*4)
for i, d := range domains {
if i > 0 {
sb.WriteString(",")
}
sb.WriteString("(?, 'unchecked', ?, ?)")
args = append(args, d.host, nowStr, d.tld)
}
sb.WriteString(" ON CONFLICT(host) DO NOTHING")
// Execute bulk insert
result, err := c.db.Exec(sb.String(), args...)
imported := 0
if err != nil {
fmt.Printf("Bulk insert error: %v\n", err)
} else {
rowsAffected, _ := result.RowsAffected()
imported = int(rowsAffected)
}
batchCount++
totalImported += imported
atomic.AddInt32(&c.domainsImported, int32(imported))
// Wait 1 second before the next batch
time.Sleep(1 * time.Second)
}
if err := scanner.Err(); err != nil {
fmt.Printf("Error reading vertices file: %v\n", err)
}
fmt.Printf("Background import complete: %d domains imported\n", totalImported)
}()
}
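Usage is fire-and-forget: the call returns immediately and the import proceeds in batches of 10,000 with a one-second pause between them; the file may be plain text or gzip (detected from the magic bytes). The filename below is an assumption for illustration:

c.ImportDomainsInBackground("feeds/cc-main-host-vertices.txt.gz") // hypothetical path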
func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported int, skipped int, err error) { func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported int, skipped int, err error) {
var bodyReader io.Reader var bodyReader io.Reader
@@ -183,39 +318,63 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
scanner.Buffer(buf, 1024*1024) scanner.Buffer(buf, 1024*1024)
now := time.Now() now := time.Now()
nowStr := now.Format("2006-01-02 15:04:05")
count := 0 count := 0
const batchSize = 1000
for scanner.Scan() { type domainEntry struct {
if limit > 0 && count >= limit { host string
tld string
}
for {
// Read and canonicalize batch
var domains []domainEntry
for len(domains) < batchSize && scanner.Scan() {
if limit > 0 && count >= limit {
break
}
line := scanner.Text()
parts := strings.Split(line, "\t")
if len(parts) >= 2 {
reverseHostName := strings.TrimSpace(parts[1])
if reverseHostName != "" {
host := normalizeHost(reverseHost(reverseHostName))
domains = append(domains, domainEntry{host: host, tld: getTLD(host)})
count++
}
}
}
if len(domains) == 0 {
break break
} }
line := scanner.Text() // Build bulk INSERT statement
parts := strings.Split(line, "\t") var sb strings.Builder
if len(parts) >= 2 { sb.WriteString("INSERT INTO domains (host, status, discoveredAt, tld) VALUES ")
reverseHostName := strings.TrimSpace(parts[1]) args := make([]interface{}, 0, len(domains)*4)
if reverseHostName != "" { for i, d := range domains {
host := normalizeHost(reverseHost(reverseHostName)) if i > 0 {
count++ sb.WriteString(",")
// Skip if domain already exists
if c.domainExists(host) {
skipped++
continue
}
// Store new domain as uncrawled
domain := &Domain{
Host: host,
Status: "uncrawled",
DiscoveredAt: now,
TLD: getTLD(host),
}
if err := c.saveDomain(domain); err != nil {
continue
}
imported++
} }
sb.WriteString("(?, 'unchecked', ?, ?)")
args = append(args, d.host, nowStr, d.tld)
}
sb.WriteString(" ON CONFLICT(host) DO NOTHING")
// Execute bulk insert
result, execErr := c.db.Exec(sb.String(), args...)
if execErr != nil {
skipped += len(domains)
continue
}
rowsAffected, _ := result.RowsAffected()
imported += int(rowsAffected)
skipped += len(domains) - int(rowsAffected)
if limit > 0 && count >= limit {
break
} }
} }
@@ -225,3 +384,18 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
return imported, skipped, nil return imported, skipped, nil
} }
// Helper functions for SQL null handling
func nullTime(t time.Time) sql.NullTime {
if t.IsZero() {
return sql.NullTime{}
}
return sql.NullTime{Time: t, Valid: true}
}
func nullString(s string) sql.NullString {
if s == "" {
return sql.NullString{}
}
return sql.NullString{String: s, Valid: true}
}
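For clarity, how the helpers map Go zero values onto SQL NULL (illustrative, not part of the diff):

func exampleNullHelpers() {
	_ = nullString("")        // Valid: false — stored as NULL
	_ = nullString("timeout") // Valid: true  — stored as 'timeout'
	_ = nullTime(time.Time{}) // Valid: false — stored as NULL
	_ = nullTime(time.Now())  // Valid: true  — stored as a timestamp
}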
+734 -69
@@ -1,15 +1,86 @@
package main package main
import ( import (
"encoding/json" "database/sql"
"fmt" "fmt"
"io"
"net/http" "net/http"
"net/url"
"regexp"
"strings" "strings"
"sync/atomic"
"time" "time"
"github.com/cockroachdb/pebble"
) )
// shouldSkipFeed checks if a feed URL should be filtered out
// Returns true (and a reason) if the feed should be skipped
func shouldSkipFeed(feedURL string) (bool, string) {
lower := strings.ToLower(feedURL)
// Skip explicit comment feeds
if strings.Contains(lower, "/comment") {
return true, "comment feed"
}
u, err := url.Parse(feedURL)
if err != nil {
return false, ""
}
path := strings.ToLower(strings.TrimSuffix(u.Path, "/"))
// Skip category/tag feeds
categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/", "/author/"}
for _, pattern := range categoryPatterns {
if strings.Contains(path, pattern) {
return true, "category/tag feed"
}
}
// Check for article comment feeds (path ending in /feed with content before it)
if strings.HasSuffix(path, "/feed") {
basePath := strings.TrimSuffix(path, "/feed")
basePath = strings.Trim(basePath, "/")
if basePath == "" {
return false, "" // Just /feed - legitimate main feed
}
// Skip if path contains date patterns (likely article)
if matched, _ := regexp.MatchString(`/\d{4}/\d{2}`, basePath); matched {
return true, "article feed (date pattern)"
}
// Skip if path has multiple segments (likely article or nested content)
segments := strings.Split(basePath, "/")
if len(segments) >= 2 {
return true, "article feed (nested path)"
}
// Skip if single segment looks like an article slug (contains hyphens, is long)
if len(segments) == 1 && (strings.Contains(segments[0], "-") && len(segments[0]) > 20) {
return true, "article feed (slug pattern)"
}
}
return false, ""
}
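Illustrative outcomes of the rules above (the URLs are made up for the example):

func exampleShouldSkipFeed() {
	shouldSkipFeed("https://example.com/feed")                            // false — site's main feed
	shouldSkipFeed("https://example.com/comments/feed")                   // true, "comment feed"
	shouldSkipFeed("https://example.com/category/sports/feed")            // true, "category/tag feed"
	shouldSkipFeed("https://example.com/blog/2024/05/some-post/feed")     // true, "article feed (date pattern)"
	shouldSkipFeed("https://example.com/news/local/feed")                 // true, "article feed (nested path)"
	shouldSkipFeed("https://example.com/my-very-long-article-title/feed") // true, "article feed (slug pattern)"
}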
// Item represents an individual entry/article from a feed
type Item struct {
ID int64 `json:"id,omitempty"`
FeedURL string `json:"feed_url"`
GUID string `json:"guid,omitempty"`
Title string `json:"title,omitempty"`
Link string `json:"link,omitempty"`
Description string `json:"description,omitempty"`
Content string `json:"content,omitempty"`
Author string `json:"author,omitempty"`
PubDate time.Time `json:"pub_date,omitempty"`
DiscoveredAt time.Time `json:"discovered_at"`
UpdatedAt time.Time `json:"updated_at,omitempty"`
}
// Feed represents a discovered RSS/Atom feed with metadata // Feed represents a discovered RSS/Atom feed with metadata
type Feed struct { type Feed struct {
URL string `json:"url"` URL string `json:"url"`
@@ -50,99 +121,548 @@ type Feed struct {
AvgPostFreqHrs float64 `json:"avg_post_freq_hrs,omitempty"` // Average hours between posts AvgPostFreqHrs float64 `json:"avg_post_freq_hrs,omitempty"` // Average hours between posts
OldestItemDate time.Time `json:"oldest_item_date,omitempty"` OldestItemDate time.Time `json:"oldest_item_date,omitempty"`
NewestItemDate time.Time `json:"newest_item_date,omitempty"` NewestItemDate time.Time `json:"newest_item_date,omitempty"`
// Adaptive check interval
NoUpdate int `json:"no_update"` // Consecutive checks with no change
} }
// saveFeed stores a feed in PebbleDB // saveFeed stores a feed in SQLite
func (c *Crawler) saveFeed(feed *Feed) error { func (c *Crawler) saveFeed(feed *Feed) error {
data, err := json.Marshal(feed) _, err := c.db.Exec(`
if err != nil { INSERT INTO feeds (
return fmt.Errorf("failed to marshal feed: %v", err) url, type, title, description, language, siteUrl,
} discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
key := []byte("feed:" + feed.URL) ttlMinutes, updatePeriod, updateFreq,
return c.db.Set(key, data, pebble.Sync) status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
type = excluded.type,
title = excluded.title,
description = excluded.description,
language = excluded.language,
siteUrl = excluded.siteUrl,
lastCrawledAt = excluded.lastCrawledAt,
nextCrawlAt = excluded.nextCrawlAt,
lastBuildDate = excluded.lastBuildDate,
etag = excluded.etag,
lastModified = excluded.lastModified,
ttlMinutes = excluded.ttlMinutes,
updatePeriod = excluded.updatePeriod,
updateFreq = excluded.updateFreq,
status = excluded.status,
errorCount = excluded.errorCount,
lastError = excluded.lastError,
lastErrorAt = excluded.lastErrorAt,
itemCount = excluded.itemCount,
avgPostFreqHrs = excluded.avgPostFreqHrs,
oldestItemDate = excluded.oldestItemDate,
newestItemDate = excluded.newestItemDate,
noUpdate = excluded.noUpdate
`,
feed.URL, feed.Type, nullString(feed.Title), nullString(feed.Description),
nullString(feed.Language), nullString(feed.SiteURL),
feed.DiscoveredAt, nullTime(feed.LastCrawledAt), nullTime(feed.NextCrawlAt), nullTime(feed.LastBuildDate),
nullString(feed.ETag), nullString(feed.LastModified),
feed.TTLMinutes, nullString(feed.UpdatePeriod), feed.UpdateFreq,
feed.Status, feed.ErrorCount, nullString(feed.LastError), nullTime(feed.LastErrorAt),
nullString(feed.SourceURL), nullString(feed.SourceHost), nullString(feed.TLD),
feed.ItemCount, feed.AvgPostFreqHrs, nullTime(feed.OldestItemDate), nullTime(feed.NewestItemDate),
feed.NoUpdate,
)
return err
} }
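// nullString and nullTime are small helpers defined elsewhere in the data layer (not
// part of this hunk). A minimal sketch of what they presumably do - map Go zero
// values to SQL NULL so empty strings and zero times are not stored as real values:
//
//	func nullString(s string) interface{} {
//		if s == "" {
//			return nil
//		}
//		return s
//	}
//
//	func nullTime(t time.Time) interface{} {
//		if t.IsZero() {
//			return nil
//		}
//		return t
//	}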
// getFeed retrieves a feed from PebbleDB // getFeed retrieves a feed from SQLite
func (c *Crawler) getFeed(feedURL string) (*Feed, error) { func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
key := []byte("feed:" + normalizeURL(feedURL)) feed := &Feed{}
data, closer, err := c.db.Get(key) var title, description, language, siteURL sql.NullString
var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
var avgPostFreqHrs sql.NullFloat64
err := c.db.QueryRow(`
SELECT url, type, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
FROM feeds WHERE url = ?
`, normalizeURL(feedURL)).Scan(
&feed.URL, &feed.Type, &title, &description, &language, &siteURL,
&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
&etag, &lastModified,
&feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
&feed.Status, &feed.ErrorCount, &lastError, &lastErrorAt,
&sourceURL, &sourceHost, &tld,
&feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
&feed.NoUpdate,
)
if err == sql.ErrNoRows {
return nil, nil
}
if err != nil { if err != nil {
if err == pebble.ErrNotFound {
return nil, nil
}
return nil, err return nil, err
} }
defer closer.Close()
var feed Feed // Handle nullable fields
if err := json.Unmarshal(data, &feed); err != nil { if title.Valid {
return nil, fmt.Errorf("failed to unmarshal feed: %v", err) feed.Title = title.String
} }
return &feed, nil if description.Valid {
feed.Description = description.String
}
if language.Valid {
feed.Language = language.String
}
if siteURL.Valid {
feed.SiteURL = siteURL.String
}
if lastCrawledAt.Valid {
feed.LastCrawledAt = lastCrawledAt.Time
}
if nextCrawlAt.Valid {
feed.NextCrawlAt = nextCrawlAt.Time
}
if lastBuildDate.Valid {
feed.LastBuildDate = lastBuildDate.Time
}
if etag.Valid {
feed.ETag = etag.String
}
if lastModified.Valid {
feed.LastModified = lastModified.String
}
if updatePeriod.Valid {
feed.UpdatePeriod = updatePeriod.String
}
if lastError.Valid {
feed.LastError = lastError.String
}
if lastErrorAt.Valid {
feed.LastErrorAt = lastErrorAt.Time
}
if sourceURL.Valid {
feed.SourceURL = sourceURL.String
}
if sourceHost.Valid {
feed.SourceHost = sourceHost.String
}
if tld.Valid {
feed.TLD = tld.String
}
if avgPostFreqHrs.Valid {
feed.AvgPostFreqHrs = avgPostFreqHrs.Float64
}
if oldestItemDate.Valid {
feed.OldestItemDate = oldestItemDate.Time
}
if newestItemDate.Valid {
feed.NewestItemDate = newestItemDate.Time
}
return feed, nil
} }
// feedExists checks if a feed URL already exists in the database // feedExists checks if a feed URL already exists in the database
func (c *Crawler) feedExists(feedURL string) bool { func (c *Crawler) feedExists(feedURL string) bool {
key := []byte("feed:" + normalizeURL(feedURL)) var exists bool
_, closer, err := c.db.Get(key) err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM feeds WHERE url = ?)", normalizeURL(feedURL)).Scan(&exists)
if err != nil { return err == nil && exists
return false
}
closer.Close()
return true
} }
// GetAllFeeds returns all feeds from the database // GetAllFeeds returns all feeds from the database
func (c *Crawler) GetAllFeeds() ([]*Feed, error) { func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
var feeds []*Feed rows, err := c.db.Query(`
SELECT url, type, title, description, language, siteUrl,
iter, err := c.db.NewIter(&pebble.IterOptions{ discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
LowerBound: []byte("feed:"), etag, lastModified,
UpperBound: []byte("feed:\xff"), ttlMinutes, updatePeriod, updateFreq,
}) status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
FROM feeds
`)
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer iter.Close() defer rows.Close()
for iter.First(); iter.Valid(); iter.Next() { return scanFeeds(rows)
var feed Feed
if err := json.Unmarshal(iter.Value(), &feed); err != nil {
continue
}
feeds = append(feeds, &feed)
}
if err := iter.Error(); err != nil {
return nil, err
}
return feeds, nil
} }
// GetFeedCount returns the total number of feeds in the database // GetFeedCount returns the total number of feeds in the database
func (c *Crawler) GetFeedCount() (int, error) { func (c *Crawler) GetFeedCount() (int, error) {
count := 0 var count int
err := c.db.QueryRow("SELECT COUNT(*) FROM feeds").Scan(&count)
return count, err
}
iter, err := c.db.NewIter(&pebble.IterOptions{ // GetFeedCountByHost returns the number of feeds for a specific host
LowerBound: []byte("feed:"), func (c *Crawler) GetFeedCountByHost(host string) (int, error) {
UpperBound: []byte("feed:\xff"), var count int
}) err := c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE sourceHost = ?", host).Scan(&count)
return count, err
}
// GetFeedsDueForCheck returns feeds where nextCrawlAt <= now, ordered randomly, limited to n
func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
rows, err := c.db.Query(`
SELECT url, type, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
FROM feeds
WHERE nextCrawlAt <= datetime('now') AND status != 'dead'
ORDER BY RANDOM()
LIMIT ?
`, limit)
if err != nil {
return nil, err
}
defer rows.Close()
return scanFeeds(rows)
}
// GetFeedsByHost returns all feeds from a specific host
func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
rows, err := c.db.Query(`
SELECT url, type, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
FROM feeds WHERE sourceHost = ?
`, host)
if err != nil {
return nil, err
}
defer rows.Close()
return scanFeeds(rows)
}
// SearchFeeds performs a full-text search on feeds
func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) {
rows, err := c.db.Query(`
SELECT f.url, f.type, f.title, f.description, f.language, f.siteUrl,
f.discoveredAt, f.lastCrawledAt, f.nextCrawlAt, f.lastBuildDate,
f.etag, f.lastModified,
f.ttlMinutes, f.updatePeriod, f.updateFreq,
f.status, f.errorCount, f.lastError, f.lastErrorAt,
f.sourceUrl, f.sourceHost, f.tld,
f.itemCount, f.avgPostFreqHrs, f.oldestItemDate, f.newestItemDate,
f.noUpdate
FROM feeds f
JOIN feeds_fts fts ON f.rowid = fts.rowid
WHERE feeds_fts MATCH ?
`, query)
if err != nil {
return nil, err
}
defer rows.Close()
return scanFeeds(rows)
}
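// SearchFeeds joins against a feeds_fts full-text index created in OpenDatabase
// (db.go, not shown in this diff). Assuming SQLite FTS5, the MATCH argument accepts
// the usual full-text query syntax, e.g.:
//
//	feeds, err := crawler.SearchFeeds("golang")                  // single term
//	feeds, err = crawler.SearchFeeds(`"climate change" OR news`) // phrase + OR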
// scanFeeds is a helper to scan multiple feed rows
func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
var feeds []*Feed
for rows.Next() {
feed := &Feed{}
var title, description, language, siteURL sql.NullString
var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
var avgPostFreqHrs sql.NullFloat64
if err := rows.Scan(
&feed.URL, &feed.Type, &title, &description, &language, &siteURL,
&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
&etag, &lastModified,
&feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
&feed.Status, &feed.ErrorCount, &lastError, &lastErrorAt,
&sourceURL, &sourceHost, &tld,
&feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
&feed.NoUpdate,
); err != nil {
continue
}
// Handle nullable fields
if title.Valid {
feed.Title = title.String
}
if description.Valid {
feed.Description = description.String
}
if language.Valid {
feed.Language = language.String
}
if siteURL.Valid {
feed.SiteURL = siteURL.String
}
if lastCrawledAt.Valid {
feed.LastCrawledAt = lastCrawledAt.Time
}
if nextCrawlAt.Valid {
feed.NextCrawlAt = nextCrawlAt.Time
}
if lastBuildDate.Valid {
feed.LastBuildDate = lastBuildDate.Time
}
if etag.Valid {
feed.ETag = etag.String
}
if lastModified.Valid {
feed.LastModified = lastModified.String
}
if updatePeriod.Valid {
feed.UpdatePeriod = updatePeriod.String
}
if lastError.Valid {
feed.LastError = lastError.String
}
if lastErrorAt.Valid {
feed.LastErrorAt = lastErrorAt.Time
}
if sourceURL.Valid {
feed.SourceURL = sourceURL.String
}
if sourceHost.Valid {
feed.SourceHost = sourceHost.String
}
if tld.Valid {
feed.TLD = tld.String
}
if avgPostFreqHrs.Valid {
feed.AvgPostFreqHrs = avgPostFreqHrs.Float64
}
if oldestItemDate.Valid {
feed.OldestItemDate = oldestItemDate.Time
}
if newestItemDate.Valid {
feed.NewestItemDate = newestItemDate.Time
}
feeds = append(feeds, feed)
}
return feeds, rows.Err()
}
// saveItem stores an item in SQLite (upsert by feedUrl + guid)
func (c *Crawler) saveItem(item *Item) error {
_, err := c.db.Exec(`
INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(feedUrl, guid) DO UPDATE SET
title = excluded.title,
link = excluded.link,
description = excluded.description,
content = excluded.content,
author = excluded.author,
pubDate = excluded.pubDate,
updatedAt = excluded.updatedAt
`,
item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
nullString(item.Description), nullString(item.Content), nullString(item.Author),
nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
)
return err
}
// saveItems stores multiple items efficiently
func (c *Crawler) saveItems(items []*Item) error {
if len(items) == 0 {
return nil
}
tx, err := c.db.Begin()
if err != nil {
return err
}
defer tx.Rollback()
stmt, err := tx.Prepare(`
INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(feedUrl, guid) DO UPDATE SET
title = excluded.title,
link = excluded.link,
description = excluded.description,
content = excluded.content,
author = excluded.author,
pubDate = excluded.pubDate,
updatedAt = excluded.updatedAt
`)
if err != nil {
return err
}
defer stmt.Close()
for _, item := range items {
if item == nil || item.GUID == "" {
continue // Skip nil items or items without GUID
}
_, err := stmt.Exec(
item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
nullString(item.Description), nullString(item.Content), nullString(item.Author),
nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
)
if err != nil {
continue // Skip failed items
}
}
return tx.Commit()
}
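// The ON CONFLICT(feedUrl, guid) clauses above assume a unique index on that column
// pair, created in OpenDatabase (not shown in this diff), along the lines of the
// statement below (index name hypothetical):
//
//	CREATE UNIQUE INDEX IF NOT EXISTS idx_items_feed_guid ON items(feedUrl, guid);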
// GetItemsByFeed returns all items for a specific feed
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
rows, err := c.db.Query(`
SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt
FROM items
WHERE feedUrl = ?
ORDER BY pubDate DESC
LIMIT ?
`, feedURL, limit)
if err != nil {
return nil, err
}
defer rows.Close()
var items []*Item
for rows.Next() {
item := &Item{}
var guid, title, link, description, content, author sql.NullString
var pubDate, updatedAt sql.NullTime
if err := rows.Scan(
&item.ID, &item.FeedURL, &guid, &title, &link,
&description, &content, &author, &pubDate,
&item.DiscoveredAt, &updatedAt,
); err != nil {
continue
}
if guid.Valid {
item.GUID = guid.String
}
if title.Valid {
item.Title = title.String
}
if link.Valid {
item.Link = link.String
}
if description.Valid {
item.Description = description.String
}
if content.Valid {
item.Content = content.String
}
if author.Valid {
item.Author = author.String
}
if pubDate.Valid {
item.PubDate = pubDate.Time
}
if updatedAt.Valid {
item.UpdatedAt = updatedAt.Time
}
items = append(items, item)
}
return items, rows.Err()
}
// SearchItems performs a full-text search on items
func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
rows, err := c.db.Query(`
SELECT i.id, i.feedUrl, i.guid, i.title, i.link, i.description, i.content, i.author, i.pubDate, i.discoveredAt, i.updatedAt
FROM items i
JOIN items_fts fts ON i.id = fts.rowid
WHERE items_fts MATCH ?
ORDER BY i.pubDate DESC
LIMIT ?
`, query, limit)
if err != nil {
return nil, err
}
defer rows.Close()
var items []*Item
for rows.Next() {
item := &Item{}
var guid, title, link, description, content, author sql.NullString
var pubDate, updatedAt sql.NullTime
if err := rows.Scan(
&item.ID, &item.FeedURL, &guid, &title, &link,
&description, &content, &author, &pubDate,
&item.DiscoveredAt, &updatedAt,
); err != nil {
continue
}
if guid.Valid {
item.GUID = guid.String
}
if title.Valid {
item.Title = title.String
}
if link.Valid {
item.Link = link.String
}
if description.Valid {
item.Description = description.String
}
if content.Valid {
item.Content = content.String
}
if author.Valid {
item.Author = author.String
}
if pubDate.Valid {
item.PubDate = pubDate.Time
}
if updatedAt.Valid {
item.UpdatedAt = updatedAt.Time
}
items = append(items, item)
}
return items, rows.Err()
}
// CleanupOldItems removes items older than 12 months
func (c *Crawler) CleanupOldItems() (int64, error) {
cutoff := time.Now().AddDate(-1, 0, 0) // 12 months ago
result, err := c.db.Exec(`
DELETE FROM items WHERE pubDate < ? AND pubDate IS NOT NULL
`, cutoff)
if err != nil { if err != nil {
return 0, err return 0, err
} }
defer iter.Close() return result.RowsAffected()
for iter.First(); iter.Valid(); iter.Next() {
count++
}
if err := iter.Error(); err != nil {
return 0, err
}
return count, nil
} }
// processFeed parses and stores a feed with full metadata // processFeed parses and stores a feed with full metadata
@@ -179,12 +699,13 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
LastModified: headers.Get("Last-Modified"), LastModified: headers.Get("Last-Modified"),
} }
// Parse feed-specific metadata // Parse feed-specific metadata and items
var items []*Item
switch feedType { switch feedType {
case "rss": case "rss":
c.parseRSSMetadata(body, feed) items = c.parseRSSMetadata(body, feed)
case "atom": case "atom":
c.parseAtomMetadata(body, feed) items = c.parseAtomMetadata(body, feed)
} }
// Calculate next crawl time // Calculate next crawl time
@@ -193,11 +714,17 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
if err := c.saveFeed(feed); err != nil { if err := c.saveFeed(feed); err != nil {
return return
} }
// Save items
if len(items) > 0 {
c.saveItems(items)
}
} }
// addFeed adds a discovered feed URL (not yet fetched) // addFeed adds a discovered feed URL (not yet fetched)
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) { func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
if strings.Contains(feedURL, "/comment") { // Skip comment, category, and article feeds
if skip, _ := shouldSkipFeed(feedURL); skip {
return return
} }
@@ -231,3 +758,141 @@ func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
return return
} }
} }
// CheckFeed performs a conditional request to check if a feed has been updated
// Returns: changed (bool), error
func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
atomic.AddInt32(&c.feedsChecked, 1)
// Try different scheme/www combinations since we store URLs without scheme
urlVariants := []string{
"https://" + feed.URL,
"http://" + feed.URL,
"https://www." + feed.URL,
"http://www." + feed.URL,
}
var resp *http.Response
var err error
var successURL string
for _, tryURL := range urlVariants {
req, reqErr := http.NewRequest("GET", tryURL, nil)
if reqErr != nil {
continue
}
req.Header.Set("User-Agent", c.UserAgent)
// Add conditional headers if we have them
if feed.ETag != "" {
req.Header.Set("If-None-Match", feed.ETag)
}
if feed.LastModified != "" {
req.Header.Set("If-Modified-Since", feed.LastModified)
}
resp, err = c.client.Do(req)
if err == nil {
successURL = tryURL
break
}
}
_ = successURL // May be used later for logging/debugging
// If no request succeeded, resp will be nil
if resp == nil {
if err == nil {
err = fmt.Errorf("all URL variants failed")
}
now := time.Now()
feed.LastCrawledAt = now
feed.ErrorCount++
feed.NoUpdate++
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.LastError = err.Error()
feed.LastErrorAt = now
feed.Status = "error"
c.saveFeed(feed)
return false, err
}
defer resp.Body.Close()
now := time.Now()
feed.LastCrawledAt = now
// 304 Not Modified - feed hasn't changed
if resp.StatusCode == http.StatusNotModified {
feed.NoUpdate++
// Adaptive backoff: 100s base + 100s per consecutive no-change
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.ErrorCount = 0
feed.LastError = ""
feed.Status = "active"
c.saveFeed(feed)
return false, nil
}
// Non-200 response
if resp.StatusCode != http.StatusOK {
feed.ErrorCount++
feed.NoUpdate++
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.LastError = resp.Status
feed.LastErrorAt = now
if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusGone {
feed.Status = "dead"
} else {
feed.Status = "error"
}
c.saveFeed(feed)
return false, nil
}
// 200 OK - feed has new content
bodyBytes, err := io.ReadAll(resp.Body)
if err != nil {
feed.ErrorCount++
feed.NoUpdate++
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.LastError = err.Error()
feed.LastErrorAt = now
feed.Status = "error"
c.saveFeed(feed)
return false, err
}
body := string(bodyBytes)
// Update cache headers
feed.ETag = resp.Header.Get("ETag")
feed.LastModified = resp.Header.Get("Last-Modified")
// Re-detect type and parse metadata
feedType := c.detectFeedType(body)
feed.Type = feedType
var items []*Item
switch feedType {
case "rss":
items = c.parseRSSMetadata(body, feed)
case "atom":
items = c.parseAtomMetadata(body, feed)
}
// Content changed - reset backoff
feed.NoUpdate = 0
feed.NextCrawlAt = now.Add(100 * time.Second)
feed.ErrorCount = 0
feed.LastError = ""
feed.Status = "active"
c.saveFeed(feed)
// Save items
if len(items) > 0 {
c.saveItems(items)
}
return true, nil
}
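// Illustrative schedule for the adaptive backoff above: after n consecutive checks
// with no change (or with errors), the next check is pushed out by 100*(n+1) seconds,
// and any content change resets the interval to 100 seconds.
//
//	NoUpdate after check    next check in
//	1                       200s
//	2                       300s
//	5                       600s
//	changed (reset to 0)    100s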
+26 -7
View File
@@ -6,7 +6,13 @@ import (
) )
func main() { func main() {
crawler, err := NewCrawler("feeds.db") // Ensure feeds directory exists
if err := os.MkdirAll("feeds", 0755); err != nil {
fmt.Fprintf(os.Stderr, "Error creating feeds directory: %v\n", err)
os.Exit(1)
}
crawler, err := NewCrawler("feeds/feeds.db")
if err != nil { if err != nil {
fmt.Fprintf(os.Stderr, "Error initializing crawler: %v\n", err) fmt.Fprintf(os.Stderr, "Error initializing crawler: %v\n", err)
os.Exit(1) os.Exit(1)
@@ -20,11 +26,24 @@ func main() {
} }
}() }()
// Import domains from vertices file (only adds new ones as "uncrawled") // Initialize stats in background (can be slow with large DBs)
crawler.ImportDomainsFromFile("vertices.txt.gz", 0) go crawler.UpdateStats()
// Crawl all uncrawled domains (runs continuously) // Start all loops independently
for { fmt.Println("Starting import, crawl, check, and stats loops...")
crawler.CrawlUncrawledDomains()
} // Import loop (background)
go crawler.ImportDomainsInBackground("vertices.txt.gz")
// Check loop (background)
go crawler.StartCheckLoop()
// Stats loop (background) - updates once per minute
go crawler.StartStatsLoop()
// Cleanup loop (background) - removes old items once per week
go crawler.StartCleanupLoop()
// Crawl loop (foreground - blocks forever)
crawler.StartCrawlLoop()
} }
+109 -16
View File
@@ -26,9 +26,14 @@ type RSSChannel struct {
} }
type RSSItem struct { type RSSItem struct {
Title string `xml:"title"` Title string `xml:"title"`
Link string `xml:"link"` Link string `xml:"link"`
PubDate string `xml:"pubDate"` GUID string `xml:"guid"`
Description string `xml:"description"`
Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
Author string `xml:"author"`
Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
PubDate string `xml:"pubDate"`
} }
// Atom structs for parsing // Atom structs for parsing
@@ -40,10 +45,23 @@ type AtomFeed struct {
} }
type AtomEntry struct { type AtomEntry struct {
Title string `xml:"title"` ID string `xml:"id"`
Links []AtomLink `xml:"link"` Title string `xml:"title"`
Updated string `xml:"updated"` Links []AtomLink `xml:"link"`
Published string `xml:"published"` Summary string `xml:"summary"`
Content AtomContent `xml:"content"`
Author AtomAuthor `xml:"author"`
Updated string `xml:"updated"`
Published string `xml:"published"`
}
type AtomContent struct {
Type string `xml:"type,attr"`
Value string `xml:",chardata"`
}
type AtomAuthor struct {
Name string `xml:"name"`
} }
type AtomLink struct { type AtomLink struct {
@@ -52,10 +70,10 @@ type AtomLink struct {
Type string `xml:"type,attr"` Type string `xml:"type,attr"`
} }
func (c *Crawler) parseRSSMetadata(body string, feed *Feed) { func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
var rss RSS var rss RSS
if err := xml.Unmarshal([]byte(body), &rss); err != nil { if err := xml.Unmarshal([]byte(body), &rss); err != nil {
return return nil
} }
ch := rss.Channel ch := rss.Channel
@@ -75,16 +93,47 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) {
} }
} }
// Analyze item dates // Parse items
now := time.Now()
var items []*Item
var dates []time.Time var dates []time.Time
for _, item := range ch.Items {
if item.PubDate != "" { for _, rssItem := range ch.Items {
if t, err := parseRSSDate(item.PubDate); err == nil { item := &Item{
FeedURL: feed.URL,
Title: rssItem.Title,
Link: rssItem.Link,
Description: rssItem.Description,
Content: rssItem.Content,
DiscoveredAt: now,
}
// Use GUID if available, otherwise use link
if rssItem.GUID != "" {
item.GUID = rssItem.GUID
} else if rssItem.Link != "" {
item.GUID = rssItem.Link
}
// Author: prefer author, fall back to dc:creator
if rssItem.Author != "" {
item.Author = rssItem.Author
} else if rssItem.Creator != "" {
item.Author = rssItem.Creator
}
// Parse pubDate
if rssItem.PubDate != "" {
if t, err := parseRSSDate(rssItem.PubDate); err == nil {
item.PubDate = t
dates = append(dates, t) dates = append(dates, t)
} }
} }
items = append(items, item)
} }
// Calculate date stats
if len(dates) > 0 { if len(dates) > 0 {
oldest, newest := dates[0], dates[0] oldest, newest := dates[0], dates[0]
for _, d := range dates { for _, d := range dates {
@@ -103,12 +152,14 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) {
feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1) feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
} }
} }
return items
} }
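// AvgPostFreqHrs here (and in parseAtomMetadata below) divides totalHours, computed
// just above this hunk, by len(dates)-1, i.e. by the number of gaps between dated
// items. Assuming totalHours is the oldest-to-newest span, 10 dated items spread
// over 36 hours give 36 / 9 = 4.0 hours between posts on average.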
func (c *Crawler) parseAtomMetadata(body string, feed *Feed) { func (c *Crawler) parseAtomMetadata(body string, feed *Feed) []*Item {
var atom AtomFeed var atom AtomFeed
if err := xml.Unmarshal([]byte(body), &atom); err != nil { if err := xml.Unmarshal([]byte(body), &atom); err != nil {
return return nil
} }
feed.Title = atom.Title feed.Title = atom.Title
@@ -131,20 +182,60 @@ func (c *Crawler) parseAtomMetadata(body string, feed *Feed) {
} }
} }
// Analyze entry dates // Parse entries
now := time.Now()
var items []*Item
var dates []time.Time var dates []time.Time
for _, entry := range atom.Entries { for _, entry := range atom.Entries {
item := &Item{
FeedURL: feed.URL,
Title: entry.Title,
Author: entry.Author.Name,
DiscoveredAt: now,
}
// Use ID as GUID
if entry.ID != "" {
item.GUID = entry.ID
}
// Get link (prefer alternate, fall back to first link)
for _, link := range entry.Links {
if link.Rel == "" || link.Rel == "alternate" {
item.Link = link.Href
break
}
}
if item.Link == "" && len(entry.Links) > 0 {
item.Link = entry.Links[0].Href
}
// Fall back to the link as GUID if the entry has no ID
if item.GUID == "" && item.Link != "" {
item.GUID = item.Link
}
// Summary/Content
item.Description = entry.Summary
item.Content = entry.Content.Value
// Parse dates
dateStr := entry.Updated dateStr := entry.Updated
if dateStr == "" { if dateStr == "" {
dateStr = entry.Published dateStr = entry.Published
} }
if dateStr != "" { if dateStr != "" {
if t, err := time.Parse(time.RFC3339, dateStr); err == nil { if t, err := time.Parse(time.RFC3339, dateStr); err == nil {
item.PubDate = t
dates = append(dates, t) dates = append(dates, t)
} }
} }
items = append(items, item)
} }
// Calculate date stats
if len(dates) > 0 { if len(dates) > 0 {
oldest, newest := dates[0], dates[0] oldest, newest := dates[0], dates[0]
for _, d := range dates { for _, d := range dates {
@@ -163,6 +254,8 @@ func (c *Crawler) parseAtomMetadata(body string, feed *Feed) {
feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1) feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
} }
} }
return items
} }
// parseRSSDate attempts to parse various RSS date formats // parseRSSDate attempts to parse various RSS date formats
+55
View File
@@ -0,0 +1,55 @@
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, monospace;
background: #0a0a0a;
color: #ffffff;
padding: 20px;
line-height: 1.6;
}
h1 { color: #ffffff; margin-bottom: 20px; font-size: 24px; }
h2 { color: #ffffff; margin: 20px 0 10px; font-size: 14px; text-transform: uppercase; letter-spacing: 1px; }
.grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin-bottom: 20px; }
.card {
background: #151515;
border: 1px solid #252525;
border-radius: 8px;
padding: 15px;
}
.stat-value { font-size: 32px; font-weight: bold; color: #ffffff; }
.stat-label { font-size: 12px; color: #ffffff; text-transform: uppercase; }
.stat-row { display: flex; justify-content: space-between; padding: 5px 0; border-bottom: 1px solid #202020; color: #ffffff; }
.stat-row:last-child { border-bottom: none; }
.progress-bar {
background: #202020;
border-radius: 4px;
height: 8px;
margin-top: 10px;
overflow: hidden;
}
.progress-fill {
background: linear-gradient(90deg, #00aa55, #00cc66);
height: 100%;
transition: width 0.3s;
}
table { width: 100%; border-collapse: collapse; color: #ffffff; }
th, td { text-align: left; padding: 8px; border-bottom: 1px solid #202020; }
th { color: #ffffff; font-size: 11px; text-transform: uppercase; }
td { font-size: 13px; color: #ffffff; }
.type-rss { color: #f90; }
.type-atom { color: #09f; }
.type-unknown { color: #ffffff; }
.url {
max-width: 400px;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
color: #4a9eff;
}
.time { color: #ffffff; font-size: 12px; }
.updated { color: #ffffff; font-size: 11px; text-align: right; margin-top: 20px; }
/* Search */
#searchInput:focus { outline: none; border-color: #0af; }
#searchInput::placeholder { color: #555; }
.search-host { margin-bottom: 10px; }
.search-feed:hover { background: #1a1a1a; }
+519
View File
@@ -0,0 +1,519 @@
function initDashboard() {
function commaFormat(n) {
return n.toString().replace(/\B(?=(\d{3})+(?!\d))/g, ',');
}
function escapeHtml(text) {
if (text == null) return '';
const div = document.createElement('div');
div.textContent = text;
return div.innerHTML;
}
// All domains state
let allDomainsOffset = 0;
let allDomainsLoading = false;
let allDomainsEnd = false;
let expandedDomain = null;
let expandedFeed = null;
const PAGE_SIZE = 100;
const PREFETCH_THRESHOLD = 100; // Intended prefetch threshold in domains; the scroll handler below currently triggers within 500px of the bottom
// Search state
let searchTimeout = null;
let isSearching = false;
async function loadMoreDomains() {
if (allDomainsLoading || allDomainsEnd) return;
allDomainsLoading = true;
const loadingEl = document.getElementById('allDomainsLoading');
loadingEl.style.display = 'block';
try {
const response = await fetch('/api/allDomains?offset=' + allDomainsOffset + '&limit=' + PAGE_SIZE);
const domains = await response.json();
if (!domains || domains.length === 0) {
allDomainsEnd = true;
loadingEl.style.display = 'none';
return;
}
const container = document.getElementById('allDomains');
domains.forEach(d => {
const row = document.createElement('div');
row.className = 'domain-row';
row.innerHTML =
'<div class="stat-row" style="cursor: pointer;">' +
'<span>' + escapeHtml(d.host) + '</span>' +
'<span>' + commaFormat(d.feeds_found) + '</span>' +
'</div>' +
'<div class="domain-feeds" style="display: none;"></div>';
row.querySelector('.stat-row').addEventListener('click', () => toggleDomainFeeds(d.host, row));
container.appendChild(row);
});
allDomainsOffset += domains.length;
loadingEl.style.display = 'none';
// If we got fewer than PAGE_SIZE, we've reached the end
if (domains.length < PAGE_SIZE) {
allDomainsEnd = true;
}
} catch (err) {
console.error('Failed to load domains:', err);
} finally {
allDomainsLoading = false;
}
}
async function toggleDomainFeeds(host, rowEl) {
const feedsDiv = rowEl.querySelector('.domain-feeds');
// Close previously expanded domain
if (expandedDomain && expandedDomain !== rowEl) {
expandedDomain.querySelector('.domain-feeds').style.display = 'none';
}
// Toggle current
if (feedsDiv.style.display === 'none') {
feedsDiv.style.display = 'block';
feedsDiv.innerHTML = '<div style="padding: 10px; color: #666;">Loading feeds...</div>';
expandedDomain = rowEl;
try {
const response = await fetch('/api/domainFeeds?host=' + encodeURIComponent(host));
const feeds = await response.json();
if (!feeds || feeds.length === 0) {
feedsDiv.innerHTML = '<div style="padding: 10px; color: #666;">No feeds found</div>';
} else {
feedsDiv.innerHTML = '';
feeds.forEach(f => {
const feedItem = document.createElement('div');
feedItem.className = 'feed-item';
feedItem.style.cssText = 'padding: 5px 10px; border-top: 1px solid #333; cursor: pointer;';
feedItem.innerHTML =
'<div class="feed-header">' +
'<div style="color: #0af;">' + escapeHtml(f.url) + '</div>' +
(f.title ? '<div style="color: #888; font-size: 0.9em;">' + escapeHtml(f.title) + '</div>' : '') +
'<div style="color: #666; font-size: 0.8em;">' + (f.type || 'unknown') + '</div>' +
'</div>' +
'<div class="feed-details" style="display: none;"></div>';
feedItem.querySelector('.feed-header').addEventListener('click', (e) => {
e.stopPropagation();
toggleFeedInfo(f.url, feedItem);
});
feedsDiv.appendChild(feedItem);
});
}
} catch (err) {
feedsDiv.innerHTML = '<div style="padding: 10px; color: #f66;">Error loading feeds</div>';
}
} else {
feedsDiv.style.display = 'none';
expandedDomain = null;
}
}
async function toggleFeedInfo(feedUrl, feedItemEl) {
const detailsDiv = feedItemEl.querySelector('.feed-details');
// Close previously expanded feed
if (expandedFeed && expandedFeed !== feedItemEl) {
expandedFeed.querySelector('.feed-details').style.display = 'none';
}
// Toggle current
if (detailsDiv.style.display === 'none') {
detailsDiv.style.display = 'block';
detailsDiv.innerHTML = '<div style="padding: 10px; color: #666;">Loading feed info...</div>';
expandedFeed = feedItemEl;
// Scroll the feed item to the top of the viewport
feedItemEl.scrollIntoView({ behavior: 'smooth', block: 'start' });
try {
// Fetch feed info and items in parallel
const [infoResponse, itemsResponse] = await Promise.all([
fetch('/api/feedInfo?url=' + encodeURIComponent(feedUrl)),
fetch('/api/feedItems?url=' + encodeURIComponent(feedUrl) + '&limit=50')
]);
const info = await infoResponse.json();
const items = await itemsResponse.json();
let html = '<div style="padding: 10px; background: #1a1a1a; margin-top: 5px; border-radius: 4px; font-size: 0.85em;">';
if (info.description) {
html += '<div style="margin-bottom: 8px; color: #aaa;">' + escapeHtml(info.description) + '</div>';
}
html += '<table style="width: 100%; color: #888;">';
if (info.siteUrl) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Site</td><td>' + escapeHtml(info.siteUrl) + '</td></tr>';
}
if (info.language) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Language</td><td>' + escapeHtml(info.language) + '</td></tr>';
}
if (info.status) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Status</td><td>' + escapeHtml(info.status) + '</td></tr>';
}
if (info.itemCount) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Items</td><td>' + commaFormat(info.itemCount) + '</td></tr>';
}
if (info.avgPostFreqHrs) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Avg Post Freq</td><td>' + info.avgPostFreqHrs.toFixed(1) + ' hrs</td></tr>';
}
if (info.ttlMinutes) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">TTL</td><td>' + info.ttlMinutes + ' min</td></tr>';
}
if (info.updatePeriod) {
let updateStr = info.updatePeriod;
if (info.updateFreq) updateStr += ' (' + info.updateFreq + ')';
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Update</td><td>' + escapeHtml(updateStr) + '</td></tr>';
}
if (info.lastBuildDate) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Last Build</td><td>' + escapeHtml(info.lastBuildDate) + '</td></tr>';
}
if (info.newestItemDate) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Newest Item</td><td>' + escapeHtml(info.newestItemDate) + '</td></tr>';
}
if (info.oldestItemDate) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Oldest Item</td><td>' + escapeHtml(info.oldestItemDate) + '</td></tr>';
}
if (info.discoveredAt) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Discovered</td><td>' + escapeHtml(info.discoveredAt) + '</td></tr>';
}
if (info.lastCrawledAt) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Last Crawled</td><td>' + escapeHtml(info.lastCrawledAt) + '</td></tr>';
}
if (info.errorCount > 0) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Errors</td><td style="color: #f66;">' + info.errorCount + '</td></tr>';
}
if (info.lastError) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Last Error</td><td style="color: #f66;">' + escapeHtml(info.lastError) + '</td></tr>';
}
html += '</table>';
// Display items
if (items && items.length > 0) {
html += '<div style="margin-top: 12px; border-top: 1px solid #333; padding-top: 8px;">';
html += '<div style="color: #666; margin-bottom: 6px; font-weight: bold;">Recent Items (' + items.length + ')</div>';
items.forEach(item => {
html += '<div style="padding: 6px 0; border-bottom: 1px solid #222;">';
// Title with link
if (item.title) {
if (item.link) {
html += '<div><a href="' + escapeHtml(item.link) + '" target="_blank" style="color: #0af; text-decoration: none;">' + escapeHtml(item.title) + '</a></div>';
} else {
html += '<div style="color: #ccc;">' + escapeHtml(item.title) + '</div>';
}
} else if (item.link) {
html += '<div><a href="' + escapeHtml(item.link) + '" target="_blank" style="color: #0af; text-decoration: none;">' + escapeHtml(item.link) + '</a></div>';
}
// Metadata line (date, author)
let meta = [];
if (item.pub_date) {
const date = new Date(item.pub_date);
meta.push(date.toLocaleDateString() + ' ' + date.toLocaleTimeString());
}
if (item.author) {
meta.push(escapeHtml(item.author));
}
if (meta.length > 0) {
html += '<div style="color: #666; font-size: 0.85em;">' + meta.join(' • ') + '</div>';
}
html += '</div>';
});
html += '</div>';
}
html += '</div>';
detailsDiv.innerHTML = html;
} catch (err) {
detailsDiv.innerHTML = '<div style="padding: 10px; color: #f66;">Error loading feed info</div>';
}
} else {
detailsDiv.style.display = 'none';
expandedFeed = null;
}
}
// Infinite scroll handler with prefetch (uses window scroll)
function setupInfiniteScroll() {
window.addEventListener('scroll', () => {
// Check if we're near the bottom of the page
const scrollBottom = window.scrollY + window.innerHeight;
const docHeight = document.documentElement.scrollHeight;
const remainingPixels = docHeight - scrollBottom;
// Prefetch when within 500px of the bottom
if (remainingPixels < 500) {
loadMoreDomains();
}
});
}
// Search functionality
function setupSearch() {
const searchInput = document.getElementById('searchInput');
const searchResults = document.getElementById('searchResults');
const domainsContainer = document.getElementById('allDomainsContainer');
if (!searchInput || !searchResults || !domainsContainer) {
console.error('Search elements not found');
return;
}
searchInput.addEventListener('input', (e) => {
const query = e.target.value.trim();
// Clear previous timeout
if (searchTimeout) {
clearTimeout(searchTimeout);
}
// If empty, show domains list
if (!query) {
searchResults.style.display = 'none';
domainsContainer.style.display = 'block';
isSearching = false;
return;
}
// Debounce search
searchTimeout = setTimeout(() => performSearch(query), 300);
});
// Handle Enter key
searchInput.addEventListener('keydown', (e) => {
if (e.key === 'Enter') {
const query = e.target.value.trim();
if (query) {
if (searchTimeout) clearTimeout(searchTimeout);
performSearch(query);
}
}
});
}
async function performSearch(query) {
const searchResults = document.getElementById('searchResults');
const domainsContainer = document.getElementById('allDomainsContainer');
isSearching = true;
domainsContainer.style.display = 'none';
searchResults.style.display = 'block';
searchResults.innerHTML = '<div style="padding: 20px; color: #666; text-align: center;">Searching...</div>';
try {
const response = await fetch('/api/search?q=' + encodeURIComponent(query) + '&limit=200');
const results = await response.json();
if (!results || results.length === 0) {
searchResults.innerHTML = '<div style="padding: 20px; color: #666; text-align: center;">No results found</div>';
return;
}
// Group results by host
const byHost = {};
results.forEach(r => {
const host = r.feed.source_host || 'unknown';
if (!byHost[host]) {
byHost[host] = [];
}
byHost[host].push(r);
});
// Render results
searchResults.innerHTML = '';
Object.keys(byHost).sort().forEach(host => {
const hostDiv = document.createElement('div');
hostDiv.className = 'search-host';
// Host header
const hostHeader = document.createElement('div');
hostHeader.className = 'stat-row';
hostHeader.style.cssText = 'cursor: pointer; background: #1a1a1a; padding: 8px; margin-bottom: 2px;';
hostHeader.innerHTML = '<span style="color: #0af;">' + escapeHtml(host) + '</span><span style="color: #666;">' + byHost[host].length + ' feed(s)</span>';
const feedsContainer = document.createElement('div');
feedsContainer.style.display = 'block';
byHost[host].forEach(result => {
const feedDiv = document.createElement('div');
feedDiv.className = 'search-feed';
feedDiv.style.cssText = 'padding: 8px 8px 8px 20px; border-bottom: 1px solid #222;';
// Feed header
let feedHtml = '<div style="color: #0af; cursor: pointer;" class="feed-url">' + escapeHtml(result.feed.url) + '</div>';
if (result.feed.title) {
feedHtml += '<div style="color: #aaa; font-size: 0.9em;">' + escapeHtml(result.feed.title) + '</div>';
}
if (result.feed.description) {
feedHtml += '<div style="color: #666; font-size: 0.85em; margin-top: 2px;">' + escapeHtml(result.feed.description.substring(0, 200)) + '</div>';
}
// Items
if (result.items && result.items.length > 0) {
feedHtml += '<div class="search-items" style="margin-top: 8px; padding-left: 10px; border-left: 2px solid #333;">';
result.items.forEach(item => {
feedHtml += '<div style="padding: 4px 0; border-bottom: 1px solid #1a1a1a;">';
if (item.title) {
if (item.link) {
feedHtml += '<a href="' + escapeHtml(item.link) + '" target="_blank" style="color: #6cf; text-decoration: none;">' + escapeHtml(item.title) + '</a>';
} else {
feedHtml += '<span style="color: #ccc;">' + escapeHtml(item.title) + '</span>';
}
}
let meta = [];
if (item.pub_date) {
meta.push(item.pub_date.substring(0, 10));
}
if (item.author) {
meta.push(escapeHtml(item.author));
}
if (meta.length > 0) {
feedHtml += '<div style="color: #555; font-size: 0.8em;">' + meta.join(' • ') + '</div>';
}
feedHtml += '</div>';
});
feedHtml += '</div>';
}
feedDiv.innerHTML = feedHtml;
// Click on feed URL to toggle full feed info
feedDiv.querySelector('.feed-url').addEventListener('click', () => {
toggleSearchFeedInfo(result.feed.url, feedDiv);
});
feedsContainer.appendChild(feedDiv);
});
hostHeader.addEventListener('click', () => {
feedsContainer.style.display = feedsContainer.style.display === 'none' ? 'block' : 'none';
});
hostDiv.appendChild(hostHeader);
hostDiv.appendChild(feedsContainer);
searchResults.appendChild(hostDiv);
});
} catch (err) {
console.error('Search failed:', err);
searchResults.innerHTML = '<div style="padding: 20px; color: #f66; text-align: center;">Search failed: ' + escapeHtml(err.message) + '</div>';
}
}
async function toggleSearchFeedInfo(feedUrl, feedDiv) {
let detailsDiv = feedDiv.querySelector('.feed-details-expanded');
if (detailsDiv) {
detailsDiv.remove();
return;
}
detailsDiv = document.createElement('div');
detailsDiv.className = 'feed-details-expanded';
detailsDiv.style.cssText = 'padding: 10px; background: #111; margin-top: 8px; border-radius: 4px;';
detailsDiv.innerHTML = '<div style="color: #666;">Loading feed info...</div>';
feedDiv.appendChild(detailsDiv);
try {
const [infoResponse, itemsResponse] = await Promise.all([
fetch('/api/feedInfo?url=' + encodeURIComponent(feedUrl)),
fetch('/api/feedItems?url=' + encodeURIComponent(feedUrl) + '&limit=20')
]);
const info = await infoResponse.json();
const items = await itemsResponse.json();
let html = '<table style="width: 100%; color: #888; font-size: 0.85em;">';
if (info.siteUrl) html += '<tr><td style="color: #555; padding: 2px 8px 2px 0;">Site</td><td>' + escapeHtml(info.siteUrl) + '</td></tr>';
if (info.language) html += '<tr><td style="color: #555; padding: 2px 8px 2px 0;">Language</td><td>' + escapeHtml(info.language) + '</td></tr>';
if (info.status) html += '<tr><td style="color: #555; padding: 2px 8px 2px 0;">Status</td><td>' + escapeHtml(info.status) + '</td></tr>';
if (info.itemCount) html += '<tr><td style="color: #555; padding: 2px 8px 2px 0;">Items</td><td>' + commaFormat(info.itemCount) + '</td></tr>';
if (info.avgPostFreqHrs) html += '<tr><td style="color: #555; padding: 2px 8px 2px 0;">Avg Freq</td><td>' + info.avgPostFreqHrs.toFixed(1) + ' hrs</td></tr>';
if (info.newestItemDate) html += '<tr><td style="color: #555; padding: 2px 8px 2px 0;">Newest</td><td>' + escapeHtml(info.newestItemDate) + '</td></tr>';
html += '</table>';
if (items && items.length > 0) {
html += '<div style="margin-top: 10px; border-top: 1px solid #222; padding-top: 8px;">';
html += '<div style="color: #555; margin-bottom: 4px;">Recent Items (' + items.length + ')</div>';
items.forEach(item => {
html += '<div style="padding: 3px 0; border-bottom: 1px solid #1a1a1a;">';
if (item.title && item.link) {
html += '<a href="' + escapeHtml(item.link) + '" target="_blank" style="color: #0af; text-decoration: none; font-size: 0.9em;">' + escapeHtml(item.title) + '</a>';
} else if (item.title) {
html += '<span style="color: #aaa; font-size: 0.9em;">' + escapeHtml(item.title) + '</span>';
}
html += '</div>';
});
html += '</div>';
}
detailsDiv.innerHTML = html;
} catch (err) {
detailsDiv.innerHTML = '<div style="color: #f66;">Failed to load feed info</div>';
}
}
async function updateStats() {
try {
const response = await fetch('/api/stats');
const stats = await response.json();
// Update domain stats
document.getElementById('totalDomains').textContent = commaFormat(stats.total_domains);
document.getElementById('checkedDomains').textContent = commaFormat(stats.checked_domains);
document.getElementById('uncheckedDomains').textContent = commaFormat(stats.unchecked_domains);
document.getElementById('crawlRate').textContent = commaFormat(stats.crawl_rate);
document.getElementById('checkRate').textContent = commaFormat(stats.check_rate);
// Update progress bar
const progress = stats.total_domains > 0
? (stats.checked_domains * 100 / stats.total_domains).toFixed(1)
: 0;
document.getElementById('crawlProgress').style.width = progress + '%';
// Update feed stats
document.getElementById('totalFeeds').textContent = commaFormat(stats.total_feeds);
document.getElementById('rssFeeds').textContent = commaFormat(stats.rss_feeds);
document.getElementById('atomFeeds').textContent = commaFormat(stats.atom_feeds);
document.getElementById('unknownFeeds').textContent = commaFormat(stats.unknown_feeds);
// Update timestamp
const updatedAt = new Date(stats.updated_at);
document.getElementById('updatedAt').textContent = 'Last updated: ' +
updatedAt.toISOString().replace('T', ' ').substring(0, 19);
} catch (err) {
console.error('Failed to update stats:', err);
}
}
// Initialize
try {
setupSearch();
} catch (e) {
console.error('setupSearch failed:', e);
}
setupInfiniteScroll();
loadMoreDomains();
updateStats();
setInterval(updateStats, 1000);
}
window.onload = initDashboard;