Add Docker support and refactor data layer

primal
2026-01-26 16:02:05 -05:00
parent 398e7b3969
commit 143807378f
12 changed files with 2642 additions and 518 deletions
+8
@@ -0,0 +1,8 @@
1440.news
1440.db
feeds/
*.gz
.git
.gitignore
.claude
CLAUDE.md
+1
@@ -3,3 +3,4 @@ go.*
*.gz
feeds/
feeds.db/
1440.db
+37
@@ -0,0 +1,37 @@
FROM golang:1.24-alpine AS builder
WORKDIR /app
# Install build dependencies
RUN apk add --no-cache gcc musl-dev
# Copy go mod files first for layer caching
COPY go.mod go.sum ./
RUN go mod download
# Copy source code
COPY *.go ./
COPY static/ ./static/
# Build the binary
RUN CGO_ENABLED=1 go build -o 1440.news .
# Runtime stage
FROM alpine:latest
WORKDIR /app
# Install runtime dependencies
RUN apk add --no-cache ca-certificates tzdata
# Copy binary from builder
COPY --from=builder /app/1440.news .
COPY --from=builder /app/static ./static
# Create feeds directory
RUN mkdir -p feeds
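# Persistence note (an assumption; the database path is configured outside this
# file): the SQLite database and feeds/ are written under /app at runtime, so
# mount a volume there if the data should survive container restarts.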
# Expose dashboard port
EXPOSE 4321
CMD ["./1440.news"]
+106 -65
@@ -1,9 +1,9 @@
package main
import (
"database/sql"
"fmt"
"io"
"math/rand"
"net/http"
"runtime"
"strings"
@@ -11,26 +11,33 @@ import (
"sync/atomic"
"time"
"github.com/cockroachdb/pebble"
"golang.org/x/net/html"
)
type Crawler struct {
MaxDepth int
MaxPagesPerHost int
Timeout time.Duration
UserAgent string
visited sync.Map
feedsMu sync.Mutex
client *http.Client
hostsProcessed int32
db *pebble.DB
MaxDepth int
MaxPagesPerHost int
Timeout time.Duration
UserAgent string
visited sync.Map
feedsMu sync.Mutex
client *http.Client
hostsProcessed int32
feedsChecked int32
startTime time.Time
db *sql.DB
displayedCrawlRate int
displayedCheckRate int
domainsImported int32
cachedStats *DashboardStats
cachedAllDomains []DomainStat
statsMu sync.RWMutex
}
func NewCrawler(dbPath string) (*Crawler, error) {
db, err := pebble.Open(dbPath, &pebble.Options{})
db, err := OpenDatabase(dbPath)
if err != nil {
return nil, fmt.Errorf("failed to open pebble db: %v", err)
return nil, fmt.Errorf("failed to open database: %v", err)
}
return &Crawler{
@@ -38,6 +45,7 @@ func NewCrawler(dbPath string) (*Crawler, error) {
MaxPagesPerHost: 10,
Timeout: 10 * time.Second,
UserAgent: "FeedCrawler/1.0",
startTime: time.Now(),
db: db,
client: &http.Client{
Timeout: 10 * time.Second,
@@ -58,87 +66,121 @@ func (c *Crawler) Close() error {
return nil
}
// CrawlUncrawledDomains fetches uncrawled domains and crawls them
func (c *Crawler) CrawlUncrawledDomains() error {
domains, err := c.GetUncrawledDomains()
if err != nil {
return fmt.Errorf("failed to get uncrawled domains: %v", err)
// StartStatsLoop updates cached stats once per minute
func (c *Crawler) StartStatsLoop() {
for {
c.UpdateStats()
time.Sleep(1 * time.Minute)
}
}
if len(domains) == 0 {
return nil
// StartCleanupLoop runs item cleanup once per week
func (c *Crawler) StartCleanupLoop() {
for {
deleted, err := c.CleanupOldItems()
if err != nil {
fmt.Printf("Cleanup error: %v\n", err)
} else if deleted > 0 {
fmt.Printf("Cleanup: removed %d old items\n", deleted)
}
time.Sleep(7 * 24 * time.Hour)
}
}
// Shuffle for randomized crawling
rand.Shuffle(len(domains), func(i, j int) {
domains[i], domains[j] = domains[j], domains[i]
})
numWorkers := runtime.NumCPU() - 1
// StartCrawlLoop runs the domain crawling loop independently
func (c *Crawler) StartCrawlLoop() {
numWorkers := runtime.NumCPU()
if numWorkers < 1 {
numWorkers = 1
}
type crawlResult struct {
host string
feedsFound int
lastError string
}
domainChan := make(chan *Domain, numWorkers*2)
resultChan := make(chan crawlResult, numWorkers*2)
var wg sync.WaitGroup
// Buffered channel for domain work
workChan := make(chan *Domain, 256)
// Start workers
for i := 0; i < numWorkers; i++ {
wg.Add(1)
go func() {
defer wg.Done()
for domain := range domainChan {
for domain := range workChan {
feedsFound, crawlErr := c.crawlHost(domain.Host)
errStr := ""
if crawlErr != nil {
errStr = crawlErr.Error()
}
resultChan <- crawlResult{
host: domain.Host,
feedsFound: feedsFound,
lastError: errStr,
if err := c.markDomainCrawled(domain.Host, feedsFound, errStr); err != nil {
fmt.Printf("Error marking domain %s as crawled: %v\n", domain.Host, err)
}
}
}()
}
// Start result processor
done := make(chan bool)
go func() {
for result := range resultChan {
if err := c.markDomainCrawled(result.host, result.feedsFound, result.lastError); err != nil {
fmt.Printf("Error marking domain %s as crawled: %v\n", result.host, err)
}
const fetchSize = 100
for {
domains, err := c.GetUncheckedDomainsRandom(fetchSize)
if err != nil {
fmt.Printf("Error fetching domains: %v\n", err)
}
done <- true
}()
// Send domains to workers
for _, domain := range domains {
domainChan <- domain
if len(domains) == 0 {
c.displayedCrawlRate = 0
time.Sleep(1 * time.Second)
continue
}
fmt.Printf("%s crawl: %d domains to check\n", time.Now().Format("15:04:05"), len(domains))
for _, domain := range domains {
workChan <- domain
}
time.Sleep(1 * time.Second)
}
}
// StartCheckLoop runs the feed checking loop independently
func (c *Crawler) StartCheckLoop() {
numWorkers := runtime.NumCPU()
if numWorkers < 1 {
numWorkers = 1
}
close(domainChan)
wg.Wait()
close(resultChan)
<-done
// Buffered channel for feed work
workChan := make(chan *Feed, 256)
return nil
// Start workers
for i := 0; i < numWorkers; i++ {
go func() {
for feed := range workChan {
c.CheckFeed(feed)
}
}()
}
const fetchSize = 100
for {
feeds, err := c.GetFeedsDueForCheck(fetchSize)
if err != nil {
fmt.Printf("Error fetching feeds: %v\n", err)
}
if len(feeds) == 0 {
c.displayedCheckRate = 0
time.Sleep(1 * time.Second)
continue
}
fmt.Printf("%s check: %d feeds to check\n", time.Now().Format("15:04:05"), len(feeds))
for _, feed := range feeds {
workChan <- feed
}
time.Sleep(1 * time.Second)
}
}
func (c *Crawler) crawlHost(host string) (feedsFound int, err error) {
atomic.AddInt32(&c.hostsProcessed, 1)
// Count feeds before crawling
initialCount, _ := c.GetFeedCount()
localVisited := make(map[string]bool)
pagesVisited := 0
@@ -148,9 +190,8 @@ func (c *Crawler) crawlHost(host string) (feedsFound int, err error) {
c.crawlPage("http://"+host, host, 0, localVisited, &pagesVisited)
}
// Count feeds after crawling
finalCount, _ := c.GetFeedCount()
feedsFound = finalCount - initialCount
// Count feeds found for this specific host
feedsFound, _ = c.GetFeedCountByHost(host)
if pagesVisited == 0 {
return feedsFound, fmt.Errorf("could not connect")
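The stats, cleanup, crawl, and check loops above all run forever and are meant to be started as goroutines. main.go is not part of this excerpt, so the following is only a sketch of how they might be wired together; the database path and listen address are assumptions taken from the ignore files and the Dockerfile's EXPOSE line, not from the real entry point.

func runCrawler() error {
	c, err := NewCrawler("1440.db") // assumed path; matches the ignore entries above
	if err != nil {
		return err
	}
	defer c.Close()

	// Optionally stream new domains in from a vertices file (path is hypothetical).
	// c.ImportDomainsInBackground("feeds/vertices.txt.gz")

	go c.StartStatsLoop()   // refresh cached dashboard stats once a minute
	go c.StartCleanupLoop() // weekly removal of old items
	go c.StartCrawlLoop()   // discover feeds on unchecked domains
	go c.StartCheckLoop()   // re-fetch feeds that are due for a check

	// Blocks; the Dockerfile exposes 4321 for this dashboard.
	return c.StartDashboard(":4321")
}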
+559 -239
@@ -1,21 +1,20 @@
package main
import (
"database/sql"
"encoding/json"
"fmt"
"html/template"
"net/http"
"sort"
"time"
)
// DashboardStats holds all statistics for the dashboard
type DashboardStats struct {
// Domain stats
TotalDomains int `json:"total_domains"`
CrawledDomains int `json:"crawled_domains"`
UncrawledDomains int `json:"uncrawled_domains"`
ErrorDomains int `json:"error_domains"`
TotalDomains int `json:"total_domains"`
CheckedDomains int `json:"checked_domains"`
UncheckedDomains int `json:"unchecked_domains"`
// Feed stats
TotalFeeds int `json:"total_feeds"`
@@ -25,16 +24,8 @@ type DashboardStats struct {
// Crawl progress
HostsProcessed int32 `json:"hosts_processed"`
CrawlRate float64 `json:"crawl_rate"` // domains per minute
// Top TLDs by feed count
TopTLDs []TLDStat `json:"top_tlds"`
// Recent feeds
RecentFeeds []RecentFeed `json:"recent_feeds"`
// Top domains by feed count
TopDomains []DomainStat `json:"top_domains"`
CrawlRate int `json:"crawl_rate"` // crawls per minute
CheckRate int `json:"check_rate"` // feed checks per minute
// Timing
UpdatedAt time.Time `json:"updated_at"`
@@ -57,13 +48,107 @@ type DomainStat struct {
FeedsFound int `json:"feeds_found"`
}
// GetDashboardStats collects all statistics for the dashboard
// commaFormat formats an integer with comma separators
func commaFormat(n int) string {
s := fmt.Sprintf("%d", n)
if len(s) <= 3 {
return s
}
var result []byte
for i, c := range s {
if i > 0 && (len(s)-i)%3 == 0 {
result = append(result, ',')
}
result = append(result, byte(c))
}
return string(result)
}
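// For example, commaFormat(1234567) returns "1,234,567". Negative input is
// handled by the "comma" template func further down, which prefixes the sign
// before calling commaFormat.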
// UpdateStats recalculates and caches dashboard statistics
func (c *Crawler) UpdateStats() {
fmt.Println("UpdateStats: calculating stats...")
stats, err := c.calculateStats()
if err != nil {
fmt.Printf("UpdateStats: error calculating stats: %v\n", err)
return
}
// Cache all domains with feeds (runs in background, so slow query is OK)
fmt.Println("UpdateStats: fetching all domains...")
allDomains := c.fetchAllDomainsFromDB()
fmt.Printf("UpdateStats: got %d domains\n", len(allDomains))
c.statsMu.Lock()
c.cachedStats = stats
c.cachedAllDomains = allDomains
c.statsMu.Unlock()
fmt.Println("UpdateStats: complete")
}
func (c *Crawler) fetchAllDomainsFromDB() []DomainStat {
rows, err := c.db.Query(`
SELECT tld, sourceHost, COUNT(*) as cnt FROM feeds
GROUP BY tld, sourceHost
ORDER BY tld, sourceHost
`)
if err != nil {
fmt.Printf("fetchAllDomainsFromDB error: %v\n", err)
return nil
}
defer rows.Close()
var domains []DomainStat
for rows.Next() {
var ds DomainStat
var tld string
if err := rows.Scan(&tld, &ds.Host, &ds.FeedsFound); err != nil {
continue
}
domains = append(domains, ds)
}
return domains
}
// GetDashboardStats returns cached statistics (returns empty stats if not yet cached)
func (c *Crawler) GetDashboardStats() (*DashboardStats, error) {
c.statsMu.RLock()
stats := c.cachedStats
c.statsMu.RUnlock()
if stats != nil {
return stats, nil
}
// Return empty stats while background calculation runs (don't block HTTP requests)
return &DashboardStats{UpdatedAt: time.Now()}, nil
}
// calculateStats collects all statistics for the dashboard
func (c *Crawler) calculateStats() (*DashboardStats, error) {
stats := &DashboardStats{
UpdatedAt: time.Now(),
UpdatedAt: time.Now(),
HostsProcessed: c.hostsProcessed,
}
// Calculate crawl rate (crawls per minute), smoothed by +/-1 per update
elapsed := time.Since(c.startTime).Minutes()
if elapsed > 0 {
actualRate := int(float64(c.hostsProcessed) / elapsed)
if actualRate > c.displayedCrawlRate {
c.displayedCrawlRate++
} else if actualRate < c.displayedCrawlRate {
c.displayedCrawlRate--
}
stats.CrawlRate = c.displayedCrawlRate
// Calculate check rate (feed checks per minute), smoothed by +/-1 per update
actualCheckRate := int(float64(c.feedsChecked) / elapsed)
if actualCheckRate > c.displayedCheckRate {
c.displayedCheckRate++
} else if actualCheckRate < c.displayedCheckRate {
c.displayedCheckRate--
}
stats.CheckRate = c.displayedCheckRate
}
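// Since UpdateStats runs once per minute (see StartStatsLoop), the displayed
// rates converge on the true averages by at most 1 per minute; for example, a
// jump from 100 to 120 crawls/min takes roughly 20 minutes to show in full.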
// Get domain stats
if err := c.collectDomainStats(stats); err != nil {
return nil, err
@@ -78,148 +163,455 @@ func (c *Crawler) GetDashboardStats() (*DashboardStats, error) {
}
func (c *Crawler) collectDomainStats(stats *DashboardStats) error {
iter, err := c.db.NewIter(nil)
// Use MAX(rowid) for fast approximate total count
err := c.db.QueryRow("SELECT COALESCE(MAX(rowid), 0) FROM domains").Scan(&stats.TotalDomains)
if err != nil {
return err
}
defer iter.Close()
domainFeeds := make(map[string]int)
// Single query to get all status counts (one index scan instead of three)
rows, err := c.db.Query("SELECT status, COUNT(*) FROM domains GROUP BY status")
if err != nil {
return err
}
defer rows.Close()
for iter.SeekGE([]byte("domain:")); iter.Valid(); iter.Next() {
key := string(iter.Key())
if len(key) < 7 || key[:7] != "domain:" {
break
}
var domain Domain
if err := json.Unmarshal(iter.Value(), &domain); err != nil {
for rows.Next() {
var status string
var count int
if err := rows.Scan(&status, &count); err != nil {
continue
}
stats.TotalDomains++
switch domain.Status {
case "crawled":
stats.CrawledDomains++
if domain.FeedsFound > 0 {
domainFeeds[domain.Host] = domain.FeedsFound
}
case "uncrawled":
stats.UncrawledDomains++
case "error":
stats.ErrorDomains++
switch status {
case "checked":
stats.CheckedDomains = count
case "unchecked":
stats.UncheckedDomains = count
}
}
// Top domains by feed count
type kv struct {
Host string
Count int
}
var sorted []kv
for h, c := range domainFeeds {
sorted = append(sorted, kv{h, c})
}
sort.Slice(sorted, func(i, j int) bool {
return sorted[i].Count > sorted[j].Count
})
for i := 0; i < len(sorted) && i < 10; i++ {
stats.TopDomains = append(stats.TopDomains, DomainStat{
Host: sorted[i].Host,
FeedsFound: sorted[i].Count,
})
if err := rows.Err(); err != nil {
return err
}
return iter.Error()
return rows.Err()
}
func (c *Crawler) collectFeedStats(stats *DashboardStats) error {
iter, err := c.db.NewIter(nil)
// Use MAX(rowid) for fast approximate total count
err := c.db.QueryRow("SELECT COALESCE(MAX(rowid), 0) FROM feeds").Scan(&stats.TotalFeeds)
if err != nil {
return err
}
defer iter.Close()
tldCounts := make(map[string]int)
var recentFeeds []RecentFeed
// Single query to get all type counts (one index scan instead of three)
rows, err := c.db.Query("SELECT type, COUNT(*) FROM feeds GROUP BY type")
if err != nil {
return err
}
defer rows.Close()
for iter.SeekGE([]byte("feed:")); iter.Valid(); iter.Next() {
key := string(iter.Key())
if len(key) < 5 || key[:5] != "feed:" {
break
}
var feed Feed
if err := json.Unmarshal(iter.Value(), &feed); err != nil {
for rows.Next() {
var feedType sql.NullString
var count int
if err := rows.Scan(&feedType, &count); err != nil {
continue
}
stats.TotalFeeds++
switch feed.Type {
switch feedType.String {
case "rss":
stats.RSSFeeds++
stats.RSSFeeds = count
case "atom":
stats.AtomFeeds++
stats.AtomFeeds = count
default:
stats.UnknownFeeds++
stats.UnknownFeeds += count
}
if feed.TLD != "" {
tldCounts[feed.TLD]++
}
recentFeeds = append(recentFeeds, RecentFeed{
URL: feed.URL,
Title: feed.Title,
Type: feed.Type,
DiscoveredAt: feed.DiscoveredAt,
})
}
// Top TLDs
type kv struct {
TLD string
Count int
}
var sortedTLDs []kv
for t, c := range tldCounts {
sortedTLDs = append(sortedTLDs, kv{t, c})
}
sort.Slice(sortedTLDs, func(i, j int) bool {
return sortedTLDs[i].Count > sortedTLDs[j].Count
})
for i := 0; i < len(sortedTLDs) && i < 10; i++ {
stats.TopTLDs = append(stats.TopTLDs, TLDStat{
TLD: sortedTLDs[i].TLD,
Count: sortedTLDs[i].Count,
})
}
// Recent feeds (last 20, sorted by discovery time)
sort.Slice(recentFeeds, func(i, j int) bool {
return recentFeeds[i].DiscoveredAt.After(recentFeeds[j].DiscoveredAt)
})
if len(recentFeeds) > 20 {
recentFeeds = recentFeeds[:20]
}
stats.RecentFeeds = recentFeeds
return iter.Error()
return rows.Err()
}
// StartDashboard starts the web dashboard server
func (c *Crawler) StartDashboard(addr string) error {
http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
http.HandleFunc("/dashboard", func(w http.ResponseWriter, r *http.Request) {
c.handleDashboard(w, r)
})
http.HandleFunc("/api/stats", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIStats(w, r)
})
http.HandleFunc("/api/allDomains", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIAllDomains(w, r)
})
http.HandleFunc("/api/domainFeeds", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIDomainFeeds(w, r)
})
http.HandleFunc("/api/feedInfo", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIFeedInfo(w, r)
})
http.HandleFunc("/api/feedItems", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIFeedItems(w, r)
})
http.HandleFunc("/api/search", func(w http.ResponseWriter, r *http.Request) {
c.handleAPISearch(w, r)
})
http.HandleFunc("/static/", func(w http.ResponseWriter, r *http.Request) {
http.StripPrefix("/static/", http.FileServer(http.Dir("static"))).ServeHTTP(w, r)
})
fmt.Printf("Dashboard running at http://%s\n", addr)
return http.ListenAndServe(addr, nil)
}
func (c *Crawler) handleAPIAllDomains(w http.ResponseWriter, r *http.Request) {
offset := 0
limit := 100
if o := r.URL.Query().Get("offset"); o != "" {
fmt.Sscanf(o, "%d", &offset)
}
if l := r.URL.Query().Get("limit"); l != "" {
fmt.Sscanf(l, "%d", &limit)
if limit > 100 {
limit = 100
}
}
// Serve from cache (updated once per minute in background)
c.statsMu.RLock()
cached := c.cachedAllDomains
c.statsMu.RUnlock()
var domains []DomainStat
if cached != nil && offset < len(cached) {
end := offset + limit
if end > len(cached) {
end = len(cached)
}
domains = cached[offset:end]
}
if domains == nil {
domains = []DomainStat{}
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(domains)
}
func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) {
host := r.URL.Query().Get("host")
if host == "" {
http.Error(w, "host parameter required", http.StatusBadRequest)
return
}
rows, err := c.db.Query(`
SELECT url, title, type FROM feeds
WHERE sourceHost = ?
ORDER BY url ASC
LIMIT 1000
`, host)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
defer rows.Close()
type FeedInfo struct {
URL string `json:"url"`
Title string `json:"title"`
Type string `json:"type"`
}
var feeds []FeedInfo
for rows.Next() {
var f FeedInfo
var title sql.NullString
if err := rows.Scan(&f.URL, &title, &f.Type); err != nil {
continue
}
if title.Valid {
f.Title = title.String
}
feeds = append(feeds, f)
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(feeds)
}
func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) {
feedURL := r.URL.Query().Get("url")
if feedURL == "" {
http.Error(w, "url parameter required", http.StatusBadRequest)
return
}
type FeedDetails struct {
URL string `json:"url"`
Type string `json:"type,omitempty"`
Title string `json:"title,omitempty"`
Description string `json:"description,omitempty"`
Language string `json:"language,omitempty"`
SiteURL string `json:"siteUrl,omitempty"`
DiscoveredAt string `json:"discoveredAt,omitempty"`
LastCrawledAt string `json:"lastCrawledAt,omitempty"`
LastBuildDate string `json:"lastBuildDate,omitempty"`
TTLMinutes int `json:"ttlMinutes,omitempty"`
UpdatePeriod string `json:"updatePeriod,omitempty"`
UpdateFreq int `json:"updateFreq,omitempty"`
Status string `json:"status,omitempty"`
ErrorCount int `json:"errorCount,omitempty"`
LastError string `json:"lastError,omitempty"`
ItemCount int `json:"itemCount,omitempty"`
AvgPostFreqHrs float64 `json:"avgPostFreqHrs,omitempty"`
OldestItemDate string `json:"oldestItemDate,omitempty"`
NewestItemDate string `json:"newestItemDate,omitempty"`
}
var f FeedDetails
var title, description, language, siteUrl, lastCrawledAt, lastBuildDate sql.NullString
var updatePeriod, status, lastError, oldestItemDate, newestItemDate sql.NullString
var ttlMinutes, updateFreq, errorCount, itemCount sql.NullInt64
var avgPostFreqHrs sql.NullFloat64
err := c.db.QueryRow(`
SELECT url, type, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, lastBuildDate,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate
FROM feeds WHERE url = ?
`, feedURL).Scan(
&f.URL, &f.Type, &title, &description, &language, &siteUrl,
&f.DiscoveredAt, &lastCrawledAt, &lastBuildDate,
&ttlMinutes, &updatePeriod, &updateFreq,
&status, &errorCount, &lastError,
&itemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
)
if err == sql.ErrNoRows {
http.Error(w, "feed not found", http.StatusNotFound)
return
}
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
if title.Valid {
f.Title = title.String
}
if description.Valid {
f.Description = description.String
}
if language.Valid {
f.Language = language.String
}
if siteUrl.Valid {
f.SiteURL = siteUrl.String
}
if lastCrawledAt.Valid {
f.LastCrawledAt = lastCrawledAt.String
}
if lastBuildDate.Valid {
f.LastBuildDate = lastBuildDate.String
}
if ttlMinutes.Valid {
f.TTLMinutes = int(ttlMinutes.Int64)
}
if updatePeriod.Valid {
f.UpdatePeriod = updatePeriod.String
}
if updateFreq.Valid {
f.UpdateFreq = int(updateFreq.Int64)
}
if status.Valid {
f.Status = status.String
}
if errorCount.Valid {
f.ErrorCount = int(errorCount.Int64)
}
if lastError.Valid {
f.LastError = lastError.String
}
if itemCount.Valid {
f.ItemCount = int(itemCount.Int64)
}
if avgPostFreqHrs.Valid {
f.AvgPostFreqHrs = avgPostFreqHrs.Float64
}
if oldestItemDate.Valid {
f.OldestItemDate = oldestItemDate.String
}
if newestItemDate.Valid {
f.NewestItemDate = newestItemDate.String
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(f)
}
func (c *Crawler) handleAPIFeedItems(w http.ResponseWriter, r *http.Request) {
feedURL := r.URL.Query().Get("url")
if feedURL == "" {
http.Error(w, "url parameter required", http.StatusBadRequest)
return
}
limit := 50
if l := r.URL.Query().Get("limit"); l != "" {
fmt.Sscanf(l, "%d", &limit)
if limit > 100 {
limit = 100
}
}
items, err := c.GetItemsByFeed(feedURL, limit)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
if items == nil {
items = []*Item{}
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(items)
}
// SearchResult represents a search result with feed and matching items
type SearchResult struct {
Feed SearchFeed `json:"feed"`
Items []SearchItem `json:"items"`
}
type SearchFeed struct {
URL string `json:"url"`
Title string `json:"title"`
Description string `json:"description"`
Type string `json:"type"`
SourceHost string `json:"source_host"`
Status string `json:"status"`
}
type SearchItem struct {
ID int64 `json:"id"`
Title string `json:"title"`
Link string `json:"link"`
Description string `json:"description"`
Author string `json:"author"`
PubDate string `json:"pub_date"`
}
func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
query := r.URL.Query().Get("q")
if query == "" {
http.Error(w, "q parameter required", http.StatusBadRequest)
return
}
limit := 100
if l := r.URL.Query().Get("limit"); l != "" {
fmt.Sscanf(l, "%d", &limit)
if limit > 500 {
limit = 500
}
}
// Results map: feedURL -> SearchResult
results := make(map[string]*SearchResult)
// Search feeds
feedRows, err := c.db.Query(`
SELECT f.url, f.title, f.description, f.type, f.sourceHost, f.status
FROM feeds f
JOIN feeds_fts fts ON f.rowid = fts.rowid
WHERE feeds_fts MATCH ?
LIMIT ?
`, query, limit)
if err == nil {
defer feedRows.Close()
for feedRows.Next() {
var url string
var title, description, feedType, sourceHost, status sql.NullString
if err := feedRows.Scan(&url, &title, &description, &feedType, &sourceHost, &status); err != nil {
continue
}
results[url] = &SearchResult{
Feed: SearchFeed{
URL: url,
Title: title.String,
Description: description.String,
Type: feedType.String,
SourceHost: sourceHost.String,
Status: status.String,
},
Items: []SearchItem{},
}
}
}
// Search items
itemRows, err := c.db.Query(`
SELECT i.id, i.feedUrl, i.title, i.link, i.description, i.author, i.pubDate
FROM items i
JOIN items_fts fts ON i.id = fts.rowid
WHERE items_fts MATCH ?
ORDER BY i.pubDate DESC
LIMIT ?
`, query, limit)
if err == nil {
defer itemRows.Close()
for itemRows.Next() {
var id int64
var feedUrl string
var title, link, description, author, pubDate sql.NullString
if err := itemRows.Scan(&id, &feedUrl, &title, &link, &description, &author, &pubDate); err != nil {
continue
}
item := SearchItem{
ID: id,
Title: title.String,
Link: link.String,
Description: description.String,
Author: author.String,
PubDate: pubDate.String,
}
// Add to existing result or create new one
if result, exists := results[feedUrl]; exists {
result.Items = append(result.Items, item)
} else {
// Fetch feed info for this item's feed
var fTitle, fDesc, fType, fHost, fStatus sql.NullString
c.db.QueryRow(`
SELECT title, description, type, sourceHost, status
FROM feeds WHERE url = ?
`, feedUrl).Scan(&fTitle, &fDesc, &fType, &fHost, &fStatus)
results[feedUrl] = &SearchResult{
Feed: SearchFeed{
URL: feedUrl,
Title: fTitle.String,
Description: fDesc.String,
Type: fType.String,
SourceHost: fHost.String,
Status: fStatus.String,
},
Items: []SearchItem{item},
}
}
}
}
// Convert map to slice
var resultList []SearchResult
for _, r := range results {
resultList = append(resultList, *r)
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(resultList)
}
func (c *Crawler) handleDashboard(w http.ResponseWriter, r *http.Request) {
stats, err := c.GetDashboardStats()
if err != nil {
@@ -228,14 +620,28 @@ func (c *Crawler) handleDashboard(w http.ResponseWriter, r *http.Request) {
}
funcMap := template.FuncMap{
"divf": func(a, b int) float64 {
"pct": func(a, b int) float64 {
if b == 0 {
return 0
}
return float64(a) / float64(b)
return float64(a) * 100.0 / float64(b)
},
"mulf": func(a int, b float64) float64 {
return float64(a) * b
"comma": func(n interface{}) string {
var val int
switch v := n.(type) {
case int:
val = v
case int32:
val = int(v)
case int64:
val = int(v)
default:
return "0"
}
if val < 0 {
return "-" + commaFormat(-val)
}
return commaFormat(val)
},
}
@@ -265,58 +671,8 @@ const dashboardHTML = `<!DOCTYPE html>
<head>
<title>1440.news Feed Crawler</title>
<meta charset="utf-8">
<meta http-equiv="refresh" content="5">
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, monospace;
background: #0a0a0a;
color: #e0e0e0;
padding: 20px;
line-height: 1.6;
}
h1 { color: #fff; margin-bottom: 20px; font-size: 24px; }
h2 { color: #888; margin: 20px 0 10px; font-size: 14px; text-transform: uppercase; letter-spacing: 1px; }
.grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin-bottom: 20px; }
.card {
background: #151515;
border: 1px solid #252525;
border-radius: 8px;
padding: 15px;
}
.stat-value { font-size: 32px; font-weight: bold; color: #fff; }
.stat-label { font-size: 12px; color: #666; text-transform: uppercase; }
.stat-row { display: flex; justify-content: space-between; padding: 5px 0; border-bottom: 1px solid #202020; }
.stat-row:last-child { border-bottom: none; }
.progress-bar {
background: #202020;
border-radius: 4px;
height: 8px;
margin-top: 10px;
overflow: hidden;
}
.progress-fill {
background: linear-gradient(90deg, #00aa55, #00cc66);
height: 100%;
transition: width 0.3s;
}
table { width: 100%; border-collapse: collapse; }
th, td { text-align: left; padding: 8px; border-bottom: 1px solid #202020; }
th { color: #666; font-size: 11px; text-transform: uppercase; }
td { font-size: 13px; }
.type-rss { color: #f90; }
.type-atom { color: #09f; }
.type-unknown { color: #666; }
.url {
max-width: 400px;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
color: #4a9eff;
}
.time { color: #666; font-size: 12px; }
.updated { color: #444; font-size: 11px; text-align: right; margin-top: 20px; }
</style>
<link rel="stylesheet" href="/static/dashboard.css">
<script src="/static/dashboard.js"></script>
</head>
<body>
<h1>1440.news Feed Crawler</h1>
@@ -324,99 +680,63 @@ const dashboardHTML = `<!DOCTYPE html>
<h2>Crawl Progress</h2>
<div class="grid">
<div class="card">
<div class="stat-value">{{.TotalDomains}}</div>
<div class="stat-label">Total Domains</div>
<div class="stat-value" id="totalDomains">{{comma .TotalDomains}}</div>
<div class="stat-label">Domains</div>
</div>
<div class="card">
<div class="stat-value">{{.CrawledDomains}}</div>
<div class="stat-label">Crawled</div>
{{if .TotalDomains}}
<div class="stat-value" id="checkedDomains">{{comma .CheckedDomains}}</div>
<div class="stat-label">Checked</div>
<div class="progress-bar">
<div class="progress-fill" style="width: {{printf "%.1f" (divf (mulf .CrawledDomains 100.0) .TotalDomains)}}%"></div>
<div class="progress-fill" id="crawlProgress" style="width: {{printf "%.1f" (pct .CheckedDomains .TotalDomains)}}%"></div>
</div>
{{end}}
</div>
<div class="card">
<div class="stat-value">{{.UncrawledDomains}}</div>
<div class="stat-label">Uncrawled</div>
<div class="stat-value" id="uncheckedDomains">{{comma .UncheckedDomains}}</div>
<div class="stat-label">Unchecked</div>
</div>
<div class="card">
<div class="stat-value">{{.ErrorDomains}}</div>
<div class="stat-label">Errors</div>
<div class="stat-value" id="crawlRate">{{comma .CrawlRate}}</div>
<div class="stat-label">crawls per min</div>
</div>
<div class="card">
<div class="stat-value" id="checkRate">{{comma .CheckRate}}</div>
<div class="stat-label">checks per min</div>
</div>
</div>
<h2>Feeds Discovered</h2>
<div class="grid">
<div class="card">
<div class="stat-value">{{.TotalFeeds}}</div>
<div class="stat-value" id="totalFeeds">{{comma .TotalFeeds}}</div>
<div class="stat-label">Total Feeds</div>
</div>
<div class="card">
<div class="stat-value" style="color: #f90">{{.RSSFeeds}}</div>
<div class="stat-value" style="color: #f90" id="rssFeeds">{{comma .RSSFeeds}}</div>
<div class="stat-label">RSS Feeds</div>
</div>
<div class="card">
<div class="stat-value" style="color: #09f">{{.AtomFeeds}}</div>
<div class="stat-value" style="color: #09f" id="atomFeeds">{{comma .AtomFeeds}}</div>
<div class="stat-label">Atom Feeds</div>
</div>
<div class="card">
<div class="stat-value" style="color: #666">{{.UnknownFeeds}}</div>
<div class="stat-value" style="color: #666" id="unknownFeeds">{{comma .UnknownFeeds}}</div>
<div class="stat-label">Unknown Type</div>
</div>
</div>
<div class="grid" style="grid-template-columns: 1fr 1fr;">
<div class="card">
<h2 style="margin-top: 0;">Top TLDs</h2>
{{range .TopTLDs}}
<div class="stat-row">
<span>.{{.TLD}}</span>
<span>{{.Count}}</span>
</div>
{{else}}
<div style="color: #444;">No data yet</div>
{{end}}
</div>
<div class="card">
<h2 style="margin-top: 0;">Top Domains</h2>
{{range .TopDomains}}
<div class="stat-row">
<span>{{.Host}}</span>
<span>{{.FeedsFound}}</span>
</div>
{{else}}
<div style="color: #444;">No data yet</div>
{{end}}
</div>
</div>
<h2>Recent Feeds</h2>
<div class="card">
<table>
<thead>
<tr>
<th>URL</th>
<th>Title</th>
<th>Type</th>
<th>Discovered</th>
</tr>
</thead>
<tbody>
{{range .RecentFeeds}}
<tr>
<td class="url">{{.URL}}</td>
<td>{{if .Title}}{{.Title}}{{else}}-{{end}}</td>
<td class="type-{{.Type}}">{{.Type}}</td>
<td class="time">{{.DiscoveredAt.Format "15:04:05"}}</td>
</tr>
{{else}}
<tr><td colspan="4" style="color: #444;">No feeds discovered yet</td></tr>
{{end}}
</tbody>
</table>
<h2 style="margin-top: 0;">Feeds</h2>
<div style="margin-bottom: 15px;">
<input type="text" id="searchInput" placeholder="Search feeds and items..."
style="width: 100%; padding: 10px; background: #0a0a0a; border: 1px solid #333; border-radius: 4px; color: #fff; font-size: 14px;">
</div>
<div id="searchResults" style="display: none;"></div>
<div id="allDomainsContainer">
<div id="allDomains"></div>
<div id="allDomainsLoading" style="text-align: center; padding: 10px; color: #666;">Loading...</div>
</div>
</div>
<div class="updated">Last updated: {{.UpdatedAt.Format "2006-01-02 15:04:05"}}</div>
<div class="updated" id="updatedAt">Last updated: {{.UpdatedAt.Format "2006-01-02 15:04:05"}}</div>
</body>
</html>`
+192
@@ -0,0 +1,192 @@
package main
import (
"database/sql"
"fmt"
_ "modernc.org/sqlite"
)
const schema = `
CREATE TABLE IF NOT EXISTS domains (
host TEXT PRIMARY KEY,
status TEXT NOT NULL DEFAULT 'unchecked',
discoveredAt DATETIME NOT NULL,
lastCrawledAt DATETIME,
feedsFound INTEGER DEFAULT 0,
lastError TEXT,
tld TEXT
);
CREATE INDEX IF NOT EXISTS idx_domains_status ON domains(status);
CREATE INDEX IF NOT EXISTS idx_domains_tld ON domains(tld);
CREATE INDEX IF NOT EXISTS idx_domains_feedsFound ON domains(feedsFound DESC) WHERE feedsFound > 0;
CREATE TABLE IF NOT EXISTS feeds (
url TEXT PRIMARY KEY,
type TEXT,
title TEXT,
description TEXT,
language TEXT,
siteUrl TEXT,
discoveredAt DATETIME NOT NULL,
lastCrawledAt DATETIME,
nextCrawlAt DATETIME,
lastBuildDate DATETIME,
etag TEXT,
lastModified TEXT,
ttlMinutes INTEGER,
updatePeriod TEXT,
updateFreq INTEGER,
status TEXT DEFAULT 'active',
errorCount INTEGER DEFAULT 0,
lastError TEXT,
lastErrorAt DATETIME,
sourceUrl TEXT,
sourceHost TEXT,
tld TEXT,
itemCount INTEGER,
avgPostFreqHrs REAL,
oldestItemDate DATETIME,
newestItemDate DATETIME,
noUpdate INTEGER DEFAULT 0
);
CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost ON feeds(sourceHost);
CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost_url ON feeds(sourceHost, url);
CREATE INDEX IF NOT EXISTS idx_feeds_tld ON feeds(tld);
CREATE INDEX IF NOT EXISTS idx_feeds_tld_sourceHost ON feeds(tld, sourceHost);
CREATE INDEX IF NOT EXISTS idx_feeds_type ON feeds(type);
CREATE INDEX IF NOT EXISTS idx_feeds_status ON feeds(status);
CREATE INDEX IF NOT EXISTS idx_feeds_discoveredAt ON feeds(discoveredAt);
CREATE INDEX IF NOT EXISTS idx_feeds_title ON feeds(title);
CREATE TABLE IF NOT EXISTS items (
id INTEGER PRIMARY KEY AUTOINCREMENT,
feedUrl TEXT NOT NULL,
guid TEXT,
title TEXT,
link TEXT,
description TEXT,
content TEXT,
author TEXT,
pubDate DATETIME,
discoveredAt DATETIME NOT NULL,
updatedAt DATETIME,
UNIQUE(feedUrl, guid)
);
CREATE INDEX IF NOT EXISTS idx_items_feedUrl ON items(feedUrl);
CREATE INDEX IF NOT EXISTS idx_items_pubDate ON items(pubDate DESC);
CREATE INDEX IF NOT EXISTS idx_items_link ON items(link);
CREATE INDEX IF NOT EXISTS idx_items_feedUrl_pubDate ON items(feedUrl, pubDate DESC);
-- Full-text search for feeds
CREATE VIRTUAL TABLE IF NOT EXISTS feeds_fts USING fts5(
url,
title,
description,
content='feeds',
content_rowid='rowid'
);
-- Triggers to keep FTS in sync
CREATE TRIGGER IF NOT EXISTS feeds_ai AFTER INSERT ON feeds BEGIN
INSERT INTO feeds_fts(rowid, url, title, description)
VALUES (NEW.rowid, NEW.url, NEW.title, NEW.description);
END;
CREATE TRIGGER IF NOT EXISTS feeds_ad AFTER DELETE ON feeds BEGIN
INSERT INTO feeds_fts(feeds_fts, rowid, url, title, description)
VALUES ('delete', OLD.rowid, OLD.url, OLD.title, OLD.description);
END;
CREATE TRIGGER IF NOT EXISTS feeds_au AFTER UPDATE ON feeds BEGIN
INSERT INTO feeds_fts(feeds_fts, rowid, url, title, description)
VALUES ('delete', OLD.rowid, OLD.url, OLD.title, OLD.description);
INSERT INTO feeds_fts(rowid, url, title, description)
VALUES (NEW.rowid, NEW.url, NEW.title, NEW.description);
END;
-- Full-text search for items
CREATE VIRTUAL TABLE IF NOT EXISTS items_fts USING fts5(
title,
description,
content,
author,
content='items',
content_rowid='id'
);
-- Triggers to keep items FTS in sync
CREATE TRIGGER IF NOT EXISTS items_ai AFTER INSERT ON items BEGIN
INSERT INTO items_fts(rowid, title, description, content, author)
VALUES (NEW.id, NEW.title, NEW.description, NEW.content, NEW.author);
END;
CREATE TRIGGER IF NOT EXISTS items_ad AFTER DELETE ON items BEGIN
INSERT INTO items_fts(items_fts, rowid, title, description, content, author)
VALUES ('delete', OLD.id, OLD.title, OLD.description, OLD.content, OLD.author);
END;
CREATE TRIGGER IF NOT EXISTS items_au AFTER UPDATE ON items BEGIN
INSERT INTO items_fts(items_fts, rowid, title, description, content, author)
VALUES ('delete', OLD.id, OLD.title, OLD.description, OLD.content, OLD.author);
INSERT INTO items_fts(rowid, title, description, content, author)
VALUES (NEW.id, NEW.title, NEW.description, NEW.content, NEW.author);
END;
`
func OpenDatabase(dbPath string) (*sql.DB, error) {
fmt.Printf("Opening database: %s\n", dbPath)
// Set pragmas in the connection string so they apply to every pooled connection
connStr := dbPath + "?_pragma=busy_timeout(10000)&_pragma=journal_mode(WAL)&_pragma=foreign_keys(ON)"
db, err := sql.Open("sqlite", connStr)
if err != nil {
return nil, fmt.Errorf("failed to open database: %v", err)
}
// Allow multiple readers (WAL mode supports concurrent reads)
// SQLite is single-writer, but reads can happen concurrently
db.SetMaxOpenConns(4)
// Verify connection and show journal mode
var journalMode string
if err := db.QueryRow("PRAGMA journal_mode").Scan(&journalMode); err != nil {
fmt.Printf(" Warning: could not query journal_mode: %v\n", err)
} else {
fmt.Printf(" Journal mode: %s\n", journalMode)
}
// Create schema
if _, err := db.Exec(schema); err != nil {
db.Close()
return nil, fmt.Errorf("failed to create schema: %v", err)
}
fmt.Println(" Schema OK")
// Run stats and ANALYZE in background to avoid blocking startup with large databases
go func() {
var domainCount, feedCount int
db.QueryRow("SELECT COUNT(*) FROM domains").Scan(&domainCount)
db.QueryRow("SELECT COUNT(*) FROM feeds").Scan(&feedCount)
fmt.Printf(" Existing data: %d domains, %d feeds\n", domainCount, feedCount)
fmt.Println(" Running ANALYZE...")
if _, err := db.Exec("ANALYZE"); err != nil {
fmt.Printf(" Warning: ANALYZE failed: %v\n", err)
} else {
fmt.Println(" ANALYZE complete")
}
}()
return db, nil
}
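feeds_fts and items_fts are external-content FTS5 tables that the triggers above keep in sync from the moment they exist; rows inserted before the triggers were created (for example, data migrated from the old Pebble store) are not indexed automatically. A minimal backfill sketch using FTS5's 'rebuild' command, assuming an already-opened *sql.DB:

// rebuildSearchIndexes repopulates the external-content FTS5 indexes from
// their content tables (feeds and items).
func rebuildSearchIndexes(db *sql.DB) error {
	for _, stmt := range []string{
		"INSERT INTO feeds_fts(feeds_fts) VALUES('rebuild')",
		"INSERT INTO items_fts(items_fts) VALUES('rebuild')",
	} {
		if _, err := db.Exec(stmt); err != nil {
			return err
		}
	}
	return nil
}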
+296 -122
@@ -3,20 +3,19 @@ package main
import (
"bufio"
"compress/gzip"
"encoding/json"
"database/sql"
"fmt"
"io"
"os"
"strings"
"sync/atomic"
"time"
"github.com/cockroachdb/pebble"
)
// Domain represents a host to be crawled for feeds
type Domain struct {
Host string `json:"host"` // Normalized hostname (no scheme, no www.)
Status string `json:"status"` // "uncrawled", "crawled", "error"
Host string `json:"host"`
Status string `json:"status"`
DiscoveredAt time.Time `json:"discovered_at"`
LastCrawledAt time.Time `json:"last_crawled_at,omitempty"`
FeedsFound int `json:"feeds_found,omitempty"`
@@ -24,130 +23,162 @@ type Domain struct {
TLD string `json:"tld,omitempty"`
}
// saveDomain stores a domain in PebbleDB
// saveDomain stores a domain in SQLite
func (c *Crawler) saveDomain(domain *Domain) error {
data, err := json.Marshal(domain)
if err != nil {
return fmt.Errorf("failed to marshal domain: %v", err)
}
key := []byte("domain:" + domain.Host)
return c.db.Set(key, data, pebble.Sync)
_, err := c.db.Exec(`
INSERT INTO domains (host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld)
VALUES (?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(host) DO UPDATE SET
status = excluded.status,
lastCrawledAt = excluded.lastCrawledAt,
feedsFound = excluded.feedsFound,
lastError = excluded.lastError,
tld = excluded.tld
`, domain.Host, domain.Status, domain.DiscoveredAt, nullTime(domain.LastCrawledAt),
domain.FeedsFound, nullString(domain.LastError), domain.TLD)
return err
}
// getDomain retrieves a domain from PebbleDB
func (c *Crawler) getDomain(host string) (*Domain, error) {
key := []byte("domain:" + normalizeHost(host))
data, closer, err := c.db.Get(key)
if err != nil {
if err == pebble.ErrNotFound {
return nil, nil
}
return nil, err
}
defer closer.Close()
var domain Domain
if err := json.Unmarshal(data, &domain); err != nil {
return nil, fmt.Errorf("failed to unmarshal domain: %v", err)
}
return &domain, nil
// saveDomainTx stores a domain using a transaction
func (c *Crawler) saveDomainTx(tx *sql.Tx, domain *Domain) error {
_, err := tx.Exec(`
INSERT INTO domains (host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld)
VALUES (?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(host) DO NOTHING
`, domain.Host, domain.Status, domain.DiscoveredAt, nullTime(domain.LastCrawledAt),
domain.FeedsFound, nullString(domain.LastError), domain.TLD)
return err
}
// domainExists checks if a domain already exists in the database
func (c *Crawler) domainExists(host string) bool {
key := []byte("domain:" + normalizeHost(host))
_, closer, err := c.db.Get(key)
if err != nil {
return false
}
closer.Close()
return true
var exists bool
err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM domains WHERE host = ?)", normalizeHost(host)).Scan(&exists)
return err == nil && exists
}
// GetUncrawledDomains returns all domains with status "uncrawled"
func (c *Crawler) GetUncrawledDomains() ([]*Domain, error) {
var domains []*Domain
// getDomain retrieves a domain from SQLite
func (c *Crawler) getDomain(host string) (*Domain, error) {
domain := &Domain{}
var lastCrawledAt sql.NullTime
var lastError sql.NullString
iter, err := c.db.NewIter(&pebble.IterOptions{
LowerBound: []byte("domain:"),
UpperBound: []byte("domain:\xff"),
})
err := c.db.QueryRow(`
SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld
FROM domains WHERE host = ?
`, normalizeHost(host)).Scan(
&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCrawledAt,
&domain.FeedsFound, &lastError, &domain.TLD,
)
if err == sql.ErrNoRows {
return nil, nil
}
if err != nil {
return nil, err
}
defer iter.Close()
for iter.First(); iter.Valid(); iter.Next() {
var domain Domain
if err := json.Unmarshal(iter.Value(), &domain); err != nil {
continue
}
if domain.Status == "uncrawled" {
domains = append(domains, &domain)
}
if lastCrawledAt.Valid {
domain.LastCrawledAt = lastCrawledAt.Time
}
if lastError.Valid {
domain.LastError = lastError.String
}
if err := iter.Error(); err != nil {
return domain, nil
}
// GetUncheckedDomains returns all domains with status "unchecked"
func (c *Crawler) GetUncheckedDomains() ([]*Domain, error) {
rows, err := c.db.Query(`
SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld
FROM domains WHERE status = 'unchecked'
`)
if err != nil {
return nil, err
}
defer rows.Close()
return domains, nil
return c.scanDomains(rows)
}
// GetUncheckedDomainsRandom returns up to limit unchecked domains in random order
func (c *Crawler) GetUncheckedDomainsRandom(limit int) ([]*Domain, error) {
rows, err := c.db.Query(`
SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld
FROM domains WHERE status = 'unchecked'
ORDER BY RANDOM()
LIMIT ?
`, limit)
if err != nil {
return nil, err
}
defer rows.Close()
return c.scanDomains(rows)
}
// scanDomains is a helper to scan multiple domain rows
func (c *Crawler) scanDomains(rows *sql.Rows) ([]*Domain, error) {
var domains []*Domain
for rows.Next() {
domain := &Domain{}
var lastCrawledAt sql.NullTime
var lastError sql.NullString
if err := rows.Scan(
&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCrawledAt,
&domain.FeedsFound, &lastError, &domain.TLD,
); err != nil {
continue
}
if lastCrawledAt.Valid {
domain.LastCrawledAt = lastCrawledAt.Time
}
if lastError.Valid {
domain.LastError = lastError.String
}
domains = append(domains, domain)
}
return domains, rows.Err()
}
// markDomainCrawled updates a domain's status after crawling
func (c *Crawler) markDomainCrawled(host string, feedsFound int, lastError string) error {
domain, err := c.getDomain(host)
if err != nil {
return err
}
if domain == nil {
return fmt.Errorf("domain not found: %s", host)
}
domain.LastCrawledAt = time.Now()
domain.FeedsFound = feedsFound
status := "checked"
if lastError != "" {
domain.Status = "error"
domain.LastError = lastError
} else {
domain.Status = "crawled"
domain.LastError = ""
status = "error"
}
return c.saveDomain(domain)
var err error
if lastError != "" {
_, err = c.db.Exec(`
UPDATE domains SET status = ?, lastCrawledAt = ?, feedsFound = ?, lastError = ?
WHERE host = ?
`, status, time.Now(), feedsFound, lastError, normalizeHost(host))
} else {
_, err = c.db.Exec(`
UPDATE domains SET status = ?, lastCrawledAt = ?, feedsFound = ?, lastError = NULL
WHERE host = ?
`, status, time.Now(), feedsFound, normalizeHost(host))
}
return err
}
// GetDomainCount returns the total number of domains in the database
func (c *Crawler) GetDomainCount() (total int, uncrawled int, err error) {
iter, err := c.db.NewIter(&pebble.IterOptions{
LowerBound: []byte("domain:"),
UpperBound: []byte("domain:\xff"),
})
func (c *Crawler) GetDomainCount() (total int, unchecked int, err error) {
err = c.db.QueryRow("SELECT COUNT(*) FROM domains").Scan(&total)
if err != nil {
return 0, 0, err
}
defer iter.Close()
for iter.First(); iter.Valid(); iter.Next() {
total++
var domain Domain
if err := json.Unmarshal(iter.Value(), &domain); err != nil {
continue
}
if domain.Status == "uncrawled" {
uncrawled++
}
}
if err := iter.Error(); err != nil {
return 0, 0, err
}
return total, uncrawled, nil
err = c.db.QueryRow("SELECT COUNT(*) FROM domains WHERE status = 'unchecked'").Scan(&unchecked)
return total, unchecked, err
}
// ImportDomainsFromFile reads a vertices file and stores new domains as "uncrawled"
// ImportDomainsFromFile reads a vertices file and stores new domains as "unchecked"
func (c *Crawler) ImportDomainsFromFile(filename string, limit int) (imported int, skipped int, err error) {
file, err := os.Open(filename)
if err != nil {
@@ -158,6 +189,110 @@ func (c *Crawler) ImportDomainsFromFile(filename string, limit int) (imported in
return c.parseAndStoreDomains(file, limit)
}
// ImportDomainsInBackground starts domain import in a background goroutine
func (c *Crawler) ImportDomainsInBackground(filename string) {
go func() {
file, err := os.Open(filename)
if err != nil {
fmt.Printf("Failed to open vertices file: %v\n", err)
return
}
defer file.Close()
var bodyReader io.Reader
bufReader := bufio.NewReader(file)
peekBytes, err := bufReader.Peek(2)
if err != nil && err != io.EOF {
fmt.Printf("Failed to peek at file: %v\n", err)
return
}
if len(peekBytes) >= 2 && peekBytes[0] == 0x1f && peekBytes[1] == 0x8b {
gzReader, err := gzip.NewReader(bufReader)
if err != nil {
fmt.Printf("Failed to create gzip reader: %v\n", err)
return
}
defer gzReader.Close()
bodyReader = gzReader
} else {
bodyReader = bufReader
}
scanner := bufio.NewScanner(bodyReader)
buf := make([]byte, 0, 64*1024)
scanner.Buffer(buf, 1024*1024)
const batchSize = 10000
now := time.Now()
nowStr := now.Format("2006-01-02 15:04:05")
totalImported := 0
batchCount := 0
type domainEntry struct {
host string
tld string
}
for {
// Read and canonicalize batch
var domains []domainEntry
for len(domains) < batchSize && scanner.Scan() {
line := scanner.Text()
parts := strings.Split(line, "\t")
if len(parts) >= 2 {
reverseHostName := strings.TrimSpace(parts[1])
if reverseHostName != "" {
host := normalizeHost(reverseHost(reverseHostName))
domains = append(domains, domainEntry{host: host, tld: getTLD(host)})
}
}
}
if len(domains) == 0 {
break
}
// Build bulk INSERT statement
var sb strings.Builder
sb.WriteString("INSERT INTO domains (host, status, discoveredAt, tld) VALUES ")
args := make([]interface{}, 0, len(domains)*4)
for i, d := range domains {
if i > 0 {
sb.WriteString(",")
}
sb.WriteString("(?, 'unchecked', ?, ?)")
args = append(args, d.host, nowStr, d.tld)
}
sb.WriteString(" ON CONFLICT(host) DO NOTHING")
// Execute bulk insert
result, err := c.db.Exec(sb.String(), args...)
imported := 0
if err != nil {
fmt.Printf("Bulk insert error: %v\n", err)
} else {
rowsAffected, _ := result.RowsAffected()
imported = int(rowsAffected)
}
batchCount++
totalImported += imported
atomic.AddInt32(&c.domainsImported, int32(imported))
// Wait 1 second before the next batch
time.Sleep(1 * time.Second)
}
if err := scanner.Err(); err != nil {
fmt.Printf("Error reading vertices file: %v\n", err)
}
fmt.Printf("Background import complete: %d domains imported\n", totalImported)
}()
}
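// Note on parameter limits: each 10,000-row batch above binds 3 parameters per
// row (30,000 total), which fits under the 32,766 host-parameter default of
// recent SQLite builds; older builds defaulting to 999 would need a much
// smaller batch size.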
func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported int, skipped int, err error) {
var bodyReader io.Reader
@@ -183,39 +318,63 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
scanner.Buffer(buf, 1024*1024)
now := time.Now()
nowStr := now.Format("2006-01-02 15:04:05")
count := 0
const batchSize = 1000
for scanner.Scan() {
if limit > 0 && count >= limit {
type domainEntry struct {
host string
tld string
}
for {
// Read and canonicalize batch
var domains []domainEntry
for len(domains) < batchSize && scanner.Scan() {
if limit > 0 && count >= limit {
break
}
line := scanner.Text()
parts := strings.Split(line, "\t")
if len(parts) >= 2 {
reverseHostName := strings.TrimSpace(parts[1])
if reverseHostName != "" {
host := normalizeHost(reverseHost(reverseHostName))
domains = append(domains, domainEntry{host: host, tld: getTLD(host)})
count++
}
}
}
if len(domains) == 0 {
break
}
line := scanner.Text()
parts := strings.Split(line, "\t")
if len(parts) >= 2 {
reverseHostName := strings.TrimSpace(parts[1])
if reverseHostName != "" {
host := normalizeHost(reverseHost(reverseHostName))
count++
// Skip if domain already exists
if c.domainExists(host) {
skipped++
continue
}
// Store new domain as uncrawled
domain := &Domain{
Host: host,
Status: "uncrawled",
DiscoveredAt: now,
TLD: getTLD(host),
}
if err := c.saveDomain(domain); err != nil {
continue
}
imported++
// Build bulk INSERT statement
var sb strings.Builder
sb.WriteString("INSERT INTO domains (host, status, discoveredAt, tld) VALUES ")
args := make([]interface{}, 0, len(domains)*4)
for i, d := range domains {
if i > 0 {
sb.WriteString(",")
}
sb.WriteString("(?, 'unchecked', ?, ?)")
args = append(args, d.host, nowStr, d.tld)
}
sb.WriteString(" ON CONFLICT(host) DO NOTHING")
// Execute bulk insert
result, execErr := c.db.Exec(sb.String(), args...)
if execErr != nil {
skipped += len(domains)
continue
}
rowsAffected, _ := result.RowsAffected()
imported += int(rowsAffected)
skipped += len(domains) - int(rowsAffected)
if limit > 0 && count >= limit {
break
}
}
@@ -225,3 +384,18 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
return imported, skipped, nil
}
// Helper functions for SQL null handling
func nullTime(t time.Time) sql.NullTime {
if t.IsZero() {
return sql.NullTime{}
}
return sql.NullTime{Time: t, Valid: true}
}
func nullString(s string) sql.NullString {
if s == "" {
return sql.NullString{}
}
return sql.NullString{String: s, Valid: true}
}
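// nullString("") and nullTime(time.Time{}) are stored as SQL NULL, so empty
// strings and zero times land in the tables as NULL rather than as "" or a
// zero timestamp.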
+734 -69
@@ -1,15 +1,86 @@
package main
import (
"encoding/json"
"database/sql"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"sync/atomic"
"time"
"github.com/cockroachdb/pebble"
)
// shouldSkipFeed checks if a feed URL should be filtered out
// Returns true (and a reason) if the feed should be skipped
func shouldSkipFeed(feedURL string) (bool, string) {
lower := strings.ToLower(feedURL)
// Skip explicit comment feeds
if strings.Contains(lower, "/comment") {
return true, "comment feed"
}
u, err := url.Parse(feedURL)
if err != nil {
return false, ""
}
path := strings.ToLower(strings.TrimSuffix(u.Path, "/"))
// Skip category/tag feeds
categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/", "/author/"}
for _, pattern := range categoryPatterns {
if strings.Contains(path, pattern) {
return true, "category/tag feed"
}
}
// Check for article comment feeds (path ending in /feed with content before it)
if strings.HasSuffix(path, "/feed") {
basePath := strings.TrimSuffix(path, "/feed")
basePath = strings.Trim(basePath, "/")
if basePath == "" {
return false, "" // Just /feed - legitimate main feed
}
// Skip if path contains date patterns (likely article)
if matched, _ := regexp.MatchString(`/\d{4}/\d{2}`, basePath); matched {
return true, "article feed (date pattern)"
}
// Skip if path has multiple segments (likely article or nested content)
segments := strings.Split(basePath, "/")
if len(segments) >= 2 {
return true, "article feed (nested path)"
}
// Skip if single segment looks like an article slug (contains hyphens, is long)
if len(segments) == 1 && (strings.Contains(segments[0], "-") && len(segments[0]) > 20) {
return true, "article feed (slug pattern)"
}
}
return false, ""
}
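// Examples of how these rules apply:
//   https://example.com/feed                                  -> kept (main site feed)
//   https://example.com/comments/feed                         -> skipped: comment feed
//   https://example.com/category/news/feed                    -> skipped: category/tag feed
//   https://example.com/blog/2024/05/feed                     -> skipped: article feed (date pattern)
//   https://example.com/my-very-long-article-slug-title/feed  -> skipped: article feed (slug pattern)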
// Item represents an individual entry/article from a feed
type Item struct {
ID int64 `json:"id,omitempty"`
FeedURL string `json:"feed_url"`
GUID string `json:"guid,omitempty"`
Title string `json:"title,omitempty"`
Link string `json:"link,omitempty"`
Description string `json:"description,omitempty"`
Content string `json:"content,omitempty"`
Author string `json:"author,omitempty"`
PubDate time.Time `json:"pub_date,omitempty"`
DiscoveredAt time.Time `json:"discovered_at"`
UpdatedAt time.Time `json:"updated_at,omitempty"`
}
// Feed represents a discovered RSS/Atom feed with metadata
type Feed struct {
URL string `json:"url"`
@@ -50,99 +121,548 @@ type Feed struct {
AvgPostFreqHrs float64 `json:"avg_post_freq_hrs,omitempty"` // Average hours between posts
OldestItemDate time.Time `json:"oldest_item_date,omitempty"`
NewestItemDate time.Time `json:"newest_item_date,omitempty"`
// Adaptive check interval
NoUpdate int `json:"no_update"` // Consecutive checks with no change
}
// saveFeed stores a feed in PebbleDB
// saveFeed stores a feed in SQLite
func (c *Crawler) saveFeed(feed *Feed) error {
data, err := json.Marshal(feed)
if err != nil {
return fmt.Errorf("failed to marshal feed: %v", err)
}
key := []byte("feed:" + feed.URL)
return c.db.Set(key, data, pebble.Sync)
_, err := c.db.Exec(`
INSERT INTO feeds (
url, type, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
type = excluded.type,
title = excluded.title,
description = excluded.description,
language = excluded.language,
siteUrl = excluded.siteUrl,
lastCrawledAt = excluded.lastCrawledAt,
nextCrawlAt = excluded.nextCrawlAt,
lastBuildDate = excluded.lastBuildDate,
etag = excluded.etag,
lastModified = excluded.lastModified,
ttlMinutes = excluded.ttlMinutes,
updatePeriod = excluded.updatePeriod,
updateFreq = excluded.updateFreq,
status = excluded.status,
errorCount = excluded.errorCount,
lastError = excluded.lastError,
lastErrorAt = excluded.lastErrorAt,
itemCount = excluded.itemCount,
avgPostFreqHrs = excluded.avgPostFreqHrs,
oldestItemDate = excluded.oldestItemDate,
newestItemDate = excluded.newestItemDate,
noUpdate = excluded.noUpdate
`,
feed.URL, feed.Type, nullString(feed.Title), nullString(feed.Description),
nullString(feed.Language), nullString(feed.SiteURL),
feed.DiscoveredAt, nullTime(feed.LastCrawledAt), nullTime(feed.NextCrawlAt), nullTime(feed.LastBuildDate),
nullString(feed.ETag), nullString(feed.LastModified),
feed.TTLMinutes, nullString(feed.UpdatePeriod), feed.UpdateFreq,
feed.Status, feed.ErrorCount, nullString(feed.LastError), nullTime(feed.LastErrorAt),
nullString(feed.SourceURL), nullString(feed.SourceHost), nullString(feed.TLD),
feed.ItemCount, feed.AvgPostFreqHrs, nullTime(feed.OldestItemDate), nullTime(feed.NewestItemDate),
feed.NoUpdate,
)
return err
}
// getFeed retrieves a feed from PebbleDB
// getFeed retrieves a feed from SQLite
func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
key := []byte("feed:" + normalizeURL(feedURL))
data, closer, err := c.db.Get(key)
feed := &Feed{}
var title, description, language, siteURL sql.NullString
var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
var avgPostFreqHrs sql.NullFloat64
err := c.db.QueryRow(`
SELECT url, type, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
FROM feeds WHERE url = ?
`, normalizeURL(feedURL)).Scan(
&feed.URL, &feed.Type, &title, &description, &language, &siteURL,
&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
&etag, &lastModified,
&feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
&feed.Status, &feed.ErrorCount, &lastError, &lastErrorAt,
&sourceURL, &sourceHost, &tld,
&feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
&feed.NoUpdate,
)
if err == sql.ErrNoRows {
return nil, nil
}
if err != nil {
if err == pebble.ErrNotFound {
return nil, nil
}
return nil, err
}
defer closer.Close()
var feed Feed
if err := json.Unmarshal(data, &feed); err != nil {
return nil, fmt.Errorf("failed to unmarshal feed: %v", err)
// Handle nullable fields
if title.Valid {
feed.Title = title.String
}
return &feed, nil
if description.Valid {
feed.Description = description.String
}
if language.Valid {
feed.Language = language.String
}
if siteURL.Valid {
feed.SiteURL = siteURL.String
}
if lastCrawledAt.Valid {
feed.LastCrawledAt = lastCrawledAt.Time
}
if nextCrawlAt.Valid {
feed.NextCrawlAt = nextCrawlAt.Time
}
if lastBuildDate.Valid {
feed.LastBuildDate = lastBuildDate.Time
}
if etag.Valid {
feed.ETag = etag.String
}
if lastModified.Valid {
feed.LastModified = lastModified.String
}
if updatePeriod.Valid {
feed.UpdatePeriod = updatePeriod.String
}
if lastError.Valid {
feed.LastError = lastError.String
}
if lastErrorAt.Valid {
feed.LastErrorAt = lastErrorAt.Time
}
if sourceURL.Valid {
feed.SourceURL = sourceURL.String
}
if sourceHost.Valid {
feed.SourceHost = sourceHost.String
}
if tld.Valid {
feed.TLD = tld.String
}
if avgPostFreqHrs.Valid {
feed.AvgPostFreqHrs = avgPostFreqHrs.Float64
}
if oldestItemDate.Valid {
feed.OldestItemDate = oldestItemDate.Time
}
if newestItemDate.Valid {
feed.NewestItemDate = newestItemDate.Time
}
return feed, nil
}
// feedExists checks if a feed URL already exists in the database
func (c *Crawler) feedExists(feedURL string) bool {
key := []byte("feed:" + normalizeURL(feedURL))
_, closer, err := c.db.Get(key)
if err != nil {
return false
}
closer.Close()
return true
var exists bool
err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM feeds WHERE url = ?)", normalizeURL(feedURL)).Scan(&exists)
return err == nil && exists
}
// GetAllFeeds returns all feeds from the database
func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
rows, err := c.db.Query(`
SELECT url, type, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
FROM feeds
`)
if err != nil {
return nil, err
}
defer rows.Close()
return scanFeeds(rows)
}
// GetFeedCount returns the total number of feeds in the database
func (c *Crawler) GetFeedCount() (int, error) {
var count int
err := c.db.QueryRow("SELECT COUNT(*) FROM feeds").Scan(&count)
return count, err
}
// GetFeedCountByHost returns the number of feeds for a specific host
func (c *Crawler) GetFeedCountByHost(host string) (int, error) {
var count int
err := c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE sourceHost = ?", host).Scan(&count)
return count, err
}
// GetFeedsDueForCheck returns feeds where nextCrawlAt <= now, ordered randomly, limited to n
func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
rows, err := c.db.Query(`
SELECT url, type, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
FROM feeds
WHERE nextCrawlAt <= datetime('now') AND status != 'dead'
ORDER BY RANDOM()
LIMIT ?
`, limit)
if err != nil {
return nil, err
}
defer rows.Close()
return scanFeeds(rows)
}
// GetFeedsByHost returns all feeds from a specific host
func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
rows, err := c.db.Query(`
SELECT url, type, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
FROM feeds WHERE sourceHost = ?
`, host)
if err != nil {
return nil, err
}
defer rows.Close()
return scanFeeds(rows)
}
// SearchFeeds performs a full-text search on feeds
func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) {
rows, err := c.db.Query(`
SELECT f.url, f.type, f.title, f.description, f.language, f.siteUrl,
f.discoveredAt, f.lastCrawledAt, f.nextCrawlAt, f.lastBuildDate,
f.etag, f.lastModified,
f.ttlMinutes, f.updatePeriod, f.updateFreq,
f.status, f.errorCount, f.lastError, f.lastErrorAt,
f.sourceUrl, f.sourceHost, f.tld,
f.itemCount, f.avgPostFreqHrs, f.oldestItemDate, f.newestItemDate,
f.noUpdate
FROM feeds f
JOIN feeds_fts fts ON f.rowid = fts.rowid
WHERE feeds_fts MATCH ?
`, query)
if err != nil {
return nil, err
}
defer rows.Close()
return scanFeeds(rows)
}
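SearchFeeds joins against a feeds_fts table that is not defined in the hunks shown here. For the MATCH query and the rowid join to work, OpenDatabase presumably creates an FTS5 table over the searchable columns and keeps it in sync with feeds; a minimal sketch of one way that could look (table columns and trigger strategy are assumptions, not the commit's actual definitions). SearchItems further down relies on an analogous items_fts table keyed by items.id.
// Hypothetical external-content FTS5 setup; the actual definition in
// OpenDatabase may index different columns or sync differently.
const feedsFTSSketch = `
CREATE VIRTUAL TABLE IF NOT EXISTS feeds_fts USING fts5(
	title, description, url,
	content='feeds', content_rowid='rowid'
);
CREATE TRIGGER IF NOT EXISTS feeds_fts_ai AFTER INSERT ON feeds BEGIN
	INSERT INTO feeds_fts(rowid, title, description, url)
	VALUES (new.rowid, new.title, new.description, new.url);
END`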
// scanFeeds is a helper to scan multiple feed rows
func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
var feeds []*Feed
for rows.Next() {
feed := &Feed{}
var title, description, language, siteURL sql.NullString
var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
var avgPostFreqHrs sql.NullFloat64
if err := rows.Scan(
&feed.URL, &feed.Type, &title, &description, &language, &siteURL,
&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
&etag, &lastModified,
&feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
&feed.Status, &feed.ErrorCount, &lastError, &lastErrorAt,
&sourceURL, &sourceHost, &tld,
&feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
&feed.NoUpdate,
); err != nil {
continue
}
// Handle nullable fields
if title.Valid {
feed.Title = title.String
}
if description.Valid {
feed.Description = description.String
}
if language.Valid {
feed.Language = language.String
}
if siteURL.Valid {
feed.SiteURL = siteURL.String
}
if lastCrawledAt.Valid {
feed.LastCrawledAt = lastCrawledAt.Time
}
if nextCrawlAt.Valid {
feed.NextCrawlAt = nextCrawlAt.Time
}
if lastBuildDate.Valid {
feed.LastBuildDate = lastBuildDate.Time
}
if etag.Valid {
feed.ETag = etag.String
}
if lastModified.Valid {
feed.LastModified = lastModified.String
}
if updatePeriod.Valid {
feed.UpdatePeriod = updatePeriod.String
}
if lastError.Valid {
feed.LastError = lastError.String
}
if lastErrorAt.Valid {
feed.LastErrorAt = lastErrorAt.Time
}
if sourceURL.Valid {
feed.SourceURL = sourceURL.String
}
if sourceHost.Valid {
feed.SourceHost = sourceHost.String
}
if tld.Valid {
feed.TLD = tld.String
}
if avgPostFreqHrs.Valid {
feed.AvgPostFreqHrs = avgPostFreqHrs.Float64
}
if oldestItemDate.Valid {
feed.OldestItemDate = oldestItemDate.Time
}
if newestItemDate.Valid {
feed.NewestItemDate = newestItemDate.Time
}
feeds = append(feeds, feed)
}
return feeds, rows.Err()
}
// saveItem stores an item in SQLite (upsert by feedUrl + guid)
func (c *Crawler) saveItem(item *Item) error {
_, err := c.db.Exec(`
INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(feedUrl, guid) DO UPDATE SET
title = excluded.title,
link = excluded.link,
description = excluded.description,
content = excluded.content,
author = excluded.author,
pubDate = excluded.pubDate,
updatedAt = excluded.updatedAt
`,
item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
nullString(item.Description), nullString(item.Content), nullString(item.Author),
nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
)
return err
}
// saveItems stores multiple items efficiently
func (c *Crawler) saveItems(items []*Item) error {
if len(items) == 0 {
return nil
}
tx, err := c.db.Begin()
if err != nil {
return err
}
defer tx.Rollback()
stmt, err := tx.Prepare(`
INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(feedUrl, guid) DO UPDATE SET
title = excluded.title,
link = excluded.link,
description = excluded.description,
content = excluded.content,
author = excluded.author,
pubDate = excluded.pubDate,
updatedAt = excluded.updatedAt
`)
if err != nil {
return err
}
defer stmt.Close()
for _, item := range items {
if item == nil || item.GUID == "" {
continue // Skip nil items or items without GUID
}
_, err := stmt.Exec(
item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
nullString(item.Description), nullString(item.Content), nullString(item.Author),
nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
)
if err != nil {
continue // Skip failed items
}
}
return tx.Commit()
}
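The ON CONFLICT(feedUrl, guid) clauses above only resolve if the items table declares that column pair unique. The actual DDL is in OpenDatabase; the following is a sketch of the assumed shape, matching the columns read back by GetItemsByFeed.
// Hypothetical items table implied by the INSERT/SELECT statements in this file;
// the UNIQUE(feedUrl, guid) constraint is what makes the upsert above work.
const itemsSchemaSketch = `
CREATE TABLE IF NOT EXISTS items (
	id INTEGER PRIMARY KEY AUTOINCREMENT,
	feedUrl TEXT NOT NULL,
	guid TEXT NOT NULL,
	title TEXT, link TEXT, description TEXT, content TEXT, author TEXT,
	pubDate DATETIME,
	discoveredAt DATETIME,
	updatedAt DATETIME,
	UNIQUE(feedUrl, guid)
)`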
// GetItemsByFeed returns all items for a specific feed
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
rows, err := c.db.Query(`
SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt
FROM items
WHERE feedUrl = ?
ORDER BY pubDate DESC
LIMIT ?
`, feedURL, limit)
if err != nil {
return nil, err
}
defer rows.Close()
var items []*Item
for rows.Next() {
item := &Item{}
var guid, title, link, description, content, author sql.NullString
var pubDate, updatedAt sql.NullTime
if err := rows.Scan(
&item.ID, &item.FeedURL, &guid, &title, &link,
&description, &content, &author, &pubDate,
&item.DiscoveredAt, &updatedAt,
); err != nil {
continue
}
if guid.Valid {
item.GUID = guid.String
}
if title.Valid {
item.Title = title.String
}
if link.Valid {
item.Link = link.String
}
if description.Valid {
item.Description = description.String
}
if content.Valid {
item.Content = content.String
}
if author.Valid {
item.Author = author.String
}
if pubDate.Valid {
item.PubDate = pubDate.Time
}
if updatedAt.Valid {
item.UpdatedAt = updatedAt.Time
}
items = append(items, item)
}
return items, rows.Err()
}
// SearchItems performs a full-text search on items
func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
rows, err := c.db.Query(`
SELECT i.id, i.feedUrl, i.guid, i.title, i.link, i.description, i.content, i.author, i.pubDate, i.discoveredAt, i.updatedAt
FROM items i
JOIN items_fts fts ON i.id = fts.rowid
WHERE items_fts MATCH ?
ORDER BY i.pubDate DESC
LIMIT ?
`, query, limit)
if err != nil {
return nil, err
}
defer rows.Close()
var items []*Item
for rows.Next() {
item := &Item{}
var guid, title, link, description, content, author sql.NullString
var pubDate, updatedAt sql.NullTime
if err := rows.Scan(
&item.ID, &item.FeedURL, &guid, &title, &link,
&description, &content, &author, &pubDate,
&item.DiscoveredAt, &updatedAt,
); err != nil {
continue
}
if guid.Valid {
item.GUID = guid.String
}
if title.Valid {
item.Title = title.String
}
if link.Valid {
item.Link = link.String
}
if description.Valid {
item.Description = description.String
}
if content.Valid {
item.Content = content.String
}
if author.Valid {
item.Author = author.String
}
if pubDate.Valid {
item.PubDate = pubDate.Time
}
if updatedAt.Valid {
item.UpdatedAt = updatedAt.Time
}
items = append(items, item)
}
return items, rows.Err()
}
// CleanupOldItems removes items older than 12 months
func (c *Crawler) CleanupOldItems() (int64, error) {
cutoff := time.Now().AddDate(-1, 0, 0) // 12 months ago
result, err := c.db.Exec(`
DELETE FROM items WHERE pubDate < ? AND pubDate IS NOT NULL
`, cutoff)
if err != nil {
return 0, err
}
return result.RowsAffected()
}
// processFeed parses and stores a feed with full metadata
@@ -179,12 +699,13 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
LastModified: headers.Get("Last-Modified"),
}
// Parse feed-specific metadata and items
var items []*Item
switch feedType {
case "rss":
items = c.parseRSSMetadata(body, feed)
case "atom":
items = c.parseAtomMetadata(body, feed)
}
// Calculate next crawl time
@@ -193,11 +714,17 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
if err := c.saveFeed(feed); err != nil {
return
}
// Save items
if len(items) > 0 {
c.saveItems(items)
}
}
// addFeed adds a discovered feed URL (not yet fetched)
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
// Skip comment, category, and article feeds
if skip, _ := shouldSkipFeed(feedURL); skip {
return
}
@@ -231,3 +758,141 @@ func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
return
}
}
// CheckFeed performs a conditional request to check if a feed has been updated
// Returns: changed (bool), error
func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
atomic.AddInt32(&c.feedsChecked, 1)
// Try different scheme/www combinations since we store URLs without scheme
urlVariants := []string{
"https://" + feed.URL,
"http://" + feed.URL,
"https://www." + feed.URL,
"http://www." + feed.URL,
}
var resp *http.Response
var err error
var successURL string
for _, tryURL := range urlVariants {
req, reqErr := http.NewRequest("GET", tryURL, nil)
if reqErr != nil {
continue
}
req.Header.Set("User-Agent", c.UserAgent)
// Add conditional headers if we have them
if feed.ETag != "" {
req.Header.Set("If-None-Match", feed.ETag)
}
if feed.LastModified != "" {
req.Header.Set("If-Modified-Since", feed.LastModified)
}
resp, err = c.client.Do(req)
if err == nil {
successURL = tryURL
break
}
}
_ = successURL // May be used later for logging/debugging
// If no request succeeded, resp will be nil
if resp == nil {
if err == nil {
err = fmt.Errorf("all URL variants failed")
}
now := time.Now()
feed.LastCrawledAt = now
feed.ErrorCount++
feed.NoUpdate++
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.LastError = err.Error()
feed.LastErrorAt = now
feed.Status = "error"
c.saveFeed(feed)
return false, err
}
defer resp.Body.Close()
now := time.Now()
feed.LastCrawledAt = now
// 304 Not Modified - feed hasn't changed
if resp.StatusCode == http.StatusNotModified {
feed.NoUpdate++
// Adaptive backoff: 100s base + 100s per consecutive no-change
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.ErrorCount = 0
feed.LastError = ""
feed.Status = "active"
c.saveFeed(feed)
return false, nil
}
// Non-200 response
if resp.StatusCode != http.StatusOK {
feed.ErrorCount++
feed.NoUpdate++
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.LastError = resp.Status
feed.LastErrorAt = now
if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusGone {
feed.Status = "dead"
} else {
feed.Status = "error"
}
c.saveFeed(feed)
return false, nil
}
// 200 OK - feed has new content
bodyBytes, err := io.ReadAll(resp.Body)
if err != nil {
feed.ErrorCount++
feed.NoUpdate++
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.LastError = err.Error()
feed.LastErrorAt = now
feed.Status = "error"
c.saveFeed(feed)
return false, err
}
body := string(bodyBytes)
// Update cache headers
feed.ETag = resp.Header.Get("ETag")
feed.LastModified = resp.Header.Get("Last-Modified")
// Re-detect type and parse metadata
feedType := c.detectFeedType(body)
feed.Type = feedType
var items []*Item
switch feedType {
case "rss":
items = c.parseRSSMetadata(body, feed)
case "atom":
items = c.parseAtomMetadata(body, feed)
}
// Content changed - reset backoff
feed.NoUpdate = 0
feed.NextCrawlAt = now.Add(100 * time.Second)
feed.ErrorCount = 0
feed.LastError = ""
feed.Status = "active"
c.saveFeed(feed)
// Save items
if len(items) > 0 {
c.saveItems(items)
}
return true, nil
}
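With the backoff above, a feed that has gone unchanged five times in a row is rescheduled 600 seconds out (100 s base + 5 × 100 s), and a single change resets it to 100 s. main.go below starts a StartCheckLoop goroutine that drives CheckFeed, but that function is not part of the hunks shown; as a rough sketch only, such a loop could pair GetFeedsDueForCheck with CheckFeed like this (batch size and idle sleep are assumptions, not the commit's actual values).
// Sketch of a possible check loop; the real StartCheckLoop may batch,
// parallelize, and pace requests differently.
func (c *Crawler) checkLoopSketch() {
	for {
		feeds, err := c.GetFeedsDueForCheck(500) // assumed batch size
		if err != nil || len(feeds) == 0 {
			time.Sleep(10 * time.Second) // assumed idle wait
			continue
		}
		for _, feed := range feeds {
			// CheckFeed records errors and reschedules the feed itself,
			// so the loop simply moves on to the next one.
			c.CheckFeed(feed)
		}
	}
}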
+26 -7
View File
@@ -6,7 +6,13 @@ import (
)
func main() {
crawler, err := NewCrawler("feeds.db")
// Ensure feeds directory exists
if err := os.MkdirAll("feeds", 0755); err != nil {
fmt.Fprintf(os.Stderr, "Error creating feeds directory: %v\n", err)
os.Exit(1)
}
crawler, err := NewCrawler("feeds/feeds.db")
if err != nil {
fmt.Fprintf(os.Stderr, "Error initializing crawler: %v\n", err)
os.Exit(1)
@@ -20,11 +26,24 @@ func main() {
}
}()
// Initialize stats in background (can be slow with large DBs)
go crawler.UpdateStats()
// Start all loops independently
fmt.Println("Starting import, crawl, check, and stats loops...")
// Import loop (background)
go crawler.ImportDomainsInBackground("vertices.txt.gz")
// Check loop (background)
go crawler.StartCheckLoop()
// Stats loop (background) - updates once per minute
go crawler.StartStatsLoop()
// Cleanup loop (background) - removes old items once per week
go crawler.StartCleanupLoop()
// Crawl loop (foreground - blocks forever)
crawler.StartCrawlLoop()
}
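Note that only StartCrawlLoop stays in the foreground; the import, check, stats, and cleanup goroutines live only as long as main does, so if the crawl loop ever returned, the whole process would exit with them.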
+109 -16
View File
@@ -26,9 +26,14 @@ type RSSChannel struct {
}
type RSSItem struct {
Title string `xml:"title"`
Link string `xml:"link"`
GUID string `xml:"guid"`
Description string `xml:"description"`
Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
Author string `xml:"author"`
Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
PubDate string `xml:"pubDate"`
}
// Atom structs for parsing
@@ -40,10 +45,23 @@ type AtomFeed struct {
}
type AtomEntry struct {
ID string `xml:"id"`
Title string `xml:"title"`
Links []AtomLink `xml:"link"`
Summary string `xml:"summary"`
Content AtomContent `xml:"content"`
Author AtomAuthor `xml:"author"`
Updated string `xml:"updated"`
Published string `xml:"published"`
}
type AtomContent struct {
Type string `xml:"type,attr"`
Value string `xml:",chardata"`
}
type AtomAuthor struct {
Name string `xml:"name"`
}
type AtomLink struct {
@@ -52,10 +70,10 @@ type AtomLink struct {
Type string `xml:"type,attr"`
}
func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
var rss RSS
if err := xml.Unmarshal([]byte(body), &rss); err != nil {
return nil
}
ch := rss.Channel
@@ -75,16 +93,47 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) {
}
}
// Parse items
now := time.Now()
var items []*Item
var dates []time.Time
for _, rssItem := range ch.Items {
item := &Item{
FeedURL: feed.URL,
Title: rssItem.Title,
Link: rssItem.Link,
Description: rssItem.Description,
Content: rssItem.Content,
DiscoveredAt: now,
}
// Use GUID if available, otherwise use link
if rssItem.GUID != "" {
item.GUID = rssItem.GUID
} else if rssItem.Link != "" {
item.GUID = rssItem.Link
}
// Author: prefer author, fall back to dc:creator
if rssItem.Author != "" {
item.Author = rssItem.Author
} else if rssItem.Creator != "" {
item.Author = rssItem.Creator
}
// Parse pubDate
if rssItem.PubDate != "" {
if t, err := parseRSSDate(rssItem.PubDate); err == nil {
item.PubDate = t
dates = append(dates, t)
}
}
items = append(items, item)
}
// Calculate date stats
if len(dates) > 0 {
oldest, newest := dates[0], dates[0]
for _, d := range dates {
@@ -103,12 +152,14 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) {
feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
}
}
return items
}
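As a worked example of the average-frequency calculation: if a feed's ten parsed pubDates span 36 hours from oldest to newest (assuming totalHours is that span, which the elided lines compute), AvgPostFreqHrs = 36 / (10 - 1) = 4.0, i.e. roughly one post every four hours.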
func (c *Crawler) parseAtomMetadata(body string, feed *Feed) []*Item {
var atom AtomFeed
if err := xml.Unmarshal([]byte(body), &atom); err != nil {
return nil
}
feed.Title = atom.Title
@@ -131,20 +182,60 @@ func (c *Crawler) parseAtomMetadata(body string, feed *Feed) {
}
}
// Parse entries
now := time.Now()
var items []*Item
var dates []time.Time
for _, entry := range atom.Entries {
item := &Item{
FeedURL: feed.URL,
Title: entry.Title,
Author: entry.Author.Name,
DiscoveredAt: now,
}
// Use ID as GUID
if entry.ID != "" {
item.GUID = entry.ID
}
// Get link (prefer alternate, fall back to first link)
for _, link := range entry.Links {
if link.Rel == "" || link.Rel == "alternate" {
item.Link = link.Href
break
}
}
if item.Link == "" && len(entry.Links) > 0 {
item.Link = entry.Links[0].Href
}
// Use link as GUID fallback if not set
if item.GUID == "" && item.Link != "" {
item.GUID = item.Link
}
// Summary/Content
item.Description = entry.Summary
item.Content = entry.Content.Value
// Parse dates
dateStr := entry.Updated
if dateStr == "" {
dateStr = entry.Published
}
if dateStr != "" {
if t, err := time.Parse(time.RFC3339, dateStr); err == nil {
item.PubDate = t
dates = append(dates, t)
}
}
items = append(items, item)
}
// Calculate date stats
if len(dates) > 0 {
oldest, newest := dates[0], dates[0]
for _, d := range dates {
@@ -163,6 +254,8 @@ func (c *Crawler) parseAtomMetadata(body string, feed *Feed) {
feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
}
}
return items
}
// parseRSSDate attempts to parse various RSS date formats
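parseRSSDate itself is unchanged by this commit and its body is not shown. In practice RSS pubDates mix RFC 1123/822 variants, so such a parser typically tries several layouts in order; a minimal sketch follows (the exact layout list is an assumption).
// Hypothetical multi-layout parser; the real parseRSSDate may support more formats.
func parseRSSDateSketch(s string) (time.Time, error) {
	layouts := []string{
		time.RFC1123Z, // Mon, 02 Jan 2006 15:04:05 -0700
		time.RFC1123,  // Mon, 02 Jan 2006 15:04:05 MST
		time.RFC822Z,
		time.RFC822,
		time.RFC3339,
	}
	s = strings.TrimSpace(s)
	for _, layout := range layouts {
		if t, err := time.Parse(layout, s); err == nil {
			return t, nil
		}
	}
	return time.Time{}, fmt.Errorf("unrecognized date format: %q", s)
}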
+55
View File
@@ -0,0 +1,55 @@
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, monospace;
background: #0a0a0a;
color: #ffffff;
padding: 20px;
line-height: 1.6;
}
h1 { color: #ffffff; margin-bottom: 20px; font-size: 24px; }
h2 { color: #ffffff; margin: 20px 0 10px; font-size: 14px; text-transform: uppercase; letter-spacing: 1px; }
.grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin-bottom: 20px; }
.card {
background: #151515;
border: 1px solid #252525;
border-radius: 8px;
padding: 15px;
}
.stat-value { font-size: 32px; font-weight: bold; color: #ffffff; }
.stat-label { font-size: 12px; color: #ffffff; text-transform: uppercase; }
.stat-row { display: flex; justify-content: space-between; padding: 5px 0; border-bottom: 1px solid #202020; color: #ffffff; }
.stat-row:last-child { border-bottom: none; }
.progress-bar {
background: #202020;
border-radius: 4px;
height: 8px;
margin-top: 10px;
overflow: hidden;
}
.progress-fill {
background: linear-gradient(90deg, #00aa55, #00cc66);
height: 100%;
transition: width 0.3s;
}
table { width: 100%; border-collapse: collapse; color: #ffffff; }
th, td { text-align: left; padding: 8px; border-bottom: 1px solid #202020; }
th { color: #ffffff; font-size: 11px; text-transform: uppercase; }
td { font-size: 13px; color: #ffffff; }
.type-rss { color: #f90; }
.type-atom { color: #09f; }
.type-unknown { color: #ffffff; }
.url {
max-width: 400px;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
color: #4a9eff;
}
.time { color: #ffffff; font-size: 12px; }
.updated { color: #ffffff; font-size: 11px; text-align: right; margin-top: 20px; }
/* Search */
#searchInput:focus { outline: none; border-color: #0af; }
#searchInput::placeholder { color: #555; }
.search-host { margin-bottom: 10px; }
.search-feed:hover { background: #1a1a1a; }
+519
View File
@@ -0,0 +1,519 @@
function initDashboard() {
function commaFormat(n) {
return n.toString().replace(/\B(?=(\d{3})+(?!\d))/g, ',');
}
function escapeHtml(text) {
if (text == null) return '';
const div = document.createElement('div');
div.textContent = text;
return div.innerHTML;
}
// All domains state
let allDomainsOffset = 0;
let allDomainsLoading = false;
let allDomainsEnd = false;
let expandedDomain = null;
let expandedFeed = null;
const PAGE_SIZE = 100;
const PREFETCH_THRESHOLD = 100; // Prefetch when within 100 domains of bottom
// Search state
let searchTimeout = null;
let isSearching = false;
async function loadMoreDomains() {
if (allDomainsLoading || allDomainsEnd) return;
allDomainsLoading = true;
const loadingEl = document.getElementById('allDomainsLoading');
loadingEl.style.display = 'block';
try {
const response = await fetch('/api/allDomains?offset=' + allDomainsOffset + '&limit=' + PAGE_SIZE);
const domains = await response.json();
if (!domains || domains.length === 0) {
allDomainsEnd = true;
loadingEl.style.display = 'none';
return;
}
const container = document.getElementById('allDomains');
domains.forEach(d => {
const row = document.createElement('div');
row.className = 'domain-row';
row.innerHTML =
'<div class="stat-row" style="cursor: pointer;">' +
'<span>' + escapeHtml(d.host) + '</span>' +
'<span>' + commaFormat(d.feeds_found) + '</span>' +
'</div>' +
'<div class="domain-feeds" style="display: none;"></div>';
row.querySelector('.stat-row').addEventListener('click', () => toggleDomainFeeds(d.host, row));
container.appendChild(row);
});
allDomainsOffset += domains.length;
loadingEl.style.display = 'none';
// If we got fewer than PAGE_SIZE, we've reached the end
if (domains.length < PAGE_SIZE) {
allDomainsEnd = true;
}
} catch (err) {
console.error('Failed to load domains:', err);
} finally {
allDomainsLoading = false;
}
}
async function toggleDomainFeeds(host, rowEl) {
const feedsDiv = rowEl.querySelector('.domain-feeds');
// Close previously expanded domain
if (expandedDomain && expandedDomain !== rowEl) {
expandedDomain.querySelector('.domain-feeds').style.display = 'none';
}
// Toggle current
if (feedsDiv.style.display === 'none') {
feedsDiv.style.display = 'block';
feedsDiv.innerHTML = '<div style="padding: 10px; color: #666;">Loading feeds...</div>';
expandedDomain = rowEl;
try {
const response = await fetch('/api/domainFeeds?host=' + encodeURIComponent(host));
const feeds = await response.json();
if (!feeds || feeds.length === 0) {
feedsDiv.innerHTML = '<div style="padding: 10px; color: #666;">No feeds found</div>';
} else {
feedsDiv.innerHTML = '';
feeds.forEach(f => {
const feedItem = document.createElement('div');
feedItem.className = 'feed-item';
feedItem.style.cssText = 'padding: 5px 10px; border-top: 1px solid #333; cursor: pointer;';
feedItem.innerHTML =
'<div class="feed-header">' +
'<div style="color: #0af;">' + escapeHtml(f.url) + '</div>' +
(f.title ? '<div style="color: #888; font-size: 0.9em;">' + escapeHtml(f.title) + '</div>' : '') +
'<div style="color: #666; font-size: 0.8em;">' + (f.type || 'unknown') + '</div>' +
'</div>' +
'<div class="feed-details" style="display: none;"></div>';
feedItem.querySelector('.feed-header').addEventListener('click', (e) => {
e.stopPropagation();
toggleFeedInfo(f.url, feedItem);
});
feedsDiv.appendChild(feedItem);
});
}
} catch (err) {
feedsDiv.innerHTML = '<div style="padding: 10px; color: #f66;">Error loading feeds</div>';
}
} else {
feedsDiv.style.display = 'none';
expandedDomain = null;
}
}
async function toggleFeedInfo(feedUrl, feedItemEl) {
const detailsDiv = feedItemEl.querySelector('.feed-details');
// Close previously expanded feed
if (expandedFeed && expandedFeed !== feedItemEl) {
expandedFeed.querySelector('.feed-details').style.display = 'none';
}
// Toggle current
if (detailsDiv.style.display === 'none') {
detailsDiv.style.display = 'block';
detailsDiv.innerHTML = '<div style="padding: 10px; color: #666;">Loading feed info...</div>';
expandedFeed = feedItemEl;
// Scroll the feed item to the top of the viewport
feedItemEl.scrollIntoView({ behavior: 'smooth', block: 'start' });
try {
// Fetch feed info and items in parallel
const [infoResponse, itemsResponse] = await Promise.all([
fetch('/api/feedInfo?url=' + encodeURIComponent(feedUrl)),
fetch('/api/feedItems?url=' + encodeURIComponent(feedUrl) + '&limit=50')
]);
const info = await infoResponse.json();
const items = await itemsResponse.json();
let html = '<div style="padding: 10px; background: #1a1a1a; margin-top: 5px; border-radius: 4px; font-size: 0.85em;">';
if (info.description) {
html += '<div style="margin-bottom: 8px; color: #aaa;">' + escapeHtml(info.description) + '</div>';
}
html += '<table style="width: 100%; color: #888;">';
if (info.siteUrl) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Site</td><td>' + escapeHtml(info.siteUrl) + '</td></tr>';
}
if (info.language) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Language</td><td>' + escapeHtml(info.language) + '</td></tr>';
}
if (info.status) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Status</td><td>' + escapeHtml(info.status) + '</td></tr>';
}
if (info.itemCount) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Items</td><td>' + commaFormat(info.itemCount) + '</td></tr>';
}
if (info.avgPostFreqHrs) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Avg Post Freq</td><td>' + info.avgPostFreqHrs.toFixed(1) + ' hrs</td></tr>';
}
if (info.ttlMinutes) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">TTL</td><td>' + info.ttlMinutes + ' min</td></tr>';
}
if (info.updatePeriod) {
let updateStr = info.updatePeriod;
if (info.updateFreq) updateStr += ' (' + info.updateFreq + ')';
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Update</td><td>' + escapeHtml(updateStr) + '</td></tr>';
}
if (info.lastBuildDate) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Last Build</td><td>' + escapeHtml(info.lastBuildDate) + '</td></tr>';
}
if (info.newestItemDate) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Newest Item</td><td>' + escapeHtml(info.newestItemDate) + '</td></tr>';
}
if (info.oldestItemDate) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Oldest Item</td><td>' + escapeHtml(info.oldestItemDate) + '</td></tr>';
}
if (info.discoveredAt) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Discovered</td><td>' + escapeHtml(info.discoveredAt) + '</td></tr>';
}
if (info.lastCrawledAt) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Last Crawled</td><td>' + escapeHtml(info.lastCrawledAt) + '</td></tr>';
}
if (info.errorCount > 0) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Errors</td><td style="color: #f66;">' + info.errorCount + '</td></tr>';
}
if (info.lastError) {
html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Last Error</td><td style="color: #f66;">' + escapeHtml(info.lastError) + '</td></tr>';
}
html += '</table>';
// Display items
if (items && items.length > 0) {
html += '<div style="margin-top: 12px; border-top: 1px solid #333; padding-top: 8px;">';
html += '<div style="color: #666; margin-bottom: 6px; font-weight: bold;">Recent Items (' + items.length + ')</div>';
items.forEach(item => {
html += '<div style="padding: 6px 0; border-bottom: 1px solid #222;">';
// Title with link
if (item.title) {
if (item.link) {
html += '<div><a href="' + escapeHtml(item.link) + '" target="_blank" style="color: #0af; text-decoration: none;">' + escapeHtml(item.title) + '</a></div>';
} else {
html += '<div style="color: #ccc;">' + escapeHtml(item.title) + '</div>';
}
} else if (item.link) {
html += '<div><a href="' + escapeHtml(item.link) + '" target="_blank" style="color: #0af; text-decoration: none;">' + escapeHtml(item.link) + '</a></div>';
}
// Metadata line (date, author)
let meta = [];
if (item.pub_date) {
const date = new Date(item.pub_date);
meta.push(date.toLocaleDateString() + ' ' + date.toLocaleTimeString());
}
if (item.author) {
meta.push(escapeHtml(item.author));
}
if (meta.length > 0) {
html += '<div style="color: #666; font-size: 0.85em;">' + meta.join(' • ') + '</div>';
}
html += '</div>';
});
html += '</div>';
}
html += '</div>';
detailsDiv.innerHTML = html;
} catch (err) {
detailsDiv.innerHTML = '<div style="padding: 10px; color: #f66;">Error loading feed info</div>';
}
} else {
detailsDiv.style.display = 'none';
expandedFeed = null;
}
}
// Infinite scroll handler with prefetch (uses window scroll)
function setupInfiniteScroll() {
window.addEventListener('scroll', () => {
// Check if we're near the bottom of the page
const scrollBottom = window.scrollY + window.innerHeight;
const docHeight = document.documentElement.scrollHeight;
const remainingPixels = docHeight - scrollBottom;
// Prefetch when within 500px of the bottom
if (remainingPixels < 500) {
loadMoreDomains();
}
});
}
// Search functionality
function setupSearch() {
const searchInput = document.getElementById('searchInput');
const searchResults = document.getElementById('searchResults');
const domainsContainer = document.getElementById('allDomainsContainer');
if (!searchInput || !searchResults || !domainsContainer) {
console.error('Search elements not found');
return;
}
searchInput.addEventListener('input', (e) => {
const query = e.target.value.trim();
// Clear previous timeout
if (searchTimeout) {
clearTimeout(searchTimeout);
}
// If empty, show domains list
if (!query) {
searchResults.style.display = 'none';
domainsContainer.style.display = 'block';
isSearching = false;
return;
}
// Debounce search
searchTimeout = setTimeout(() => performSearch(query), 300);
});
// Handle Enter key
searchInput.addEventListener('keydown', (e) => {
if (e.key === 'Enter') {
const query = e.target.value.trim();
if (query) {
if (searchTimeout) clearTimeout(searchTimeout);
performSearch(query);
}
}
});
}
async function performSearch(query) {
const searchResults = document.getElementById('searchResults');
const domainsContainer = document.getElementById('allDomainsContainer');
isSearching = true;
domainsContainer.style.display = 'none';
searchResults.style.display = 'block';
searchResults.innerHTML = '<div style="padding: 20px; color: #666; text-align: center;">Searching...</div>';
try {
const response = await fetch('/api/search?q=' + encodeURIComponent(query) + '&limit=200');
const results = await response.json();
if (!results || results.length === 0) {
searchResults.innerHTML = '<div style="padding: 20px; color: #666; text-align: center;">No results found</div>';
return;
}
// Group results by host
const byHost = {};
results.forEach(r => {
const host = r.feed.source_host || 'unknown';
if (!byHost[host]) {
byHost[host] = [];
}
byHost[host].push(r);
});
// Render results
searchResults.innerHTML = '';
Object.keys(byHost).sort().forEach(host => {
const hostDiv = document.createElement('div');
hostDiv.className = 'search-host';
// Host header
const hostHeader = document.createElement('div');
hostHeader.className = 'stat-row';
hostHeader.style.cssText = 'cursor: pointer; background: #1a1a1a; padding: 8px; margin-bottom: 2px;';
hostHeader.innerHTML = '<span style="color: #0af;">' + escapeHtml(host) + '</span><span style="color: #666;">' + byHost[host].length + ' feed(s)</span>';
const feedsContainer = document.createElement('div');
feedsContainer.style.display = 'block';
byHost[host].forEach(result => {
const feedDiv = document.createElement('div');
feedDiv.className = 'search-feed';
feedDiv.style.cssText = 'padding: 8px 8px 8px 20px; border-bottom: 1px solid #222;';
// Feed header
let feedHtml = '<div style="color: #0af; cursor: pointer;" class="feed-url">' + escapeHtml(result.feed.url) + '</div>';
if (result.feed.title) {
feedHtml += '<div style="color: #aaa; font-size: 0.9em;">' + escapeHtml(result.feed.title) + '</div>';
}
if (result.feed.description) {
feedHtml += '<div style="color: #666; font-size: 0.85em; margin-top: 2px;">' + escapeHtml(result.feed.description.substring(0, 200)) + '</div>';
}
// Items
if (result.items && result.items.length > 0) {
feedHtml += '<div class="search-items" style="margin-top: 8px; padding-left: 10px; border-left: 2px solid #333;">';
result.items.forEach(item => {
feedHtml += '<div style="padding: 4px 0; border-bottom: 1px solid #1a1a1a;">';
if (item.title) {
if (item.link) {
feedHtml += '<a href="' + escapeHtml(item.link) + '" target="_blank" style="color: #6cf; text-decoration: none;">' + escapeHtml(item.title) + '</a>';
} else {
feedHtml += '<span style="color: #ccc;">' + escapeHtml(item.title) + '</span>';
}
}
let meta = [];
if (item.pub_date) {
meta.push(item.pub_date.substring(0, 10));
}
if (item.author) {
meta.push(escapeHtml(item.author));
}
if (meta.length > 0) {
feedHtml += '<div style="color: #555; font-size: 0.8em;">' + meta.join(' • ') + '</div>';
}
feedHtml += '</div>';
});
feedHtml += '</div>';
}
feedDiv.innerHTML = feedHtml;
// Click on feed URL to toggle full feed info
feedDiv.querySelector('.feed-url').addEventListener('click', () => {
toggleSearchFeedInfo(result.feed.url, feedDiv);
});
feedsContainer.appendChild(feedDiv);
});
hostHeader.addEventListener('click', () => {
feedsContainer.style.display = feedsContainer.style.display === 'none' ? 'block' : 'none';
});
hostDiv.appendChild(hostHeader);
hostDiv.appendChild(feedsContainer);
searchResults.appendChild(hostDiv);
});
} catch (err) {
console.error('Search failed:', err);
searchResults.innerHTML = '<div style="padding: 20px; color: #f66; text-align: center;">Search failed: ' + escapeHtml(err.message) + '</div>';
}
}
async function toggleSearchFeedInfo(feedUrl, feedDiv) {
let detailsDiv = feedDiv.querySelector('.feed-details-expanded');
if (detailsDiv) {
detailsDiv.remove();
return;
}
detailsDiv = document.createElement('div');
detailsDiv.className = 'feed-details-expanded';
detailsDiv.style.cssText = 'padding: 10px; background: #111; margin-top: 8px; border-radius: 4px;';
detailsDiv.innerHTML = '<div style="color: #666;">Loading feed info...</div>';
feedDiv.appendChild(detailsDiv);
try {
const [infoResponse, itemsResponse] = await Promise.all([
fetch('/api/feedInfo?url=' + encodeURIComponent(feedUrl)),
fetch('/api/feedItems?url=' + encodeURIComponent(feedUrl) + '&limit=20')
]);
const info = await infoResponse.json();
const items = await itemsResponse.json();
let html = '<table style="width: 100%; color: #888; font-size: 0.85em;">';
if (info.siteUrl) html += '<tr><td style="color: #555; padding: 2px 8px 2px 0;">Site</td><td>' + escapeHtml(info.siteUrl) + '</td></tr>';
if (info.language) html += '<tr><td style="color: #555; padding: 2px 8px 2px 0;">Language</td><td>' + escapeHtml(info.language) + '</td></tr>';
if (info.status) html += '<tr><td style="color: #555; padding: 2px 8px 2px 0;">Status</td><td>' + escapeHtml(info.status) + '</td></tr>';
if (info.itemCount) html += '<tr><td style="color: #555; padding: 2px 8px 2px 0;">Items</td><td>' + commaFormat(info.itemCount) + '</td></tr>';
if (info.avgPostFreqHrs) html += '<tr><td style="color: #555; padding: 2px 8px 2px 0;">Avg Freq</td><td>' + info.avgPostFreqHrs.toFixed(1) + ' hrs</td></tr>';
if (info.newestItemDate) html += '<tr><td style="color: #555; padding: 2px 8px 2px 0;">Newest</td><td>' + escapeHtml(info.newestItemDate) + '</td></tr>';
html += '</table>';
if (items && items.length > 0) {
html += '<div style="margin-top: 10px; border-top: 1px solid #222; padding-top: 8px;">';
html += '<div style="color: #555; margin-bottom: 4px;">All Items (' + items.length + ')</div>';
items.forEach(item => {
html += '<div style="padding: 3px 0; border-bottom: 1px solid #1a1a1a;">';
if (item.title && item.link) {
html += '<a href="' + escapeHtml(item.link) + '" target="_blank" style="color: #0af; text-decoration: none; font-size: 0.9em;">' + escapeHtml(item.title) + '</a>';
} else if (item.title) {
html += '<span style="color: #aaa; font-size: 0.9em;">' + escapeHtml(item.title) + '</span>';
}
html += '</div>';
});
html += '</div>';
}
detailsDiv.innerHTML = html;
} catch (err) {
detailsDiv.innerHTML = '<div style="color: #f66;">Failed to load feed info</div>';
}
}
async function updateStats() {
try {
const response = await fetch('/api/stats');
const stats = await response.json();
// Update domain stats
document.getElementById('totalDomains').textContent = commaFormat(stats.total_domains);
document.getElementById('checkedDomains').textContent = commaFormat(stats.checked_domains);
document.getElementById('uncheckedDomains').textContent = commaFormat(stats.unchecked_domains);
document.getElementById('crawlRate').textContent = commaFormat(stats.crawl_rate);
document.getElementById('checkRate').textContent = commaFormat(stats.check_rate);
// Update progress bar
const progress = stats.total_domains > 0
? (stats.checked_domains * 100 / stats.total_domains).toFixed(1)
: 0;
document.getElementById('crawlProgress').style.width = progress + '%';
// Update feed stats
document.getElementById('totalFeeds').textContent = commaFormat(stats.total_feeds);
document.getElementById('rssFeeds').textContent = commaFormat(stats.rss_feeds);
document.getElementById('atomFeeds').textContent = commaFormat(stats.atom_feeds);
document.getElementById('unknownFeeds').textContent = commaFormat(stats.unknown_feeds);
// Update timestamp
const updatedAt = new Date(stats.updated_at);
document.getElementById('updatedAt').textContent = 'Last updated: ' +
updatedAt.toISOString().replace('T', ' ').substring(0, 19);
} catch (err) {
console.error('Failed to update stats:', err);
}
}
// Initialize
try {
setupSearch();
} catch (e) {
console.error('setupSearch failed:', e);
}
setupInfiniteScroll();
loadMoreDomains();
updateStats();
setInterval(updateStats, 1000);
}
window.onload = initDashboard;