Add Docker support and refactor data layer
@@ -0,0 +1,8 @@
+1440.news
+1440.db
+feeds/
+*.gz
+.git
+.gitignore
+.claude
+CLAUDE.md
@@ -3,3 +3,4 @@ go.*
 *.gz
 feeds/
 feeds.db/
+1440.db
+37
@@ -0,0 +1,37 @@
+FROM golang:1.24-alpine AS builder
+
+WORKDIR /app
+
+# Install build dependencies
+RUN apk add --no-cache gcc musl-dev
+
+# Copy go mod files first for layer caching
+COPY go.mod go.sum ./
+RUN go mod download
+
+# Copy source code
+COPY *.go ./
+COPY static/ ./static/
+
+# Build the binary
+RUN CGO_ENABLED=1 go build -o 1440.news .
+
+# Runtime stage
+FROM alpine:latest
+
+WORKDIR /app
+
+# Install runtime dependencies
+RUN apk add --no-cache ca-certificates tzdata
+
+# Copy binary from builder
+COPY --from=builder /app/1440.news .
+COPY --from=builder /app/static ./static
+
+# Create feeds directory
+RUN mkdir -p feeds
+
+# Expose dashboard port
+EXPOSE 4321
+
+CMD ["./1440.news"]
+106
-65
@@ -1,9 +1,9 @@
 package main
 
 import (
+	"database/sql"
 	"fmt"
 	"io"
-	"math/rand"
 	"net/http"
 	"runtime"
 	"strings"
@@ -11,26 +11,33 @@ import (
 	"sync/atomic"
 	"time"
 
-	"github.com/cockroachdb/pebble"
 	"golang.org/x/net/html"
 )
 
 type Crawler struct {
 	MaxDepth        int
 	MaxPagesPerHost int
 	Timeout         time.Duration
 	UserAgent       string
 	visited         sync.Map
 	feedsMu         sync.Mutex
 	client          *http.Client
 	hostsProcessed  int32
-	db              *pebble.DB
+	feedsChecked       int32
+	startTime          time.Time
+	db                 *sql.DB
+	displayedCrawlRate int
+	displayedCheckRate int
+	domainsImported    int32
+	cachedStats        *DashboardStats
+	cachedAllDomains   []DomainStat
+	statsMu            sync.RWMutex
 }
 
 func NewCrawler(dbPath string) (*Crawler, error) {
-	db, err := pebble.Open(dbPath, &pebble.Options{})
+	db, err := OpenDatabase(dbPath)
 	if err != nil {
-		return nil, fmt.Errorf("failed to open pebble db: %v", err)
+		return nil, fmt.Errorf("failed to open database: %v", err)
 	}
 
 	return &Crawler{
@@ -38,6 +45,7 @@ func NewCrawler(dbPath string) (*Crawler, error) {
 		MaxPagesPerHost: 10,
 		Timeout:         10 * time.Second,
 		UserAgent:       "FeedCrawler/1.0",
+		startTime:       time.Now(),
 		db:              db,
 		client: &http.Client{
 			Timeout: 10 * time.Second,
@@ -58,87 +66,121 @@ func (c *Crawler) Close() error {
 	return nil
 }
 
-// CrawlUncrawledDomains fetches uncrawled domains and crawls them
-func (c *Crawler) CrawlUncrawledDomains() error {
-	domains, err := c.GetUncrawledDomains()
-	if err != nil {
-		return fmt.Errorf("failed to get uncrawled domains: %v", err)
+// StartStatsLoop updates cached stats once per minute
+func (c *Crawler) StartStatsLoop() {
+	for {
+		c.UpdateStats()
+		time.Sleep(1 * time.Minute)
 	}
+}
 
-	if len(domains) == 0 {
-		return nil
+// StartCleanupLoop runs item cleanup once per week
+func (c *Crawler) StartCleanupLoop() {
+	for {
+		deleted, err := c.CleanupOldItems()
+		if err != nil {
+			fmt.Printf("Cleanup error: %v\n", err)
+		} else if deleted > 0 {
+			fmt.Printf("Cleanup: removed %d old items\n", deleted)
+		}
+		time.Sleep(7 * 24 * time.Hour)
 	}
+}
 
-	// Shuffle for randomized crawling
-	rand.Shuffle(len(domains), func(i, j int) {
-		domains[i], domains[j] = domains[j], domains[i]
-	})
-
-	numWorkers := runtime.NumCPU() - 1
+// StartCrawlLoop runs the domain crawling loop independently
+func (c *Crawler) StartCrawlLoop() {
+	numWorkers := runtime.NumCPU()
 	if numWorkers < 1 {
 		numWorkers = 1
 	}
 
-	type crawlResult struct {
-		host       string
-		feedsFound int
-		lastError  string
-	}
-
-	domainChan := make(chan *Domain, numWorkers*2)
-	resultChan := make(chan crawlResult, numWorkers*2)
-	var wg sync.WaitGroup
+	// Buffered channel for domain work
+	workChan := make(chan *Domain, 256)
 
 	// Start workers
 	for i := 0; i < numWorkers; i++ {
-		wg.Add(1)
 		go func() {
-			defer wg.Done()
-			for domain := range domainChan {
+			for domain := range workChan {
 				feedsFound, crawlErr := c.crawlHost(domain.Host)
 				errStr := ""
 				if crawlErr != nil {
 					errStr = crawlErr.Error()
 				}
-				resultChan <- crawlResult{
-					host:       domain.Host,
-					feedsFound: feedsFound,
-					lastError:  errStr,
+				if err := c.markDomainCrawled(domain.Host, feedsFound, errStr); err != nil {
+					fmt.Printf("Error marking domain %s as crawled: %v\n", domain.Host, err)
 				}
 			}
 		}()
 	}
 
-	// Start result processor
-	done := make(chan bool)
-	go func() {
-		for result := range resultChan {
-			if err := c.markDomainCrawled(result.host, result.feedsFound, result.lastError); err != nil {
-				fmt.Printf("Error marking domain %s as crawled: %v\n", result.host, err)
-			}
-		}
-		done <- true
-	}()
-
-	// Send domains to workers
-	for _, domain := range domains {
-		domainChan <- domain
+	const fetchSize = 100
+	for {
+		domains, err := c.GetUncheckedDomainsRandom(fetchSize)
+		if err != nil {
+			fmt.Printf("Error fetching domains: %v\n", err)
+		}
+
+		if len(domains) == 0 {
+			c.displayedCrawlRate = 0
+			time.Sleep(1 * time.Second)
+			continue
+		}
+
+		fmt.Printf("%s crawl: %d domains to check\n", time.Now().Format("15:04:05"), len(domains))
+
+		for _, domain := range domains {
+			workChan <- domain
+		}
+
+		time.Sleep(1 * time.Second)
+	}
+}
+
+// StartCheckLoop runs the feed checking loop independently
+func (c *Crawler) StartCheckLoop() {
+	numWorkers := runtime.NumCPU()
+	if numWorkers < 1 {
+		numWorkers = 1
 	}
 
-	close(domainChan)
-	wg.Wait()
-	close(resultChan)
-	<-done
+	// Buffered channel for feed work
+	workChan := make(chan *Feed, 256)
 
-	return nil
+	// Start workers
+	for i := 0; i < numWorkers; i++ {
+		go func() {
+			for feed := range workChan {
+				c.CheckFeed(feed)
+			}
+		}()
+	}
+
+	const fetchSize = 100
+	for {
+		feeds, err := c.GetFeedsDueForCheck(fetchSize)
+		if err != nil {
+			fmt.Printf("Error fetching feeds: %v\n", err)
+		}
+
+		if len(feeds) == 0 {
+			c.displayedCheckRate = 0
+			time.Sleep(1 * time.Second)
+			continue
+		}
+
+		fmt.Printf("%s check: %d feeds to check\n", time.Now().Format("15:04:05"), len(feeds))
+
+		for _, feed := range feeds {
+			workChan <- feed
+		}
+
+		time.Sleep(1 * time.Second)
+	}
 }
 
 func (c *Crawler) crawlHost(host string) (feedsFound int, err error) {
 	atomic.AddInt32(&c.hostsProcessed, 1)
 
-	// Count feeds before crawling
-	initialCount, _ := c.GetFeedCount()
-
 	localVisited := make(map[string]bool)
 	pagesVisited := 0
 
@@ -148,9 +190,8 @@ func (c *Crawler) crawlHost(host string) (feedsFound int, err error) {
 		c.crawlPage("http://"+host, host, 0, localVisited, &pagesVisited)
 	}
 
-	// Count feeds after crawling
-	finalCount, _ := c.GetFeedCount()
-	feedsFound = finalCount - initialCount
+	// Count feeds found for this specific host
+	feedsFound, _ = c.GetFeedCountByHost(host)
 
 	if pagesVisited == 0 {
 		return feedsFound, fmt.Errorf("could not connect")
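The single CrawlUncrawledDomains pass is replaced by independent long-running loops. main.go is not part of this diff, so the following is only a hypothetical sketch of how the new loops and the dashboard might be wired together; the function names come from this diff, but the database path and listen address are assumptions.

package main

import "log"

func main() {
	// "1440.db" is an assumed path; the real value lives in main.go, which is not shown here.
	c, err := NewCrawler("1440.db")
	if err != nil {
		log.Fatal(err)
	}
	defer c.Close()

	go c.StartStatsLoop()   // refresh cached dashboard stats once per minute
	go c.StartCleanupLoop() // weekly item cleanup
	go c.StartCrawlLoop()   // discover feeds on unchecked domains
	go c.StartCheckLoop()   // re-check known feeds

	// ":4321" matches the EXPOSE 4321 in the Dockerfile, but is still an assumption.
	log.Fatal(c.StartDashboard(":4321"))
}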

+559
-239
@@ -1,21 +1,20 @@
 package main
 
 import (
+	"database/sql"
 	"encoding/json"
 	"fmt"
 	"html/template"
 	"net/http"
-	"sort"
 	"time"
 )
 
 // DashboardStats holds all statistics for the dashboard
 type DashboardStats struct {
 	// Domain stats
 	TotalDomains     int `json:"total_domains"`
-	CrawledDomains   int `json:"crawled_domains"`
-	UncrawledDomains int `json:"uncrawled_domains"`
-	ErrorDomains     int `json:"error_domains"`
+	CheckedDomains   int `json:"checked_domains"`
+	UncheckedDomains int `json:"unchecked_domains"`
 
 	// Feed stats
 	TotalFeeds int `json:"total_feeds"`
@@ -25,16 +24,8 @@ type DashboardStats struct {
 
 	// Crawl progress
 	HostsProcessed int32 `json:"hosts_processed"`
-	CrawlRate      float64 `json:"crawl_rate"` // domains per minute
-
-	// Top TLDs by feed count
-	TopTLDs []TLDStat `json:"top_tlds"`
-
-	// Recent feeds
-	RecentFeeds []RecentFeed `json:"recent_feeds"`
-
-	// Top domains by feed count
-	TopDomains []DomainStat `json:"top_domains"`
+	CrawlRate      int `json:"crawl_rate"` // crawls per minute
+	CheckRate      int `json:"check_rate"` // feed checks per minute
 
 	// Timing
 	UpdatedAt time.Time `json:"updated_at"`
@@ -57,13 +48,107 @@ type DomainStat struct {
|
|||||||
FeedsFound int `json:"feeds_found"`
|
FeedsFound int `json:"feeds_found"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetDashboardStats collects all statistics for the dashboard
|
// commaFormat formats an integer with comma separators
|
||||||
|
func commaFormat(n int) string {
|
||||||
|
s := fmt.Sprintf("%d", n)
|
||||||
|
if len(s) <= 3 {
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
var result []byte
|
||||||
|
for i, c := range s {
|
||||||
|
if i > 0 && (len(s)-i)%3 == 0 {
|
||||||
|
result = append(result, ',')
|
||||||
|
}
|
||||||
|
result = append(result, byte(c))
|
||||||
|
}
|
||||||
|
return string(result)
|
||||||
|
}
|
||||||
|
|
||||||
|
// UpdateStats recalculates and caches dashboard statistics
|
||||||
|
func (c *Crawler) UpdateStats() {
|
||||||
|
fmt.Println("UpdateStats: calculating stats...")
|
||||||
|
stats, err := c.calculateStats()
|
||||||
|
if err != nil {
|
||||||
|
fmt.Printf("UpdateStats: error calculating stats: %v\n", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Cache all domains with feeds (runs in background, so slow query is OK)
|
||||||
|
fmt.Println("UpdateStats: fetching all domains...")
|
||||||
|
allDomains := c.fetchAllDomainsFromDB()
|
||||||
|
fmt.Printf("UpdateStats: got %d domains\n", len(allDomains))
|
||||||
|
|
||||||
|
c.statsMu.Lock()
|
||||||
|
c.cachedStats = stats
|
||||||
|
c.cachedAllDomains = allDomains
|
||||||
|
c.statsMu.Unlock()
|
||||||
|
fmt.Println("UpdateStats: complete")
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *Crawler) fetchAllDomainsFromDB() []DomainStat {
|
||||||
|
rows, err := c.db.Query(`
|
||||||
|
SELECT tld, sourceHost, COUNT(*) as cnt FROM feeds
|
||||||
|
GROUP BY tld, sourceHost
|
||||||
|
ORDER BY tld, sourceHost
|
||||||
|
`)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Printf("fetchAllDomainsFromDB error: %v\n", err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
var domains []DomainStat
|
||||||
|
for rows.Next() {
|
||||||
|
var ds DomainStat
|
||||||
|
var tld string
|
||||||
|
if err := rows.Scan(&tld, &ds.Host, &ds.FeedsFound); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
domains = append(domains, ds)
|
||||||
|
}
|
||||||
|
return domains
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetDashboardStats returns cached statistics (returns empty stats if not yet cached)
|
||||||
func (c *Crawler) GetDashboardStats() (*DashboardStats, error) {
|
func (c *Crawler) GetDashboardStats() (*DashboardStats, error) {
|
||||||
|
c.statsMu.RLock()
|
||||||
|
stats := c.cachedStats
|
||||||
|
c.statsMu.RUnlock()
|
||||||
|
|
||||||
|
if stats != nil {
|
||||||
|
return stats, nil
|
||||||
|
}
|
||||||
|
// Return empty stats while background calculation runs (don't block HTTP requests)
|
||||||
|
return &DashboardStats{UpdatedAt: time.Now()}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// calculateStats collects all statistics for the dashboard
|
||||||
|
func (c *Crawler) calculateStats() (*DashboardStats, error) {
|
||||||
stats := &DashboardStats{
|
stats := &DashboardStats{
|
||||||
UpdatedAt: time.Now(),
|
UpdatedAt: time.Now(),
|
||||||
HostsProcessed: c.hostsProcessed,
|
HostsProcessed: c.hostsProcessed,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Calculate crawl rate (crawls per minute), smoothed by +/-1 per update
|
||||||
|
elapsed := time.Since(c.startTime).Minutes()
|
||||||
|
if elapsed > 0 {
|
||||||
|
actualRate := int(float64(c.hostsProcessed) / elapsed)
|
||||||
|
if actualRate > c.displayedCrawlRate {
|
||||||
|
c.displayedCrawlRate++
|
||||||
|
} else if actualRate < c.displayedCrawlRate {
|
||||||
|
c.displayedCrawlRate--
|
||||||
|
}
|
||||||
|
stats.CrawlRate = c.displayedCrawlRate
|
||||||
|
|
||||||
|
// Calculate check rate (feed checks per minute), smoothed by +/-1 per update
|
||||||
|
actualCheckRate := int(float64(c.feedsChecked) / elapsed)
|
||||||
|
if actualCheckRate > c.displayedCheckRate {
|
||||||
|
c.displayedCheckRate++
|
||||||
|
} else if actualCheckRate < c.displayedCheckRate {
|
||||||
|
c.displayedCheckRate--
|
||||||
|
}
|
||||||
|
stats.CheckRate = c.displayedCheckRate
|
||||||
|
}
|
||||||
|
|
||||||
// Get domain stats
|
// Get domain stats
|
||||||
if err := c.collectDomainStats(stats); err != nil {
|
if err := c.collectDomainStats(stats); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@@ -78,148 +163,455 @@ func (c *Crawler) GetDashboardStats() (*DashboardStats, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (c *Crawler) collectDomainStats(stats *DashboardStats) error {
|
func (c *Crawler) collectDomainStats(stats *DashboardStats) error {
|
||||||
iter, err := c.db.NewIter(nil)
|
// Use MAX(rowid) for fast approximate total count
|
||||||
|
err := c.db.QueryRow("SELECT COALESCE(MAX(rowid), 0) FROM domains").Scan(&stats.TotalDomains)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
defer iter.Close()
|
|
||||||
|
|
||||||
domainFeeds := make(map[string]int)
|
// Single query to get all status counts (one index scan instead of three)
|
||||||
|
rows, err := c.db.Query("SELECT status, COUNT(*) FROM domains GROUP BY status")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
for iter.SeekGE([]byte("domain:")); iter.Valid(); iter.Next() {
|
for rows.Next() {
|
||||||
key := string(iter.Key())
|
var status string
|
||||||
if len(key) < 7 || key[:7] != "domain:" {
|
var count int
|
||||||
break
|
if err := rows.Scan(&status, &count); err != nil {
|
||||||
}
|
|
||||||
|
|
||||||
var domain Domain
|
|
||||||
if err := json.Unmarshal(iter.Value(), &domain); err != nil {
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
switch status {
|
||||||
stats.TotalDomains++
|
case "checked":
|
||||||
switch domain.Status {
|
stats.CheckedDomains = count
|
||||||
case "crawled":
|
case "unchecked":
|
||||||
stats.CrawledDomains++
|
stats.UncheckedDomains = count
|
||||||
if domain.FeedsFound > 0 {
|
|
||||||
domainFeeds[domain.Host] = domain.FeedsFound
|
|
||||||
}
|
|
||||||
case "uncrawled":
|
|
||||||
stats.UncrawledDomains++
|
|
||||||
case "error":
|
|
||||||
stats.ErrorDomains++
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if err := rows.Err(); err != nil {
|
||||||
// Top domains by feed count
|
return err
|
||||||
type kv struct {
|
|
||||||
Host string
|
|
||||||
Count int
|
|
||||||
}
|
|
||||||
var sorted []kv
|
|
||||||
for h, c := range domainFeeds {
|
|
||||||
sorted = append(sorted, kv{h, c})
|
|
||||||
}
|
|
||||||
sort.Slice(sorted, func(i, j int) bool {
|
|
||||||
return sorted[i].Count > sorted[j].Count
|
|
||||||
})
|
|
||||||
for i := 0; i < len(sorted) && i < 10; i++ {
|
|
||||||
stats.TopDomains = append(stats.TopDomains, DomainStat{
|
|
||||||
Host: sorted[i].Host,
|
|
||||||
FeedsFound: sorted[i].Count,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return iter.Error()
|
return rows.Err()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Crawler) collectFeedStats(stats *DashboardStats) error {
|
func (c *Crawler) collectFeedStats(stats *DashboardStats) error {
|
||||||
iter, err := c.db.NewIter(nil)
|
// Use MAX(rowid) for fast approximate total count
|
||||||
|
err := c.db.QueryRow("SELECT COALESCE(MAX(rowid), 0) FROM feeds").Scan(&stats.TotalFeeds)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
defer iter.Close()
|
|
||||||
|
|
||||||
tldCounts := make(map[string]int)
|
// Single query to get all type counts (one index scan instead of three)
|
||||||
var recentFeeds []RecentFeed
|
rows, err := c.db.Query("SELECT type, COUNT(*) FROM feeds GROUP BY type")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
for iter.SeekGE([]byte("feed:")); iter.Valid(); iter.Next() {
|
for rows.Next() {
|
||||||
key := string(iter.Key())
|
var feedType sql.NullString
|
||||||
if len(key) < 5 || key[:5] != "feed:" {
|
var count int
|
||||||
break
|
if err := rows.Scan(&feedType, &count); err != nil {
|
||||||
}
|
|
||||||
|
|
||||||
var feed Feed
|
|
||||||
if err := json.Unmarshal(iter.Value(), &feed); err != nil {
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
switch feedType.String {
|
||||||
stats.TotalFeeds++
|
|
||||||
switch feed.Type {
|
|
||||||
case "rss":
|
case "rss":
|
||||||
stats.RSSFeeds++
|
stats.RSSFeeds = count
|
||||||
case "atom":
|
case "atom":
|
||||||
stats.AtomFeeds++
|
stats.AtomFeeds = count
|
||||||
default:
|
default:
|
||||||
stats.UnknownFeeds++
|
stats.UnknownFeeds += count
|
||||||
}
|
}
|
||||||
|
|
||||||
if feed.TLD != "" {
|
|
||||||
tldCounts[feed.TLD]++
|
|
||||||
}
|
|
||||||
|
|
||||||
recentFeeds = append(recentFeeds, RecentFeed{
|
|
||||||
URL: feed.URL,
|
|
||||||
Title: feed.Title,
|
|
||||||
Type: feed.Type,
|
|
||||||
DiscoveredAt: feed.DiscoveredAt,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
return rows.Err()
|
||||||
// Top TLDs
|
|
||||||
type kv struct {
|
|
||||||
TLD string
|
|
||||||
Count int
|
|
||||||
}
|
|
||||||
var sortedTLDs []kv
|
|
||||||
for t, c := range tldCounts {
|
|
||||||
sortedTLDs = append(sortedTLDs, kv{t, c})
|
|
||||||
}
|
|
||||||
sort.Slice(sortedTLDs, func(i, j int) bool {
|
|
||||||
return sortedTLDs[i].Count > sortedTLDs[j].Count
|
|
||||||
})
|
|
||||||
for i := 0; i < len(sortedTLDs) && i < 10; i++ {
|
|
||||||
stats.TopTLDs = append(stats.TopTLDs, TLDStat{
|
|
||||||
TLD: sortedTLDs[i].TLD,
|
|
||||||
Count: sortedTLDs[i].Count,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// Recent feeds (last 20, sorted by discovery time)
|
|
||||||
sort.Slice(recentFeeds, func(i, j int) bool {
|
|
||||||
return recentFeeds[i].DiscoveredAt.After(recentFeeds[j].DiscoveredAt)
|
|
||||||
})
|
|
||||||
if len(recentFeeds) > 20 {
|
|
||||||
recentFeeds = recentFeeds[:20]
|
|
||||||
}
|
|
||||||
stats.RecentFeeds = recentFeeds
|
|
||||||
|
|
||||||
return iter.Error()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// StartDashboard starts the web dashboard server
|
// StartDashboard starts the web dashboard server
|
||||||
func (c *Crawler) StartDashboard(addr string) error {
|
func (c *Crawler) StartDashboard(addr string) error {
|
||||||
http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
|
http.HandleFunc("/dashboard", func(w http.ResponseWriter, r *http.Request) {
|
||||||
c.handleDashboard(w, r)
|
c.handleDashboard(w, r)
|
||||||
})
|
})
|
||||||
http.HandleFunc("/api/stats", func(w http.ResponseWriter, r *http.Request) {
|
http.HandleFunc("/api/stats", func(w http.ResponseWriter, r *http.Request) {
|
||||||
c.handleAPIStats(w, r)
|
c.handleAPIStats(w, r)
|
||||||
})
|
})
|
||||||
|
http.HandleFunc("/api/allDomains", func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
c.handleAPIAllDomains(w, r)
|
||||||
|
})
|
||||||
|
http.HandleFunc("/api/domainFeeds", func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
c.handleAPIDomainFeeds(w, r)
|
||||||
|
})
|
||||||
|
http.HandleFunc("/api/feedInfo", func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
c.handleAPIFeedInfo(w, r)
|
||||||
|
})
|
||||||
|
http.HandleFunc("/api/feedItems", func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
c.handleAPIFeedItems(w, r)
|
||||||
|
})
|
||||||
|
http.HandleFunc("/api/search", func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
c.handleAPISearch(w, r)
|
||||||
|
})
|
||||||
|
http.HandleFunc("/static/", func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
http.StripPrefix("/static/", http.FileServer(http.Dir("static"))).ServeHTTP(w, r)
|
||||||
|
})
|
||||||
|
|
||||||
fmt.Printf("Dashboard running at http://%s\n", addr)
|
fmt.Printf("Dashboard running at http://%s\n", addr)
|
||||||
return http.ListenAndServe(addr, nil)
|
return http.ListenAndServe(addr, nil)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *Crawler) handleAPIAllDomains(w http.ResponseWriter, r *http.Request) {
|
||||||
|
offset := 0
|
||||||
|
limit := 100
|
||||||
|
if o := r.URL.Query().Get("offset"); o != "" {
|
||||||
|
fmt.Sscanf(o, "%d", &offset)
|
||||||
|
}
|
||||||
|
if l := r.URL.Query().Get("limit"); l != "" {
|
||||||
|
fmt.Sscanf(l, "%d", &limit)
|
||||||
|
if limit > 100 {
|
||||||
|
limit = 100
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Serve from cache (updated once per minute in background)
|
||||||
|
c.statsMu.RLock()
|
||||||
|
cached := c.cachedAllDomains
|
||||||
|
c.statsMu.RUnlock()
|
||||||
|
|
||||||
|
var domains []DomainStat
|
||||||
|
if cached != nil && offset < len(cached) {
|
||||||
|
end := offset + limit
|
||||||
|
if end > len(cached) {
|
||||||
|
end = len(cached)
|
||||||
|
}
|
||||||
|
domains = cached[offset:end]
|
||||||
|
}
|
||||||
|
if domains == nil {
|
||||||
|
domains = []DomainStat{}
|
||||||
|
}
|
||||||
|
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
json.NewEncoder(w).Encode(domains)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) {
|
||||||
|
host := r.URL.Query().Get("host")
|
||||||
|
if host == "" {
|
||||||
|
http.Error(w, "host parameter required", http.StatusBadRequest)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
rows, err := c.db.Query(`
|
||||||
|
SELECT url, title, type FROM feeds
|
||||||
|
WHERE sourceHost = ?
|
||||||
|
ORDER BY url ASC
|
||||||
|
LIMIT 1000
|
||||||
|
`, host)
|
||||||
|
if err != nil {
|
||||||
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
type FeedInfo struct {
|
||||||
|
URL string `json:"url"`
|
||||||
|
Title string `json:"title"`
|
||||||
|
Type string `json:"type"`
|
||||||
|
}
|
||||||
|
|
||||||
|
var feeds []FeedInfo
|
||||||
|
for rows.Next() {
|
||||||
|
var f FeedInfo
|
||||||
|
var title sql.NullString
|
||||||
|
if err := rows.Scan(&f.URL, &title, &f.Type); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if title.Valid {
|
||||||
|
f.Title = title.String
|
||||||
|
}
|
||||||
|
feeds = append(feeds, f)
|
||||||
|
}
|
||||||
|
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
json.NewEncoder(w).Encode(feeds)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) {
|
||||||
|
feedURL := r.URL.Query().Get("url")
|
||||||
|
if feedURL == "" {
|
||||||
|
http.Error(w, "url parameter required", http.StatusBadRequest)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
type FeedDetails struct {
|
||||||
|
URL string `json:"url"`
|
||||||
|
Type string `json:"type,omitempty"`
|
||||||
|
Title string `json:"title,omitempty"`
|
||||||
|
Description string `json:"description,omitempty"`
|
||||||
|
Language string `json:"language,omitempty"`
|
||||||
|
SiteURL string `json:"siteUrl,omitempty"`
|
||||||
|
DiscoveredAt string `json:"discoveredAt,omitempty"`
|
||||||
|
LastCrawledAt string `json:"lastCrawledAt,omitempty"`
|
||||||
|
LastBuildDate string `json:"lastBuildDate,omitempty"`
|
||||||
|
TTLMinutes int `json:"ttlMinutes,omitempty"`
|
||||||
|
UpdatePeriod string `json:"updatePeriod,omitempty"`
|
||||||
|
UpdateFreq int `json:"updateFreq,omitempty"`
|
||||||
|
Status string `json:"status,omitempty"`
|
||||||
|
ErrorCount int `json:"errorCount,omitempty"`
|
||||||
|
LastError string `json:"lastError,omitempty"`
|
||||||
|
ItemCount int `json:"itemCount,omitempty"`
|
||||||
|
AvgPostFreqHrs float64 `json:"avgPostFreqHrs,omitempty"`
|
||||||
|
OldestItemDate string `json:"oldestItemDate,omitempty"`
|
||||||
|
NewestItemDate string `json:"newestItemDate,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
var f FeedDetails
|
||||||
|
var title, description, language, siteUrl, lastCrawledAt, lastBuildDate sql.NullString
|
||||||
|
var updatePeriod, status, lastError, oldestItemDate, newestItemDate sql.NullString
|
||||||
|
var ttlMinutes, updateFreq, errorCount, itemCount sql.NullInt64
|
||||||
|
var avgPostFreqHrs sql.NullFloat64
|
||||||
|
|
||||||
|
err := c.db.QueryRow(`
|
||||||
|
SELECT url, type, title, description, language, siteUrl,
|
||||||
|
discoveredAt, lastCrawledAt, lastBuildDate,
|
||||||
|
ttlMinutes, updatePeriod, updateFreq,
|
||||||
|
status, errorCount, lastError,
|
||||||
|
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate
|
||||||
|
FROM feeds WHERE url = ?
|
||||||
|
`, feedURL).Scan(
|
||||||
|
&f.URL, &f.Type, &title, &description, &language, &siteUrl,
|
||||||
|
&f.DiscoveredAt, &lastCrawledAt, &lastBuildDate,
|
||||||
|
&ttlMinutes, &updatePeriod, &updateFreq,
|
||||||
|
&status, &errorCount, &lastError,
|
||||||
|
&itemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
|
||||||
|
)
|
||||||
|
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
http.Error(w, "feed not found", http.StatusNotFound)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if title.Valid {
|
||||||
|
f.Title = title.String
|
||||||
|
}
|
||||||
|
if description.Valid {
|
||||||
|
f.Description = description.String
|
||||||
|
}
|
||||||
|
if language.Valid {
|
||||||
|
f.Language = language.String
|
||||||
|
}
|
||||||
|
if siteUrl.Valid {
|
||||||
|
f.SiteURL = siteUrl.String
|
||||||
|
}
|
||||||
|
if lastCrawledAt.Valid {
|
||||||
|
f.LastCrawledAt = lastCrawledAt.String
|
||||||
|
}
|
||||||
|
if lastBuildDate.Valid {
|
||||||
|
f.LastBuildDate = lastBuildDate.String
|
||||||
|
}
|
||||||
|
if ttlMinutes.Valid {
|
||||||
|
f.TTLMinutes = int(ttlMinutes.Int64)
|
||||||
|
}
|
||||||
|
if updatePeriod.Valid {
|
||||||
|
f.UpdatePeriod = updatePeriod.String
|
||||||
|
}
|
||||||
|
if updateFreq.Valid {
|
||||||
|
f.UpdateFreq = int(updateFreq.Int64)
|
||||||
|
}
|
||||||
|
if status.Valid {
|
||||||
|
f.Status = status.String
|
||||||
|
}
|
||||||
|
if errorCount.Valid {
|
||||||
|
f.ErrorCount = int(errorCount.Int64)
|
||||||
|
}
|
||||||
|
if lastError.Valid {
|
||||||
|
f.LastError = lastError.String
|
||||||
|
}
|
||||||
|
if itemCount.Valid {
|
||||||
|
f.ItemCount = int(itemCount.Int64)
|
||||||
|
}
|
||||||
|
if avgPostFreqHrs.Valid {
|
||||||
|
f.AvgPostFreqHrs = avgPostFreqHrs.Float64
|
||||||
|
}
|
||||||
|
if oldestItemDate.Valid {
|
||||||
|
f.OldestItemDate = oldestItemDate.String
|
||||||
|
}
|
||||||
|
if newestItemDate.Valid {
|
||||||
|
f.NewestItemDate = newestItemDate.String
|
||||||
|
}
|
||||||
|
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
json.NewEncoder(w).Encode(f)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *Crawler) handleAPIFeedItems(w http.ResponseWriter, r *http.Request) {
|
||||||
|
feedURL := r.URL.Query().Get("url")
|
||||||
|
if feedURL == "" {
|
||||||
|
http.Error(w, "url parameter required", http.StatusBadRequest)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
limit := 50
|
||||||
|
if l := r.URL.Query().Get("limit"); l != "" {
|
||||||
|
fmt.Sscanf(l, "%d", &limit)
|
||||||
|
if limit > 100 {
|
||||||
|
limit = 100
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
items, err := c.GetItemsByFeed(feedURL, limit)
|
||||||
|
if err != nil {
|
||||||
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if items == nil {
|
||||||
|
items = []*Item{}
|
||||||
|
}
|
||||||
|
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
json.NewEncoder(w).Encode(items)
|
||||||
|
}
|
||||||
|
|
||||||
|
// SearchResult represents a search result with feed and matching items
|
||||||
|
type SearchResult struct {
|
||||||
|
Feed SearchFeed `json:"feed"`
|
||||||
|
Items []SearchItem `json:"items"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type SearchFeed struct {
|
||||||
|
URL string `json:"url"`
|
||||||
|
Title string `json:"title"`
|
||||||
|
Description string `json:"description"`
|
||||||
|
Type string `json:"type"`
|
||||||
|
SourceHost string `json:"source_host"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type SearchItem struct {
|
||||||
|
ID int64 `json:"id"`
|
||||||
|
Title string `json:"title"`
|
||||||
|
Link string `json:"link"`
|
||||||
|
Description string `json:"description"`
|
||||||
|
Author string `json:"author"`
|
||||||
|
PubDate string `json:"pub_date"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
|
||||||
|
query := r.URL.Query().Get("q")
|
||||||
|
if query == "" {
|
||||||
|
http.Error(w, "q parameter required", http.StatusBadRequest)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
limit := 100
|
||||||
|
if l := r.URL.Query().Get("limit"); l != "" {
|
||||||
|
fmt.Sscanf(l, "%d", &limit)
|
||||||
|
if limit > 500 {
|
||||||
|
limit = 500
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Results map: feedURL -> SearchResult
|
||||||
|
results := make(map[string]*SearchResult)
|
||||||
|
|
||||||
|
// Search feeds
|
||||||
|
feedRows, err := c.db.Query(`
|
||||||
|
SELECT f.url, f.title, f.description, f.type, f.sourceHost, f.status
|
||||||
|
FROM feeds f
|
||||||
|
JOIN feeds_fts fts ON f.rowid = fts.rowid
|
||||||
|
WHERE feeds_fts MATCH ?
|
||||||
|
LIMIT ?
|
||||||
|
`, query, limit)
|
||||||
|
if err == nil {
|
||||||
|
defer feedRows.Close()
|
||||||
|
for feedRows.Next() {
|
||||||
|
var url string
|
||||||
|
var title, description, feedType, sourceHost, status sql.NullString
|
||||||
|
if err := feedRows.Scan(&url, &title, &description, &feedType, &sourceHost, &status); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
results[url] = &SearchResult{
|
||||||
|
Feed: SearchFeed{
|
||||||
|
URL: url,
|
||||||
|
Title: title.String,
|
||||||
|
Description: description.String,
|
||||||
|
Type: feedType.String,
|
||||||
|
SourceHost: sourceHost.String,
|
||||||
|
Status: status.String,
|
||||||
|
},
|
||||||
|
Items: []SearchItem{},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Search items
|
||||||
|
itemRows, err := c.db.Query(`
|
||||||
|
SELECT i.id, i.feedUrl, i.title, i.link, i.description, i.author, i.pubDate
|
||||||
|
FROM items i
|
||||||
|
JOIN items_fts fts ON i.id = fts.rowid
|
||||||
|
WHERE items_fts MATCH ?
|
||||||
|
ORDER BY i.pubDate DESC
|
||||||
|
LIMIT ?
|
||||||
|
`, query, limit)
|
||||||
|
if err == nil {
|
||||||
|
defer itemRows.Close()
|
||||||
|
for itemRows.Next() {
|
||||||
|
var id int64
|
||||||
|
var feedUrl string
|
||||||
|
var title, link, description, author, pubDate sql.NullString
|
||||||
|
if err := itemRows.Scan(&id, &feedUrl, &title, &link, &description, &author, &pubDate); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
item := SearchItem{
|
||||||
|
ID: id,
|
||||||
|
Title: title.String,
|
||||||
|
Link: link.String,
|
||||||
|
Description: description.String,
|
||||||
|
Author: author.String,
|
||||||
|
PubDate: pubDate.String,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add to existing result or create new one
|
||||||
|
if result, exists := results[feedUrl]; exists {
|
||||||
|
result.Items = append(result.Items, item)
|
||||||
|
} else {
|
||||||
|
// Fetch feed info for this item's feed
|
||||||
|
var fTitle, fDesc, fType, fHost, fStatus sql.NullString
|
||||||
|
c.db.QueryRow(`
|
||||||
|
SELECT title, description, type, sourceHost, status
|
||||||
|
FROM feeds WHERE url = ?
|
||||||
|
`, feedUrl).Scan(&fTitle, &fDesc, &fType, &fHost, &fStatus)
|
||||||
|
|
||||||
|
results[feedUrl] = &SearchResult{
|
||||||
|
Feed: SearchFeed{
|
||||||
|
URL: feedUrl,
|
||||||
|
Title: fTitle.String,
|
||||||
|
Description: fDesc.String,
|
||||||
|
Type: fType.String,
|
||||||
|
SourceHost: fHost.String,
|
||||||
|
Status: fStatus.String,
|
||||||
|
},
|
||||||
|
Items: []SearchItem{item},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert map to slice
|
||||||
|
var resultList []SearchResult
|
||||||
|
for _, r := range results {
|
||||||
|
resultList = append(resultList, *r)
|
||||||
|
}
|
||||||
|
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
json.NewEncoder(w).Encode(resultList)
|
||||||
|
}
|
||||||
|
|
||||||
func (c *Crawler) handleDashboard(w http.ResponseWriter, r *http.Request) {
|
func (c *Crawler) handleDashboard(w http.ResponseWriter, r *http.Request) {
|
||||||
stats, err := c.GetDashboardStats()
|
stats, err := c.GetDashboardStats()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -228,14 +620,28 @@ func (c *Crawler) handleDashboard(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
funcMap := template.FuncMap{
|
funcMap := template.FuncMap{
|
||||||
"divf": func(a, b int) float64 {
|
"pct": func(a, b int) float64 {
|
||||||
if b == 0 {
|
if b == 0 {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
return float64(a) / float64(b)
|
return float64(a) * 100.0 / float64(b)
|
||||||
},
|
},
|
||||||
"mulf": func(a int, b float64) float64 {
|
"comma": func(n interface{}) string {
|
||||||
return float64(a) * b
|
var val int
|
||||||
|
switch v := n.(type) {
|
||||||
|
case int:
|
||||||
|
val = v
|
||||||
|
case int32:
|
||||||
|
val = int(v)
|
||||||
|
case int64:
|
||||||
|
val = int(v)
|
||||||
|
default:
|
||||||
|
return "0"
|
||||||
|
}
|
||||||
|
if val < 0 {
|
||||||
|
return "-" + commaFormat(-val)
|
||||||
|
}
|
||||||
|
return commaFormat(val)
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -265,58 +671,8 @@ const dashboardHTML = `<!DOCTYPE html>
|
|||||||
<head>
|
<head>
|
||||||
<title>1440.news Feed Crawler</title>
|
<title>1440.news Feed Crawler</title>
|
||||||
<meta charset="utf-8">
|
<meta charset="utf-8">
|
||||||
<meta http-equiv="refresh" content="5">
|
<link rel="stylesheet" href="/static/dashboard.css">
|
||||||
<style>
|
<script src="/static/dashboard.js"></script>
|
||||||
* { box-sizing: border-box; margin: 0; padding: 0; }
|
|
||||||
body {
|
|
||||||
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, monospace;
|
|
||||||
background: #0a0a0a;
|
|
||||||
color: #e0e0e0;
|
|
||||||
padding: 20px;
|
|
||||||
line-height: 1.6;
|
|
||||||
}
|
|
||||||
h1 { color: #fff; margin-bottom: 20px; font-size: 24px; }
|
|
||||||
h2 { color: #888; margin: 20px 0 10px; font-size: 14px; text-transform: uppercase; letter-spacing: 1px; }
|
|
||||||
.grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin-bottom: 20px; }
|
|
||||||
.card {
|
|
||||||
background: #151515;
|
|
||||||
border: 1px solid #252525;
|
|
||||||
border-radius: 8px;
|
|
||||||
padding: 15px;
|
|
||||||
}
|
|
||||||
.stat-value { font-size: 32px; font-weight: bold; color: #fff; }
|
|
||||||
.stat-label { font-size: 12px; color: #666; text-transform: uppercase; }
|
|
||||||
.stat-row { display: flex; justify-content: space-between; padding: 5px 0; border-bottom: 1px solid #202020; }
|
|
||||||
.stat-row:last-child { border-bottom: none; }
|
|
||||||
.progress-bar {
|
|
||||||
background: #202020;
|
|
||||||
border-radius: 4px;
|
|
||||||
height: 8px;
|
|
||||||
margin-top: 10px;
|
|
||||||
overflow: hidden;
|
|
||||||
}
|
|
||||||
.progress-fill {
|
|
||||||
background: linear-gradient(90deg, #00aa55, #00cc66);
|
|
||||||
height: 100%;
|
|
||||||
transition: width 0.3s;
|
|
||||||
}
|
|
||||||
table { width: 100%; border-collapse: collapse; }
|
|
||||||
th, td { text-align: left; padding: 8px; border-bottom: 1px solid #202020; }
|
|
||||||
th { color: #666; font-size: 11px; text-transform: uppercase; }
|
|
||||||
td { font-size: 13px; }
|
|
||||||
.type-rss { color: #f90; }
|
|
||||||
.type-atom { color: #09f; }
|
|
||||||
.type-unknown { color: #666; }
|
|
||||||
.url {
|
|
||||||
max-width: 400px;
|
|
||||||
overflow: hidden;
|
|
||||||
text-overflow: ellipsis;
|
|
||||||
white-space: nowrap;
|
|
||||||
color: #4a9eff;
|
|
||||||
}
|
|
||||||
.time { color: #666; font-size: 12px; }
|
|
||||||
.updated { color: #444; font-size: 11px; text-align: right; margin-top: 20px; }
|
|
||||||
</style>
|
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<h1>1440.news Feed Crawler</h1>
|
<h1>1440.news Feed Crawler</h1>
|
||||||
@@ -324,99 +680,63 @@ const dashboardHTML = `<!DOCTYPE html>
|
|||||||
<h2>Crawl Progress</h2>
|
<h2>Crawl Progress</h2>
|
||||||
<div class="grid">
|
<div class="grid">
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<div class="stat-value">{{.TotalDomains}}</div>
|
<div class="stat-value" id="totalDomains">{{comma .TotalDomains}}</div>
|
||||||
<div class="stat-label">Total Domains</div>
|
<div class="stat-label">Domains</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<div class="stat-value">{{.CrawledDomains}}</div>
|
<div class="stat-value" id="checkedDomains">{{comma .CheckedDomains}}</div>
|
||||||
<div class="stat-label">Crawled</div>
|
<div class="stat-label">Checked</div>
|
||||||
{{if .TotalDomains}}
|
|
||||||
<div class="progress-bar">
|
<div class="progress-bar">
|
||||||
<div class="progress-fill" style="width: {{printf "%.1f" (divf (mulf .CrawledDomains 100.0) .TotalDomains)}}%"></div>
|
<div class="progress-fill" id="crawlProgress" style="width: {{printf "%.1f" (pct .CheckedDomains .TotalDomains)}}%"></div>
|
||||||
</div>
|
</div>
|
||||||
{{end}}
|
|
||||||
</div>
|
</div>
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<div class="stat-value">{{.UncrawledDomains}}</div>
|
<div class="stat-value" id="uncheckedDomains">{{comma .UncheckedDomains}}</div>
|
||||||
<div class="stat-label">Uncrawled</div>
|
<div class="stat-label">Unchecked</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<div class="stat-value">{{.ErrorDomains}}</div>
|
<div class="stat-value" id="crawlRate">{{comma .CrawlRate}}</div>
|
||||||
<div class="stat-label">Errors</div>
|
<div class="stat-label">crawls per min</div>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<div class="stat-value" id="checkRate">{{comma .CheckRate}}</div>
|
||||||
|
<div class="stat-label">checks per min</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<h2>Feeds Discovered</h2>
|
<h2>Feeds Discovered</h2>
|
||||||
<div class="grid">
|
<div class="grid">
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<div class="stat-value">{{.TotalFeeds}}</div>
|
<div class="stat-value" id="totalFeeds">{{comma .TotalFeeds}}</div>
|
||||||
<div class="stat-label">Total Feeds</div>
|
<div class="stat-label">Total Feeds</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<div class="stat-value" style="color: #f90">{{.RSSFeeds}}</div>
|
<div class="stat-value" style="color: #f90" id="rssFeeds">{{comma .RSSFeeds}}</div>
|
||||||
<div class="stat-label">RSS Feeds</div>
|
<div class="stat-label">RSS Feeds</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<div class="stat-value" style="color: #09f">{{.AtomFeeds}}</div>
|
<div class="stat-value" style="color: #09f" id="atomFeeds">{{comma .AtomFeeds}}</div>
|
||||||
<div class="stat-label">Atom Feeds</div>
|
<div class="stat-label">Atom Feeds</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<div class="stat-value" style="color: #666">{{.UnknownFeeds}}</div>
|
<div class="stat-value" style="color: #666" id="unknownFeeds">{{comma .UnknownFeeds}}</div>
|
||||||
<div class="stat-label">Unknown Type</div>
|
<div class="stat-label">Unknown Type</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="grid" style="grid-template-columns: 1fr 1fr;">
|
|
||||||
<div class="card">
|
|
||||||
<h2 style="margin-top: 0;">Top TLDs</h2>
|
|
||||||
{{range .TopTLDs}}
|
|
||||||
<div class="stat-row">
|
|
||||||
<span>.{{.TLD}}</span>
|
|
||||||
<span>{{.Count}}</span>
|
|
||||||
</div>
|
|
||||||
{{else}}
|
|
||||||
<div style="color: #444;">No data yet</div>
|
|
||||||
{{end}}
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<h2 style="margin-top: 0;">Top Domains</h2>
|
|
||||||
{{range .TopDomains}}
|
|
||||||
<div class="stat-row">
|
|
||||||
<span>{{.Host}}</span>
|
|
||||||
<span>{{.FeedsFound}}</span>
|
|
||||||
</div>
|
|
||||||
{{else}}
|
|
||||||
<div style="color: #444;">No data yet</div>
|
|
||||||
{{end}}
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<h2>Recent Feeds</h2>
|
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<table>
|
<h2 style="margin-top: 0;">Feeds</h2>
|
||||||
<thead>
|
<div style="margin-bottom: 15px;">
|
||||||
<tr>
|
<input type="text" id="searchInput" placeholder="Search feeds and items..."
|
||||||
<th>URL</th>
|
style="width: 100%; padding: 10px; background: #0a0a0a; border: 1px solid #333; border-radius: 4px; color: #fff; font-size: 14px;">
|
||||||
<th>Title</th>
|
</div>
|
||||||
<th>Type</th>
|
<div id="searchResults" style="display: none;"></div>
|
||||||
<th>Discovered</th>
|
<div id="allDomainsContainer">
|
||||||
</tr>
|
<div id="allDomains"></div>
|
||||||
</thead>
|
<div id="allDomainsLoading" style="text-align: center; padding: 10px; color: #666;">Loading...</div>
|
||||||
<tbody>
|
</div>
|
||||||
{{range .RecentFeeds}}
|
|
||||||
<tr>
|
|
||||||
<td class="url">{{.URL}}</td>
|
|
||||||
<td>{{if .Title}}{{.Title}}{{else}}-{{end}}</td>
|
|
||||||
<td class="type-{{.Type}}">{{.Type}}</td>
|
|
||||||
<td class="time">{{.DiscoveredAt.Format "15:04:05"}}</td>
|
|
||||||
</tr>
|
|
||||||
{{else}}
|
|
||||||
<tr><td colspan="4" style="color: #444;">No feeds discovered yet</td></tr>
|
|
||||||
{{end}}
|
|
||||||
</tbody>
|
|
||||||
</table>
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="updated">Last updated: {{.UpdatedAt.Format "2006-01-02 15:04:05"}}</div>
|
<div class="updated" id="updatedAt">Last updated: {{.UpdatedAt.Format "2006-01-02 15:04:05"}}</div>
|
||||||
</body>
|
</body>
|
||||||
</html>`
|
</html>`
|
||||||
|

@@ -0,0 +1,192 @@
+package main
+
+import (
+	"database/sql"
+	"fmt"
+
+	_ "modernc.org/sqlite"
+)
+
+const schema = `
+CREATE TABLE IF NOT EXISTS domains (
+	host TEXT PRIMARY KEY,
+	status TEXT NOT NULL DEFAULT 'unchecked',
+	discoveredAt DATETIME NOT NULL,
+	lastCrawledAt DATETIME,
+	feedsFound INTEGER DEFAULT 0,
+	lastError TEXT,
+	tld TEXT
+);
+
+CREATE INDEX IF NOT EXISTS idx_domains_status ON domains(status);
+CREATE INDEX IF NOT EXISTS idx_domains_tld ON domains(tld);
+CREATE INDEX IF NOT EXISTS idx_domains_feedsFound ON domains(feedsFound DESC) WHERE feedsFound > 0;
+
+CREATE TABLE IF NOT EXISTS feeds (
+	url TEXT PRIMARY KEY,
+	type TEXT,
+	title TEXT,
+	description TEXT,
+	language TEXT,
+	siteUrl TEXT,
+
+	discoveredAt DATETIME NOT NULL,
+	lastCrawledAt DATETIME,
+	nextCrawlAt DATETIME,
+	lastBuildDate DATETIME,
+
+	etag TEXT,
+	lastModified TEXT,
+
+	ttlMinutes INTEGER,
+	updatePeriod TEXT,
+	updateFreq INTEGER,
+
+	status TEXT DEFAULT 'active',
+	errorCount INTEGER DEFAULT 0,
+	lastError TEXT,
+	lastErrorAt DATETIME,
+
+	sourceUrl TEXT,
+	sourceHost TEXT,
+	tld TEXT,
+
+	itemCount INTEGER,
+	avgPostFreqHrs REAL,
+	oldestItemDate DATETIME,
+	newestItemDate DATETIME,
+
+	noUpdate INTEGER DEFAULT 0
+);
+
+CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost ON feeds(sourceHost);
+CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost_url ON feeds(sourceHost, url);
+CREATE INDEX IF NOT EXISTS idx_feeds_tld ON feeds(tld);
+CREATE INDEX IF NOT EXISTS idx_feeds_tld_sourceHost ON feeds(tld, sourceHost);
+CREATE INDEX IF NOT EXISTS idx_feeds_type ON feeds(type);
+CREATE INDEX IF NOT EXISTS idx_feeds_status ON feeds(status);
+CREATE INDEX IF NOT EXISTS idx_feeds_discoveredAt ON feeds(discoveredAt);
+CREATE INDEX IF NOT EXISTS idx_feeds_title ON feeds(title);
+
+CREATE TABLE IF NOT EXISTS items (
+	id INTEGER PRIMARY KEY AUTOINCREMENT,
+	feedUrl TEXT NOT NULL,
+	guid TEXT,
+	title TEXT,
+	link TEXT,
+	description TEXT,
+	content TEXT,
+	author TEXT,
+	pubDate DATETIME,
+	discoveredAt DATETIME NOT NULL,
+	updatedAt DATETIME,
+	UNIQUE(feedUrl, guid)
+);
+
+CREATE INDEX IF NOT EXISTS idx_items_feedUrl ON items(feedUrl);
+CREATE INDEX IF NOT EXISTS idx_items_pubDate ON items(pubDate DESC);
+CREATE INDEX IF NOT EXISTS idx_items_link ON items(link);
+CREATE INDEX IF NOT EXISTS idx_items_feedUrl_pubDate ON items(feedUrl, pubDate DESC);
+
+-- Full-text search for feeds
+CREATE VIRTUAL TABLE IF NOT EXISTS feeds_fts USING fts5(
+	url,
+	title,
+	description,
+	content='feeds',
+	content_rowid='rowid'
+);
+
+-- Triggers to keep FTS in sync
+CREATE TRIGGER IF NOT EXISTS feeds_ai AFTER INSERT ON feeds BEGIN
+	INSERT INTO feeds_fts(rowid, url, title, description)
+	VALUES (NEW.rowid, NEW.url, NEW.title, NEW.description);
+END;
+
+CREATE TRIGGER IF NOT EXISTS feeds_ad AFTER DELETE ON feeds BEGIN
+	INSERT INTO feeds_fts(feeds_fts, rowid, url, title, description)
+	VALUES ('delete', OLD.rowid, OLD.url, OLD.title, OLD.description);
+END;
+
+CREATE TRIGGER IF NOT EXISTS feeds_au AFTER UPDATE ON feeds BEGIN
+	INSERT INTO feeds_fts(feeds_fts, rowid, url, title, description)
+	VALUES ('delete', OLD.rowid, OLD.url, OLD.title, OLD.description);
+	INSERT INTO feeds_fts(rowid, url, title, description)
+	VALUES (NEW.rowid, NEW.url, NEW.title, NEW.description);
+END;
+
+-- Full-text search for items
+CREATE VIRTUAL TABLE IF NOT EXISTS items_fts USING fts5(
+	title,
+	description,
+	content,
+	author,
+	content='items',
+	content_rowid='id'
+);
+
+-- Triggers to keep items FTS in sync
+CREATE TRIGGER IF NOT EXISTS items_ai AFTER INSERT ON items BEGIN
+	INSERT INTO items_fts(rowid, title, description, content, author)
+	VALUES (NEW.id, NEW.title, NEW.description, NEW.content, NEW.author);
+END;
+
+CREATE TRIGGER IF NOT EXISTS items_ad AFTER DELETE ON items BEGIN
+	INSERT INTO items_fts(items_fts, rowid, title, description, content, author)
+	VALUES ('delete', OLD.id, OLD.title, OLD.description, OLD.content, OLD.author);
+END;
+
+CREATE TRIGGER IF NOT EXISTS items_au AFTER UPDATE ON items BEGIN
+	INSERT INTO items_fts(items_fts, rowid, title, description, content, author)
+	VALUES ('delete', OLD.id, OLD.title, OLD.description, OLD.content, OLD.author);
+	INSERT INTO items_fts(rowid, title, description, content, author)
+	VALUES (NEW.id, NEW.title, NEW.description, NEW.content, NEW.author);
+END;
+`
+
+func OpenDatabase(dbPath string) (*sql.DB, error) {
+	fmt.Printf("Opening database: %s\n", dbPath)
+
+	// Use pragmas in connection string for consistent application
+	connStr := dbPath + "?_pragma=busy_timeout(10000)&_pragma=journal_mode(WAL)&_pragma=foreign_keys(ON)"
+	db, err := sql.Open("sqlite", connStr)
+	if err != nil {
+		return nil, fmt.Errorf("failed to open database: %v", err)
+	}
+
+	// Allow multiple readers (WAL mode supports concurrent reads)
+	// SQLite is single-writer, but reads can happen concurrently
+	db.SetMaxOpenConns(4)
+
+	// Verify connection and show journal mode
+	var journalMode string
+	if err := db.QueryRow("PRAGMA journal_mode").Scan(&journalMode); err != nil {
+		fmt.Printf(" Warning: could not query journal_mode: %v\n", err)
+	} else {
+		fmt.Printf(" Journal mode: %s\n", journalMode)
+	}
+
+	// Create schema
+	if _, err := db.Exec(schema); err != nil {
+		db.Close()
+		return nil, fmt.Errorf("failed to create schema: %v", err)
+	}
+	fmt.Println(" Schema OK")
+
+	// Run stats and ANALYZE in background to avoid blocking startup with large databases
+	go func() {
+		var domainCount, feedCount int
+		db.QueryRow("SELECT COUNT(*) FROM domains").Scan(&domainCount)
+		db.QueryRow("SELECT COUNT(*) FROM feeds").Scan(&feedCount)
+		fmt.Printf(" Existing data: %d domains, %d feeds\n", domainCount, feedCount)
+
+		fmt.Println(" Running ANALYZE...")
+		if _, err := db.Exec("ANALYZE"); err != nil {
+			fmt.Printf(" Warning: ANALYZE failed: %v\n", err)
+		} else {
+			fmt.Println(" ANALYZE complete")
+		}
+	}()
+
+	return db, nil
+}
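feeds_fts and items_fts are external-content FTS5 tables kept in sync by the triggers above, so searches join back to the base table by rowid; that is what handleAPISearch does on the dashboard side. A minimal standalone sketch of such a query against the schema above (the function itself is illustrative, not part of the commit):

package main

import "database/sql"

// searchFeedURLs returns the URLs of feeds whose url/title/description match an FTS5 query.
// It assumes OpenDatabase from this commit has already created the schema.
func searchFeedURLs(db *sql.DB, query string, limit int) ([]string, error) {
	rows, err := db.Query(`
		SELECT f.url
		FROM feeds f
		JOIN feeds_fts fts ON f.rowid = fts.rowid
		WHERE feeds_fts MATCH ?
		LIMIT ?`, query, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var urls []string
	for rows.Next() {
		var u string
		if err := rows.Scan(&u); err != nil {
			return nil, err
		}
		urls = append(urls, u)
	}
	return urls, rows.Err()
}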
@@ -3,20 +3,19 @@ package main
 import (
 	"bufio"
 	"compress/gzip"
-	"encoding/json"
+	"database/sql"
 	"fmt"
 	"io"
 	"os"
 	"strings"
+	"sync/atomic"
 	"time"
 
-	"github.com/cockroachdb/pebble"
 )
 
 // Domain represents a host to be crawled for feeds
 type Domain struct {
-	Host          string    `json:"host"`   // Normalized hostname (no scheme, no www.)
-	Status        string    `json:"status"` // "uncrawled", "crawled", "error"
+	Host          string    `json:"host"`
+	Status        string    `json:"status"`
 	DiscoveredAt  time.Time `json:"discovered_at"`
 	LastCrawledAt time.Time `json:"last_crawled_at,omitempty"`
 	FeedsFound    int       `json:"feeds_found,omitempty"`
@@ -24,130 +23,162 @@ type Domain struct {
 	TLD string `json:"tld,omitempty"`
 }
 
-// saveDomain stores a domain in PebbleDB
+// saveDomain stores a domain in SQLite
 func (c *Crawler) saveDomain(domain *Domain) error {
-	data, err := json.Marshal(domain)
-	if err != nil {
-		return fmt.Errorf("failed to marshal domain: %v", err)
-	}
-
-	key := []byte("domain:" + domain.Host)
-	return c.db.Set(key, data, pebble.Sync)
+	_, err := c.db.Exec(`
+		INSERT INTO domains (host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld)
+		VALUES (?, ?, ?, ?, ?, ?, ?)
+		ON CONFLICT(host) DO UPDATE SET
+			status = excluded.status,
+			lastCrawledAt = excluded.lastCrawledAt,
+			feedsFound = excluded.feedsFound,
+			lastError = excluded.lastError,
+			tld = excluded.tld
+	`, domain.Host, domain.Status, domain.DiscoveredAt, nullTime(domain.LastCrawledAt),
+		domain.FeedsFound, nullString(domain.LastError), domain.TLD)
+	return err
 }
 
-// getDomain retrieves a domain from PebbleDB
-func (c *Crawler) getDomain(host string) (*Domain, error) {
-	key := []byte("domain:" + normalizeHost(host))
-	data, closer, err := c.db.Get(key)
-	if err != nil {
-		if err == pebble.ErrNotFound {
-			return nil, nil
-		}
-		return nil, err
-	}
-	defer closer.Close()
-
-	var domain Domain
-	if err := json.Unmarshal(data, &domain); err != nil {
-		return nil, fmt.Errorf("failed to unmarshal domain: %v", err)
-	}
-	return &domain, nil
+// saveDomainTx stores a domain using a transaction
+func (c *Crawler) saveDomainTx(tx *sql.Tx, domain *Domain) error {
+	_, err := tx.Exec(`
+		INSERT INTO domains (host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld)
+		VALUES (?, ?, ?, ?, ?, ?, ?)
+		ON CONFLICT(host) DO NOTHING
+	`, domain.Host, domain.Status, domain.DiscoveredAt, nullTime(domain.LastCrawledAt),
+		domain.FeedsFound, nullString(domain.LastError), domain.TLD)
+	return err
 }
||||||
|
|
||||||
// domainExists checks if a domain already exists in the database
|
// domainExists checks if a domain already exists in the database
|
||||||
func (c *Crawler) domainExists(host string) bool {
|
func (c *Crawler) domainExists(host string) bool {
|
||||||
key := []byte("domain:" + normalizeHost(host))
|
var exists bool
|
||||||
_, closer, err := c.db.Get(key)
|
err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM domains WHERE host = ?)", normalizeHost(host)).Scan(&exists)
|
||||||
if err != nil {
|
return err == nil && exists
|
||||||
return false
|
|
||||||
}
|
|
||||||
closer.Close()
|
|
||||||
return true
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetUncrawledDomains returns all domains with status "uncrawled"
|
// getDomain retrieves a domain from SQLite
|
||||||
func (c *Crawler) GetUncrawledDomains() ([]*Domain, error) {
|
func (c *Crawler) getDomain(host string) (*Domain, error) {
|
||||||
var domains []*Domain
|
domain := &Domain{}
|
||||||
|
var lastCrawledAt sql.NullTime
|
||||||
|
var lastError sql.NullString
|
||||||
|
|
||||||
iter, err := c.db.NewIter(&pebble.IterOptions{
|
err := c.db.QueryRow(`
|
||||||
LowerBound: []byte("domain:"),
|
SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld
|
||||||
UpperBound: []byte("domain:\xff"),
|
FROM domains WHERE host = ?
|
||||||
})
|
`, normalizeHost(host)).Scan(
|
||||||
|
&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCrawledAt,
|
||||||
|
&domain.FeedsFound, &lastError, &domain.TLD,
|
||||||
|
)
|
||||||
|
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
defer iter.Close()
|
|
||||||
|
|
||||||
for iter.First(); iter.Valid(); iter.Next() {
|
if lastCrawledAt.Valid {
|
||||||
var domain Domain
|
domain.LastCrawledAt = lastCrawledAt.Time
|
||||||
if err := json.Unmarshal(iter.Value(), &domain); err != nil {
|
}
|
||||||
continue
|
if lastError.Valid {
|
||||||
}
|
domain.LastError = lastError.String
|
||||||
if domain.Status == "uncrawled" {
|
|
||||||
domains = append(domains, &domain)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := iter.Error(); err != nil {
|
return domain, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetUncheckedDomains returns all domains with status "unchecked"
|
||||||
|
func (c *Crawler) GetUncheckedDomains() ([]*Domain, error) {
|
||||||
|
rows, err := c.db.Query(`
|
||||||
|
SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld
|
||||||
|
FROM domains WHERE status = 'unchecked'
|
||||||
|
`)
|
||||||
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
return domains, nil
|
return c.scanDomains(rows)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetUncheckedDomainsRandom returns up to limit unchecked domains in random order
|
||||||
|
func (c *Crawler) GetUncheckedDomainsRandom(limit int) ([]*Domain, error) {
|
||||||
|
rows, err := c.db.Query(`
|
||||||
|
SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld
|
||||||
|
FROM domains WHERE status = 'unchecked'
|
||||||
|
ORDER BY RANDOM()
|
||||||
|
LIMIT ?
|
||||||
|
`, limit)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
return c.scanDomains(rows)
|
||||||
|
}
|
||||||
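GetUncheckedDomainsRandom is the natural feeder for the crawl loop started from main.go further down (StartCrawlLoop). A minimal sketch of such a consumer, assuming same-package access, a hypothetical crawlHost helper, and an arbitrary batch size and idle wait:

  func (c *Crawler) crawlLoopSketch() {
      for {
          domains, err := c.GetUncheckedDomainsRandom(100) // assumed batch size
          if err != nil || len(domains) == 0 {
              time.Sleep(30 * time.Second) // assumed idle wait
              continue
          }
          for _, d := range domains {
              feedsFound, crawlErr := c.crawlHost(d.Host) // hypothetical helper
              errMsg := ""
              if crawlErr != nil {
                  errMsg = crawlErr.Error()
              }
              c.markDomainCrawled(d.Host, feedsFound, errMsg)
          }
      }
  }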
|
|
||||||
|
// scanDomains is a helper to scan multiple domain rows
|
||||||
|
func (c *Crawler) scanDomains(rows *sql.Rows) ([]*Domain, error) {
|
||||||
|
var domains []*Domain
|
||||||
|
for rows.Next() {
|
||||||
|
domain := &Domain{}
|
||||||
|
var lastCrawledAt sql.NullTime
|
||||||
|
var lastError sql.NullString
|
||||||
|
|
||||||
|
if err := rows.Scan(
|
||||||
|
&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCrawledAt,
|
||||||
|
&domain.FeedsFound, &lastError, &domain.TLD,
|
||||||
|
); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if lastCrawledAt.Valid {
|
||||||
|
domain.LastCrawledAt = lastCrawledAt.Time
|
||||||
|
}
|
||||||
|
if lastError.Valid {
|
||||||
|
domain.LastError = lastError.String
|
||||||
|
}
|
||||||
|
|
||||||
|
domains = append(domains, domain)
|
||||||
|
}
|
||||||
|
|
||||||
|
return domains, rows.Err()
|
||||||
}
|
}
|
||||||
|
|
||||||
// markDomainCrawled updates a domain's status after crawling
|
// markDomainCrawled updates a domain's status after crawling
|
||||||
func (c *Crawler) markDomainCrawled(host string, feedsFound int, lastError string) error {
|
func (c *Crawler) markDomainCrawled(host string, feedsFound int, lastError string) error {
|
||||||
domain, err := c.getDomain(host)
|
status := "checked"
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if domain == nil {
|
|
||||||
return fmt.Errorf("domain not found: %s", host)
|
|
||||||
}
|
|
||||||
|
|
||||||
domain.LastCrawledAt = time.Now()
|
|
||||||
domain.FeedsFound = feedsFound
|
|
||||||
if lastError != "" {
|
if lastError != "" {
|
||||||
domain.Status = "error"
|
status = "error"
|
||||||
domain.LastError = lastError
|
|
||||||
} else {
|
|
||||||
domain.Status = "crawled"
|
|
||||||
domain.LastError = ""
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return c.saveDomain(domain)
|
var err error
|
||||||
|
if lastError != "" {
|
||||||
|
_, err = c.db.Exec(`
|
||||||
|
UPDATE domains SET status = ?, lastCrawledAt = ?, feedsFound = ?, lastError = ?
|
||||||
|
WHERE host = ?
|
||||||
|
`, status, time.Now(), feedsFound, lastError, normalizeHost(host))
|
||||||
|
} else {
|
||||||
|
_, err = c.db.Exec(`
|
||||||
|
UPDATE domains SET status = ?, lastCrawledAt = ?, feedsFound = ?, lastError = NULL
|
||||||
|
WHERE host = ?
|
||||||
|
`, status, time.Now(), feedsFound, normalizeHost(host))
|
||||||
|
}
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetDomainCount returns the total number of domains in the database
|
// GetDomainCount returns the total and unchecked domain counts
|
||||||
func (c *Crawler) GetDomainCount() (total int, uncrawled int, err error) {
|
func (c *Crawler) GetDomainCount() (total int, unchecked int, err error) {
|
||||||
iter, err := c.db.NewIter(&pebble.IterOptions{
|
err = c.db.QueryRow("SELECT COUNT(*) FROM domains").Scan(&total)
|
||||||
LowerBound: []byte("domain:"),
|
|
||||||
UpperBound: []byte("domain:\xff"),
|
|
||||||
})
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, 0, err
|
return 0, 0, err
|
||||||
}
|
}
|
||||||
defer iter.Close()
|
err = c.db.QueryRow("SELECT COUNT(*) FROM domains WHERE status = 'unchecked'").Scan(&unchecked)
|
||||||
|
return total, unchecked, err
|
||||||
for iter.First(); iter.Valid(); iter.Next() {
|
|
||||||
total++
|
|
||||||
var domain Domain
|
|
||||||
if err := json.Unmarshal(iter.Value(), &domain); err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if domain.Status == "uncrawled" {
|
|
||||||
uncrawled++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := iter.Error(); err != nil {
|
|
||||||
return 0, 0, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return total, uncrawled, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ImportDomainsFromFile reads a vertices file and stores new domains as "uncrawled"
|
// ImportDomainsFromFile reads a vertices file and stores new domains as "unchecked"
|
||||||
func (c *Crawler) ImportDomainsFromFile(filename string, limit int) (imported int, skipped int, err error) {
|
func (c *Crawler) ImportDomainsFromFile(filename string, limit int) (imported int, skipped int, err error) {
|
||||||
file, err := os.Open(filename)
|
file, err := os.Open(filename)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -158,6 +189,110 @@ func (c *Crawler) ImportDomainsFromFile(filename string, limit int) (imported in
|
|||||||
return c.parseAndStoreDomains(file, limit)
|
return c.parseAndStoreDomains(file, limit)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ImportDomainsInBackground starts domain import in a background goroutine
|
||||||
|
func (c *Crawler) ImportDomainsInBackground(filename string) {
|
||||||
|
go func() {
|
||||||
|
file, err := os.Open(filename)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Printf("Failed to open vertices file: %v\n", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
|
var bodyReader io.Reader
|
||||||
|
|
||||||
|
bufReader := bufio.NewReader(file)
|
||||||
|
peekBytes, err := bufReader.Peek(2)
|
||||||
|
if err != nil && err != io.EOF {
|
||||||
|
fmt.Printf("Failed to peek at file: %v\n", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(peekBytes) >= 2 && peekBytes[0] == 0x1f && peekBytes[1] == 0x8b {
|
||||||
|
gzReader, err := gzip.NewReader(bufReader)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Printf("Failed to create gzip reader: %v\n", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer gzReader.Close()
|
||||||
|
bodyReader = gzReader
|
||||||
|
} else {
|
||||||
|
bodyReader = bufReader
|
||||||
|
}
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(bodyReader)
|
||||||
|
buf := make([]byte, 0, 64*1024)
|
||||||
|
scanner.Buffer(buf, 1024*1024)
|
||||||
|
|
||||||
|
const batchSize = 10000
|
||||||
|
now := time.Now()
|
||||||
|
nowStr := now.Format("2006-01-02 15:04:05")
|
||||||
|
totalImported := 0
|
||||||
|
batchCount := 0
|
||||||
|
|
||||||
|
type domainEntry struct {
|
||||||
|
host string
|
||||||
|
tld string
|
||||||
|
}
|
||||||
|
|
||||||
|
for {
|
||||||
|
// Read and canonicalize batch
|
||||||
|
var domains []domainEntry
|
||||||
|
for len(domains) < batchSize && scanner.Scan() {
|
||||||
|
line := scanner.Text()
|
||||||
|
parts := strings.Split(line, "\t")
|
||||||
|
if len(parts) >= 2 {
|
||||||
|
reverseHostName := strings.TrimSpace(parts[1])
|
||||||
|
if reverseHostName != "" {
|
||||||
|
host := normalizeHost(reverseHost(reverseHostName))
|
||||||
|
domains = append(domains, domainEntry{host: host, tld: getTLD(host)})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(domains) == 0 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build bulk INSERT statement
|
||||||
|
var sb strings.Builder
|
||||||
|
sb.WriteString("INSERT INTO domains (host, status, discoveredAt, tld) VALUES ")
|
||||||
|
args := make([]interface{}, 0, len(domains)*4)
|
||||||
|
for i, d := range domains {
|
||||||
|
if i > 0 {
|
||||||
|
sb.WriteString(",")
|
||||||
|
}
|
||||||
|
sb.WriteString("(?, 'unchecked', ?, ?)")
|
||||||
|
args = append(args, d.host, nowStr, d.tld)
|
||||||
|
}
|
||||||
|
sb.WriteString(" ON CONFLICT(host) DO NOTHING")
|
||||||
|
|
||||||
|
// Execute bulk insert
|
||||||
|
result, err := c.db.Exec(sb.String(), args...)
|
||||||
|
imported := 0
|
||||||
|
if err != nil {
|
||||||
|
fmt.Printf("Bulk insert error: %v\n", err)
|
||||||
|
} else {
|
||||||
|
rowsAffected, _ := result.RowsAffected()
|
||||||
|
imported = int(rowsAffected)
|
||||||
|
}
|
||||||
|
|
||||||
|
batchCount++
|
||||||
|
totalImported += imported
|
||||||
|
atomic.AddInt32(&c.domainsImported, int32(imported))
|
||||||
|
|
||||||
|
// Wait 1 second before the next batch
|
||||||
|
time.Sleep(1 * time.Second)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
fmt.Printf("Error reading vertices file: %v\n", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Printf("Background import complete: %d domains imported\n", totalImported)
|
||||||
|
}()
|
||||||
|
}
|
||||||
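The import path above leans on reverseHost and normalizeHost helpers defined elsewhere in the repo. Assuming the vertices file lists hosts in reversed-domain order (for example "com.example.www"), one plausible reverseHost is sketched below; it illustrates the assumed behavior and is not the project's actual implementation:

  package main

  import "strings"

  // reverseHost flips a reversed-domain host back into normal order,
  // e.g. "com.example.www" -> "www.example.com" (assumed behavior).
  func reverseHost(reversed string) string {
      parts := strings.Split(reversed, ".")
      for i, j := 0, len(parts)-1; i < j; i, j = i+1, j-1 {
          parts[i], parts[j] = parts[j], parts[i]
      }
      return strings.Join(parts, ".")
  }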
|
|
||||||
func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported int, skipped int, err error) {
|
func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported int, skipped int, err error) {
|
||||||
var bodyReader io.Reader
|
var bodyReader io.Reader
|
||||||
|
|
||||||
@@ -183,39 +318,63 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
|
|||||||
scanner.Buffer(buf, 1024*1024)
|
scanner.Buffer(buf, 1024*1024)
|
||||||
|
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
|
nowStr := now.Format("2006-01-02 15:04:05")
|
||||||
count := 0
|
count := 0
|
||||||
|
const batchSize = 1000
|
||||||
|
|
||||||
for scanner.Scan() {
|
type domainEntry struct {
|
||||||
if limit > 0 && count >= limit {
|
host string
|
||||||
|
tld string
|
||||||
|
}
|
||||||
|
|
||||||
|
for {
|
||||||
|
// Read and canonicalize batch
|
||||||
|
var domains []domainEntry
|
||||||
|
for len(domains) < batchSize && scanner.Scan() {
|
||||||
|
if limit > 0 && count >= limit {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
line := scanner.Text()
|
||||||
|
parts := strings.Split(line, "\t")
|
||||||
|
if len(parts) >= 2 {
|
||||||
|
reverseHostName := strings.TrimSpace(parts[1])
|
||||||
|
if reverseHostName != "" {
|
||||||
|
host := normalizeHost(reverseHost(reverseHostName))
|
||||||
|
domains = append(domains, domainEntry{host: host, tld: getTLD(host)})
|
||||||
|
count++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(domains) == 0 {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
line := scanner.Text()
|
// Build bulk INSERT statement
|
||||||
parts := strings.Split(line, "\t")
|
var sb strings.Builder
|
||||||
if len(parts) >= 2 {
|
sb.WriteString("INSERT INTO domains (host, status, discoveredAt, tld) VALUES ")
|
||||||
reverseHostName := strings.TrimSpace(parts[1])
|
args := make([]interface{}, 0, len(domains)*4)
|
||||||
if reverseHostName != "" {
|
for i, d := range domains {
|
||||||
host := normalizeHost(reverseHost(reverseHostName))
|
if i > 0 {
|
||||||
count++
|
sb.WriteString(",")
|
||||||
|
|
||||||
// Skip if domain already exists
|
|
||||||
if c.domainExists(host) {
|
|
||||||
skipped++
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// Store new domain as uncrawled
|
|
||||||
domain := &Domain{
|
|
||||||
Host: host,
|
|
||||||
Status: "uncrawled",
|
|
||||||
DiscoveredAt: now,
|
|
||||||
TLD: getTLD(host),
|
|
||||||
}
|
|
||||||
if err := c.saveDomain(domain); err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
imported++
|
|
||||||
}
|
}
|
||||||
|
sb.WriteString("(?, 'unchecked', ?, ?)")
|
||||||
|
args = append(args, d.host, nowStr, d.tld)
|
||||||
|
}
|
||||||
|
sb.WriteString(" ON CONFLICT(host) DO NOTHING")
|
||||||
|
|
||||||
|
// Execute bulk insert
|
||||||
|
result, execErr := c.db.Exec(sb.String(), args...)
|
||||||
|
if execErr != nil {
|
||||||
|
skipped += len(domains)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
rowsAffected, _ := result.RowsAffected()
|
||||||
|
imported += int(rowsAffected)
|
||||||
|
skipped += len(domains) - int(rowsAffected)
|
||||||
|
|
||||||
|
if limit > 0 && count >= limit {
|
||||||
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -225,3 +384,18 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
|
|||||||
|
|
||||||
return imported, skipped, nil
|
return imported, skipped, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Helper functions for SQL null handling
|
||||||
|
func nullTime(t time.Time) sql.NullTime {
|
||||||
|
if t.IsZero() {
|
||||||
|
return sql.NullTime{}
|
||||||
|
}
|
||||||
|
return sql.NullTime{Time: t, Valid: true}
|
||||||
|
}
|
||||||
|
|
||||||
|
func nullString(s string) sql.NullString {
|
||||||
|
if s == "" {
|
||||||
|
return sql.NullString{}
|
||||||
|
}
|
||||||
|
return sql.NullString{String: s, Valid: true}
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,15 +1,86 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"database/sql"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"io"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"net/url"
|
||||||
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/cockroachdb/pebble"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// shouldSkipFeed checks if a feed URL should be filtered out
|
||||||
|
// Returns true (and a reason) if the feed should be skipped
|
||||||
|
func shouldSkipFeed(feedURL string) (bool, string) {
|
||||||
|
lower := strings.ToLower(feedURL)
|
||||||
|
|
||||||
|
// Skip explicit comment feeds
|
||||||
|
if strings.Contains(lower, "/comment") {
|
||||||
|
return true, "comment feed"
|
||||||
|
}
|
||||||
|
|
||||||
|
u, err := url.Parse(feedURL)
|
||||||
|
if err != nil {
|
||||||
|
return false, ""
|
||||||
|
}
|
||||||
|
|
||||||
|
path := strings.ToLower(strings.TrimSuffix(u.Path, "/"))
|
||||||
|
|
||||||
|
// Skip category/tag feeds
|
||||||
|
categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/", "/author/"}
|
||||||
|
for _, pattern := range categoryPatterns {
|
||||||
|
if strings.Contains(path, pattern) {
|
||||||
|
return true, "category/tag feed"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for article comment feeds (path ending in /feed with content before it)
|
||||||
|
if strings.HasSuffix(path, "/feed") {
|
||||||
|
basePath := strings.TrimSuffix(path, "/feed")
|
||||||
|
basePath = strings.Trim(basePath, "/")
|
||||||
|
|
||||||
|
if basePath == "" {
|
||||||
|
return false, "" // Just /feed - legitimate main feed
|
||||||
|
}
|
||||||
|
|
||||||
|
// Skip if path contains date patterns (likely article)
|
||||||
|
if matched, _ := regexp.MatchString(`/\d{4}/\d{2}`, basePath); matched {
|
||||||
|
return true, "article feed (date pattern)"
|
||||||
|
}
|
||||||
|
|
||||||
|
// Skip if path has multiple segments (likely article or nested content)
|
||||||
|
segments := strings.Split(basePath, "/")
|
||||||
|
if len(segments) >= 2 {
|
||||||
|
return true, "article feed (nested path)"
|
||||||
|
}
|
||||||
|
|
||||||
|
// Skip if single segment looks like an article slug (contains hyphens, is long)
|
||||||
|
if len(segments) == 1 && (strings.Contains(segments[0], "-") && len(segments[0]) > 20) {
|
||||||
|
return true, "article feed (slug pattern)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false, ""
|
||||||
|
}
|
||||||
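To make the rules above concrete, the snippet below runs shouldSkipFeed over a few made-up URLs and notes the outcome each should get per the checks above; it assumes same-package access and the fmt import:

  func exampleShouldSkipFeed() {
      cases := []string{
          "https://example.com/feed",                           // kept: bare main feed
          "https://example.com/comments/feed",                  // skipped: comment feed
          "https://example.com/category/tech/feed",             // skipped: category/tag feed
          "https://example.com/blog/2024/05/post/feed",         // skipped: date pattern
          "https://example.com/blog/post/feed",                 // skipped: nested path
          "https://example.com/a-long-article-slug-title/feed", // skipped: slug pattern
      }
      for _, u := range cases {
          skip, reason := shouldSkipFeed(u)
          fmt.Printf("%-55s skip=%v %s\n", u, skip, reason)
      }
  }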
|
|
||||||
|
// Item represents an individual entry/article from a feed
|
||||||
|
type Item struct {
|
||||||
|
ID int64 `json:"id,omitempty"`
|
||||||
|
FeedURL string `json:"feed_url"`
|
||||||
|
GUID string `json:"guid,omitempty"`
|
||||||
|
Title string `json:"title,omitempty"`
|
||||||
|
Link string `json:"link,omitempty"`
|
||||||
|
Description string `json:"description,omitempty"`
|
||||||
|
Content string `json:"content,omitempty"`
|
||||||
|
Author string `json:"author,omitempty"`
|
||||||
|
PubDate time.Time `json:"pub_date,omitempty"`
|
||||||
|
DiscoveredAt time.Time `json:"discovered_at"`
|
||||||
|
UpdatedAt time.Time `json:"updated_at,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
// Feed represents a discovered RSS/Atom feed with metadata
|
// Feed represents a discovered RSS/Atom feed with metadata
|
||||||
type Feed struct {
|
type Feed struct {
|
||||||
URL string `json:"url"`
|
URL string `json:"url"`
|
||||||
@@ -50,99 +121,548 @@ type Feed struct {
|
|||||||
AvgPostFreqHrs float64 `json:"avg_post_freq_hrs,omitempty"` // Average hours between posts
|
AvgPostFreqHrs float64 `json:"avg_post_freq_hrs,omitempty"` // Average hours between posts
|
||||||
OldestItemDate time.Time `json:"oldest_item_date,omitempty"`
|
OldestItemDate time.Time `json:"oldest_item_date,omitempty"`
|
||||||
NewestItemDate time.Time `json:"newest_item_date,omitempty"`
|
NewestItemDate time.Time `json:"newest_item_date,omitempty"`
|
||||||
|
|
||||||
|
// Adaptive check interval
|
||||||
|
NoUpdate int `json:"no_update"` // Consecutive checks with no change
|
||||||
}
|
}
|
||||||
|
|
||||||
// saveFeed stores a feed in PebbleDB
|
// saveFeed stores a feed in SQLite
|
||||||
func (c *Crawler) saveFeed(feed *Feed) error {
|
func (c *Crawler) saveFeed(feed *Feed) error {
|
||||||
data, err := json.Marshal(feed)
|
_, err := c.db.Exec(`
|
||||||
if err != nil {
|
INSERT INTO feeds (
|
||||||
return fmt.Errorf("failed to marshal feed: %v", err)
|
url, type, title, description, language, siteUrl,
|
||||||
}
|
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
|
||||||
|
etag, lastModified,
|
||||||
key := []byte("feed:" + feed.URL)
|
ttlMinutes, updatePeriod, updateFreq,
|
||||||
return c.db.Set(key, data, pebble.Sync)
|
status, errorCount, lastError, lastErrorAt,
|
||||||
|
sourceUrl, sourceHost, tld,
|
||||||
|
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
|
||||||
|
noUpdate
|
||||||
|
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
|
ON CONFLICT(url) DO UPDATE SET
|
||||||
|
type = excluded.type,
|
||||||
|
title = excluded.title,
|
||||||
|
description = excluded.description,
|
||||||
|
language = excluded.language,
|
||||||
|
siteUrl = excluded.siteUrl,
|
||||||
|
lastCrawledAt = excluded.lastCrawledAt,
|
||||||
|
nextCrawlAt = excluded.nextCrawlAt,
|
||||||
|
lastBuildDate = excluded.lastBuildDate,
|
||||||
|
etag = excluded.etag,
|
||||||
|
lastModified = excluded.lastModified,
|
||||||
|
ttlMinutes = excluded.ttlMinutes,
|
||||||
|
updatePeriod = excluded.updatePeriod,
|
||||||
|
updateFreq = excluded.updateFreq,
|
||||||
|
status = excluded.status,
|
||||||
|
errorCount = excluded.errorCount,
|
||||||
|
lastError = excluded.lastError,
|
||||||
|
lastErrorAt = excluded.lastErrorAt,
|
||||||
|
itemCount = excluded.itemCount,
|
||||||
|
avgPostFreqHrs = excluded.avgPostFreqHrs,
|
||||||
|
oldestItemDate = excluded.oldestItemDate,
|
||||||
|
newestItemDate = excluded.newestItemDate,
|
||||||
|
noUpdate = excluded.noUpdate
|
||||||
|
`,
|
||||||
|
feed.URL, feed.Type, nullString(feed.Title), nullString(feed.Description),
|
||||||
|
nullString(feed.Language), nullString(feed.SiteURL),
|
||||||
|
feed.DiscoveredAt, nullTime(feed.LastCrawledAt), nullTime(feed.NextCrawlAt), nullTime(feed.LastBuildDate),
|
||||||
|
nullString(feed.ETag), nullString(feed.LastModified),
|
||||||
|
feed.TTLMinutes, nullString(feed.UpdatePeriod), feed.UpdateFreq,
|
||||||
|
feed.Status, feed.ErrorCount, nullString(feed.LastError), nullTime(feed.LastErrorAt),
|
||||||
|
nullString(feed.SourceURL), nullString(feed.SourceHost), nullString(feed.TLD),
|
||||||
|
feed.ItemCount, feed.AvgPostFreqHrs, nullTime(feed.OldestItemDate), nullTime(feed.NewestItemDate),
|
||||||
|
feed.NoUpdate,
|
||||||
|
)
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// getFeed retrieves a feed from PebbleDB
|
// getFeed retrieves a feed from SQLite
|
||||||
func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
|
func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
|
||||||
key := []byte("feed:" + normalizeURL(feedURL))
|
feed := &Feed{}
|
||||||
data, closer, err := c.db.Get(key)
|
var title, description, language, siteURL sql.NullString
|
||||||
|
var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
|
||||||
|
var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
|
||||||
|
var avgPostFreqHrs sql.NullFloat64
|
||||||
|
|
||||||
|
err := c.db.QueryRow(`
|
||||||
|
SELECT url, type, title, description, language, siteUrl,
|
||||||
|
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
|
||||||
|
etag, lastModified,
|
||||||
|
ttlMinutes, updatePeriod, updateFreq,
|
||||||
|
status, errorCount, lastError, lastErrorAt,
|
||||||
|
sourceUrl, sourceHost, tld,
|
||||||
|
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
|
||||||
|
noUpdate
|
||||||
|
FROM feeds WHERE url = ?
|
||||||
|
`, normalizeURL(feedURL)).Scan(
|
||||||
|
&feed.URL, &feed.Type, &title, &description, &language, &siteURL,
|
||||||
|
&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
|
||||||
|
&etag, &lastModified,
|
||||||
|
&feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
|
||||||
|
&feed.Status, &feed.ErrorCount, &lastError, &lastErrorAt,
|
||||||
|
&sourceURL, &sourceHost, &tld,
|
||||||
|
&feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
|
||||||
|
&feed.NoUpdate,
|
||||||
|
)
|
||||||
|
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if err == pebble.ErrNotFound {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
defer closer.Close()
|
|
||||||
|
|
||||||
var feed Feed
|
// Handle nullable fields
|
||||||
if err := json.Unmarshal(data, &feed); err != nil {
|
if title.Valid {
|
||||||
return nil, fmt.Errorf("failed to unmarshal feed: %v", err)
|
feed.Title = title.String
|
||||||
}
|
}
|
||||||
return &feed, nil
|
if description.Valid {
|
||||||
|
feed.Description = description.String
|
||||||
|
}
|
||||||
|
if language.Valid {
|
||||||
|
feed.Language = language.String
|
||||||
|
}
|
||||||
|
if siteURL.Valid {
|
||||||
|
feed.SiteURL = siteURL.String
|
||||||
|
}
|
||||||
|
if lastCrawledAt.Valid {
|
||||||
|
feed.LastCrawledAt = lastCrawledAt.Time
|
||||||
|
}
|
||||||
|
if nextCrawlAt.Valid {
|
||||||
|
feed.NextCrawlAt = nextCrawlAt.Time
|
||||||
|
}
|
||||||
|
if lastBuildDate.Valid {
|
||||||
|
feed.LastBuildDate = lastBuildDate.Time
|
||||||
|
}
|
||||||
|
if etag.Valid {
|
||||||
|
feed.ETag = etag.String
|
||||||
|
}
|
||||||
|
if lastModified.Valid {
|
||||||
|
feed.LastModified = lastModified.String
|
||||||
|
}
|
||||||
|
if updatePeriod.Valid {
|
||||||
|
feed.UpdatePeriod = updatePeriod.String
|
||||||
|
}
|
||||||
|
if lastError.Valid {
|
||||||
|
feed.LastError = lastError.String
|
||||||
|
}
|
||||||
|
if lastErrorAt.Valid {
|
||||||
|
feed.LastErrorAt = lastErrorAt.Time
|
||||||
|
}
|
||||||
|
if sourceURL.Valid {
|
||||||
|
feed.SourceURL = sourceURL.String
|
||||||
|
}
|
||||||
|
if sourceHost.Valid {
|
||||||
|
feed.SourceHost = sourceHost.String
|
||||||
|
}
|
||||||
|
if tld.Valid {
|
||||||
|
feed.TLD = tld.String
|
||||||
|
}
|
||||||
|
if avgPostFreqHrs.Valid {
|
||||||
|
feed.AvgPostFreqHrs = avgPostFreqHrs.Float64
|
||||||
|
}
|
||||||
|
if oldestItemDate.Valid {
|
||||||
|
feed.OldestItemDate = oldestItemDate.Time
|
||||||
|
}
|
||||||
|
if newestItemDate.Valid {
|
||||||
|
feed.NewestItemDate = newestItemDate.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
return feed, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// feedExists checks if a feed URL already exists in the database
|
// feedExists checks if a feed URL already exists in the database
|
||||||
func (c *Crawler) feedExists(feedURL string) bool {
|
func (c *Crawler) feedExists(feedURL string) bool {
|
||||||
key := []byte("feed:" + normalizeURL(feedURL))
|
var exists bool
|
||||||
_, closer, err := c.db.Get(key)
|
err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM feeds WHERE url = ?)", normalizeURL(feedURL)).Scan(&exists)
|
||||||
if err != nil {
|
return err == nil && exists
|
||||||
return false
|
|
||||||
}
|
|
||||||
closer.Close()
|
|
||||||
return true
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAllFeeds returns all feeds from the database
|
// GetAllFeeds returns all feeds from the database
|
||||||
func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
|
func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
|
||||||
var feeds []*Feed
|
rows, err := c.db.Query(`
|
||||||
|
SELECT url, type, title, description, language, siteUrl,
|
||||||
iter, err := c.db.NewIter(&pebble.IterOptions{
|
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
|
||||||
LowerBound: []byte("feed:"),
|
etag, lastModified,
|
||||||
UpperBound: []byte("feed:\xff"),
|
ttlMinutes, updatePeriod, updateFreq,
|
||||||
})
|
status, errorCount, lastError, lastErrorAt,
|
||||||
|
sourceUrl, sourceHost, tld,
|
||||||
|
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
|
||||||
|
noUpdate
|
||||||
|
FROM feeds
|
||||||
|
`)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
defer iter.Close()
|
defer rows.Close()
|
||||||
|
|
||||||
for iter.First(); iter.Valid(); iter.Next() {
|
return scanFeeds(rows)
|
||||||
var feed Feed
|
|
||||||
if err := json.Unmarshal(iter.Value(), &feed); err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
feeds = append(feeds, &feed)
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := iter.Error(); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return feeds, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetFeedCount returns the total number of feeds in the database
|
// GetFeedCount returns the total number of feeds in the database
|
||||||
func (c *Crawler) GetFeedCount() (int, error) {
|
func (c *Crawler) GetFeedCount() (int, error) {
|
||||||
count := 0
|
var count int
|
||||||
|
err := c.db.QueryRow("SELECT COUNT(*) FROM feeds").Scan(&count)
|
||||||
|
return count, err
|
||||||
|
}
|
||||||
|
|
||||||
iter, err := c.db.NewIter(&pebble.IterOptions{
|
// GetFeedCountByHost returns the number of feeds for a specific host
|
||||||
LowerBound: []byte("feed:"),
|
func (c *Crawler) GetFeedCountByHost(host string) (int, error) {
|
||||||
UpperBound: []byte("feed:\xff"),
|
var count int
|
||||||
})
|
err := c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE sourceHost = ?", host).Scan(&count)
|
||||||
|
return count, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetFeedsDueForCheck returns feeds where nextCrawlAt <= now, ordered randomly, limited to the given limit
|
||||||
|
func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
|
||||||
|
rows, err := c.db.Query(`
|
||||||
|
SELECT url, type, title, description, language, siteUrl,
|
||||||
|
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
|
||||||
|
etag, lastModified,
|
||||||
|
ttlMinutes, updatePeriod, updateFreq,
|
||||||
|
status, errorCount, lastError, lastErrorAt,
|
||||||
|
sourceUrl, sourceHost, tld,
|
||||||
|
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
|
||||||
|
noUpdate
|
||||||
|
FROM feeds
|
||||||
|
WHERE nextCrawlAt <= datetime('now') AND status != 'dead'
|
||||||
|
ORDER BY RANDOM()
|
||||||
|
LIMIT ?
|
||||||
|
`, limit)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
return scanFeeds(rows)
|
||||||
|
}
|
||||||
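GetFeedsDueForCheck pairs with CheckFeed further down and with the StartCheckLoop call in main.go. A minimal sketch of what that loop might look like; the batch size and idle sleep here are assumptions, not the actual implementation:

  func (c *Crawler) checkLoopSketch() {
      for {
          feeds, err := c.GetFeedsDueForCheck(200) // assumed batch size
          if err != nil || len(feeds) == 0 {
              time.Sleep(10 * time.Second) // assumed idle wait
              continue
          }
          for _, f := range feeds {
              // CheckFeed pushes nextCrawlAt forward, so the feed drops out
              // of the due set until its adaptive interval elapses.
              c.CheckFeed(f)
          }
      }
  }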
|
|
||||||
|
// GetFeedsByHost returns all feeds from a specific host
|
||||||
|
func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
|
||||||
|
rows, err := c.db.Query(`
|
||||||
|
SELECT url, type, title, description, language, siteUrl,
|
||||||
|
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
|
||||||
|
etag, lastModified,
|
||||||
|
ttlMinutes, updatePeriod, updateFreq,
|
||||||
|
status, errorCount, lastError, lastErrorAt,
|
||||||
|
sourceUrl, sourceHost, tld,
|
||||||
|
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
|
||||||
|
noUpdate
|
||||||
|
FROM feeds WHERE sourceHost = ?
|
||||||
|
`, host)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
return scanFeeds(rows)
|
||||||
|
}
|
||||||
|
|
||||||
|
// SearchFeeds performs a full-text search on feeds
|
||||||
|
func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) {
|
||||||
|
rows, err := c.db.Query(`
|
||||||
|
SELECT f.url, f.type, f.title, f.description, f.language, f.siteUrl,
|
||||||
|
f.discoveredAt, f.lastCrawledAt, f.nextCrawlAt, f.lastBuildDate,
|
||||||
|
f.etag, f.lastModified,
|
||||||
|
f.ttlMinutes, f.updatePeriod, f.updateFreq,
|
||||||
|
f.status, f.errorCount, f.lastError, f.lastErrorAt,
|
||||||
|
f.sourceUrl, f.sourceHost, f.tld,
|
||||||
|
f.itemCount, f.avgPostFreqHrs, f.oldestItemDate, f.newestItemDate,
|
||||||
|
f.noUpdate
|
||||||
|
FROM feeds f
|
||||||
|
JOIN feeds_fts fts ON f.rowid = fts.rowid
|
||||||
|
WHERE feeds_fts MATCH ?
|
||||||
|
`, query)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
return scanFeeds(rows)
|
||||||
|
}
|
||||||
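The MATCH query above presumes a full-text index named feeds_fts whose rowids line up with feeds.rowid. Its definition lives elsewhere in this commit; one plausible shape, stated as an assumption, is an external-content FTS5 table kept in sync with feeds:

  const feedsFTSSketch = `
  CREATE VIRTUAL TABLE IF NOT EXISTS feeds_fts USING fts5(
      title, description,
      content='feeds', content_rowid='rowid'
  );`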
|
|
||||||
|
// scanFeeds is a helper to scan multiple feed rows
|
||||||
|
func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
|
||||||
|
var feeds []*Feed
|
||||||
|
|
||||||
|
for rows.Next() {
|
||||||
|
feed := &Feed{}
|
||||||
|
var title, description, language, siteURL sql.NullString
|
||||||
|
var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
|
||||||
|
var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
|
||||||
|
var avgPostFreqHrs sql.NullFloat64
|
||||||
|
|
||||||
|
if err := rows.Scan(
|
||||||
|
&feed.URL, &feed.Type, &title, &description, &language, &siteURL,
|
||||||
|
&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
|
||||||
|
&etag, &lastModified,
|
||||||
|
&feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
|
||||||
|
&feed.Status, &feed.ErrorCount, &lastError, &lastErrorAt,
|
||||||
|
&sourceURL, &sourceHost, &tld,
|
||||||
|
&feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
|
||||||
|
&feed.NoUpdate,
|
||||||
|
); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle nullable fields
|
||||||
|
if title.Valid {
|
||||||
|
feed.Title = title.String
|
||||||
|
}
|
||||||
|
if description.Valid {
|
||||||
|
feed.Description = description.String
|
||||||
|
}
|
||||||
|
if language.Valid {
|
||||||
|
feed.Language = language.String
|
||||||
|
}
|
||||||
|
if siteURL.Valid {
|
||||||
|
feed.SiteURL = siteURL.String
|
||||||
|
}
|
||||||
|
if lastCrawledAt.Valid {
|
||||||
|
feed.LastCrawledAt = lastCrawledAt.Time
|
||||||
|
}
|
||||||
|
if nextCrawlAt.Valid {
|
||||||
|
feed.NextCrawlAt = nextCrawlAt.Time
|
||||||
|
}
|
||||||
|
if lastBuildDate.Valid {
|
||||||
|
feed.LastBuildDate = lastBuildDate.Time
|
||||||
|
}
|
||||||
|
if etag.Valid {
|
||||||
|
feed.ETag = etag.String
|
||||||
|
}
|
||||||
|
if lastModified.Valid {
|
||||||
|
feed.LastModified = lastModified.String
|
||||||
|
}
|
||||||
|
if updatePeriod.Valid {
|
||||||
|
feed.UpdatePeriod = updatePeriod.String
|
||||||
|
}
|
||||||
|
if lastError.Valid {
|
||||||
|
feed.LastError = lastError.String
|
||||||
|
}
|
||||||
|
if lastErrorAt.Valid {
|
||||||
|
feed.LastErrorAt = lastErrorAt.Time
|
||||||
|
}
|
||||||
|
if sourceURL.Valid {
|
||||||
|
feed.SourceURL = sourceURL.String
|
||||||
|
}
|
||||||
|
if sourceHost.Valid {
|
||||||
|
feed.SourceHost = sourceHost.String
|
||||||
|
}
|
||||||
|
if tld.Valid {
|
||||||
|
feed.TLD = tld.String
|
||||||
|
}
|
||||||
|
if avgPostFreqHrs.Valid {
|
||||||
|
feed.AvgPostFreqHrs = avgPostFreqHrs.Float64
|
||||||
|
}
|
||||||
|
if oldestItemDate.Valid {
|
||||||
|
feed.OldestItemDate = oldestItemDate.Time
|
||||||
|
}
|
||||||
|
if newestItemDate.Valid {
|
||||||
|
feed.NewestItemDate = newestItemDate.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
feeds = append(feeds, feed)
|
||||||
|
}
|
||||||
|
|
||||||
|
return feeds, rows.Err()
|
||||||
|
}
|
||||||
|
|
||||||
|
// saveItem stores an item in SQLite (upsert by feedUrl + guid)
|
||||||
|
func (c *Crawler) saveItem(item *Item) error {
|
||||||
|
_, err := c.db.Exec(`
|
||||||
|
INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
|
ON CONFLICT(feedUrl, guid) DO UPDATE SET
|
||||||
|
title = excluded.title,
|
||||||
|
link = excluded.link,
|
||||||
|
description = excluded.description,
|
||||||
|
content = excluded.content,
|
||||||
|
author = excluded.author,
|
||||||
|
pubDate = excluded.pubDate,
|
||||||
|
updatedAt = excluded.updatedAt
|
||||||
|
`,
|
||||||
|
item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
|
||||||
|
nullString(item.Description), nullString(item.Content), nullString(item.Author),
|
||||||
|
nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
|
||||||
|
)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// saveItems stores multiple items efficiently
|
||||||
|
func (c *Crawler) saveItems(items []*Item) error {
|
||||||
|
if len(items) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
tx, err := c.db.Begin()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer tx.Rollback()
|
||||||
|
|
||||||
|
stmt, err := tx.Prepare(`
|
||||||
|
INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
|
ON CONFLICT(feedUrl, guid) DO UPDATE SET
|
||||||
|
title = excluded.title,
|
||||||
|
link = excluded.link,
|
||||||
|
description = excluded.description,
|
||||||
|
content = excluded.content,
|
||||||
|
author = excluded.author,
|
||||||
|
pubDate = excluded.pubDate,
|
||||||
|
updatedAt = excluded.updatedAt
|
||||||
|
`)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer stmt.Close()
|
||||||
|
|
||||||
|
for _, item := range items {
|
||||||
|
if item == nil || item.GUID == "" {
|
||||||
|
continue // Skip nil items or items without GUID
|
||||||
|
}
|
||||||
|
_, err := stmt.Exec(
|
||||||
|
item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
|
||||||
|
nullString(item.Description), nullString(item.Content), nullString(item.Author),
|
||||||
|
nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
continue // Skip failed items
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return tx.Commit()
|
||||||
|
}
|
||||||
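The ON CONFLICT(feedUrl, guid) clauses in saveItem and saveItems require a uniqueness constraint on that pair. A sketch of the items table those statements expect; the real schema is created elsewhere in this commit, so the types here are assumptions:

  const itemsSchemaSketch = `
  CREATE TABLE IF NOT EXISTS items (
      id           INTEGER PRIMARY KEY AUTOINCREMENT,
      feedUrl      TEXT NOT NULL,
      guid         TEXT NOT NULL,
      title        TEXT,
      link         TEXT,
      description  TEXT,
      content      TEXT,
      author       TEXT,
      pubDate      DATETIME,
      discoveredAt DATETIME NOT NULL,
      updatedAt    DATETIME,
      UNIQUE(feedUrl, guid)
  );`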
|
|
||||||
|
// GetItemsByFeed returns all items for a specific feed
|
||||||
|
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
|
||||||
|
rows, err := c.db.Query(`
|
||||||
|
SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt
|
||||||
|
FROM items
|
||||||
|
WHERE feedUrl = ?
|
||||||
|
ORDER BY pubDate DESC
|
||||||
|
LIMIT ?
|
||||||
|
`, feedURL, limit)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
var items []*Item
|
||||||
|
for rows.Next() {
|
||||||
|
item := &Item{}
|
||||||
|
var guid, title, link, description, content, author sql.NullString
|
||||||
|
var pubDate, updatedAt sql.NullTime
|
||||||
|
|
||||||
|
if err := rows.Scan(
|
||||||
|
&item.ID, &item.FeedURL, &guid, &title, &link,
|
||||||
|
&description, &content, &author, &pubDate,
|
||||||
|
&item.DiscoveredAt, &updatedAt,
|
||||||
|
); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if guid.Valid {
|
||||||
|
item.GUID = guid.String
|
||||||
|
}
|
||||||
|
if title.Valid {
|
||||||
|
item.Title = title.String
|
||||||
|
}
|
||||||
|
if link.Valid {
|
||||||
|
item.Link = link.String
|
||||||
|
}
|
||||||
|
if description.Valid {
|
||||||
|
item.Description = description.String
|
||||||
|
}
|
||||||
|
if content.Valid {
|
||||||
|
item.Content = content.String
|
||||||
|
}
|
||||||
|
if author.Valid {
|
||||||
|
item.Author = author.String
|
||||||
|
}
|
||||||
|
if pubDate.Valid {
|
||||||
|
item.PubDate = pubDate.Time
|
||||||
|
}
|
||||||
|
if updatedAt.Valid {
|
||||||
|
item.UpdatedAt = updatedAt.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
items = append(items, item)
|
||||||
|
}
|
||||||
|
|
||||||
|
return items, rows.Err()
|
||||||
|
}
|
||||||
|
|
||||||
|
// SearchItems performs a full-text search on items
|
||||||
|
func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
|
||||||
|
rows, err := c.db.Query(`
|
||||||
|
SELECT i.id, i.feedUrl, i.guid, i.title, i.link, i.description, i.content, i.author, i.pubDate, i.discoveredAt, i.updatedAt
|
||||||
|
FROM items i
|
||||||
|
JOIN items_fts fts ON i.id = fts.rowid
|
||||||
|
WHERE items_fts MATCH ?
|
||||||
|
ORDER BY i.pubDate DESC
|
||||||
|
LIMIT ?
|
||||||
|
`, query, limit)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
var items []*Item
|
||||||
|
for rows.Next() {
|
||||||
|
item := &Item{}
|
||||||
|
var guid, title, link, description, content, author sql.NullString
|
||||||
|
var pubDate, updatedAt sql.NullTime
|
||||||
|
|
||||||
|
if err := rows.Scan(
|
||||||
|
&item.ID, &item.FeedURL, &guid, &title, &link,
|
||||||
|
&description, &content, &author, &pubDate,
|
||||||
|
&item.DiscoveredAt, &updatedAt,
|
||||||
|
); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if guid.Valid {
|
||||||
|
item.GUID = guid.String
|
||||||
|
}
|
||||||
|
if title.Valid {
|
||||||
|
item.Title = title.String
|
||||||
|
}
|
||||||
|
if link.Valid {
|
||||||
|
item.Link = link.String
|
||||||
|
}
|
||||||
|
if description.Valid {
|
||||||
|
item.Description = description.String
|
||||||
|
}
|
||||||
|
if content.Valid {
|
||||||
|
item.Content = content.String
|
||||||
|
}
|
||||||
|
if author.Valid {
|
||||||
|
item.Author = author.String
|
||||||
|
}
|
||||||
|
if pubDate.Valid {
|
||||||
|
item.PubDate = pubDate.Time
|
||||||
|
}
|
||||||
|
if updatedAt.Valid {
|
||||||
|
item.UpdatedAt = updatedAt.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
items = append(items, item)
|
||||||
|
}
|
||||||
|
|
||||||
|
return items, rows.Err()
|
||||||
|
}
|
||||||
|
|
||||||
|
// CleanupOldItems removes items older than 12 months
|
||||||
|
func (c *Crawler) CleanupOldItems() (int64, error) {
|
||||||
|
cutoff := time.Now().AddDate(-1, 0, 0) // 12 months ago
|
||||||
|
result, err := c.db.Exec(`
|
||||||
|
DELETE FROM items WHERE pubDate < ? AND pubDate IS NOT NULL
|
||||||
|
`, cutoff)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, err
|
return 0, err
|
||||||
}
|
}
|
||||||
defer iter.Close()
|
return result.RowsAffected()
|
||||||
|
|
||||||
for iter.First(); iter.Valid(); iter.Next() {
|
|
||||||
count++
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := iter.Error(); err != nil {
|
|
||||||
return 0, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return count, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// processFeed parses and stores a feed with full metadata
|
// processFeed parses and stores a feed with full metadata
|
||||||
@@ -179,12 +699,13 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
|
|||||||
LastModified: headers.Get("Last-Modified"),
|
LastModified: headers.Get("Last-Modified"),
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse feed-specific metadata
|
// Parse feed-specific metadata and items
|
||||||
|
var items []*Item
|
||||||
switch feedType {
|
switch feedType {
|
||||||
case "rss":
|
case "rss":
|
||||||
c.parseRSSMetadata(body, feed)
|
items = c.parseRSSMetadata(body, feed)
|
||||||
case "atom":
|
case "atom":
|
||||||
c.parseAtomMetadata(body, feed)
|
items = c.parseAtomMetadata(body, feed)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate next crawl time
|
// Calculate next crawl time
|
||||||
@@ -193,11 +714,17 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
|
|||||||
if err := c.saveFeed(feed); err != nil {
|
if err := c.saveFeed(feed); err != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Save items
|
||||||
|
if len(items) > 0 {
|
||||||
|
c.saveItems(items)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// addFeed adds a discovered feed URL (not yet fetched)
|
// addFeed adds a discovered feed URL (not yet fetched)
|
||||||
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
|
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
|
||||||
if strings.Contains(feedURL, "/comment") {
|
// Skip comment, category, and article feeds
|
||||||
|
if skip, _ := shouldSkipFeed(feedURL); skip {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -231,3 +758,141 @@ func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CheckFeed performs a conditional request to check if a feed has been updated
|
||||||
|
// Returns: changed (bool), error
|
||||||
|
func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
|
||||||
|
atomic.AddInt32(&c.feedsChecked, 1)
|
||||||
|
|
||||||
|
// Try different scheme/www combinations since we store URLs without scheme
|
||||||
|
urlVariants := []string{
|
||||||
|
"https://" + feed.URL,
|
||||||
|
"http://" + feed.URL,
|
||||||
|
"https://www." + feed.URL,
|
||||||
|
"http://www." + feed.URL,
|
||||||
|
}
|
||||||
|
|
||||||
|
var resp *http.Response
|
||||||
|
var err error
|
||||||
|
var successURL string
|
||||||
|
|
||||||
|
for _, tryURL := range urlVariants {
|
||||||
|
req, reqErr := http.NewRequest("GET", tryURL, nil)
|
||||||
|
if reqErr != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
req.Header.Set("User-Agent", c.UserAgent)
|
||||||
|
|
||||||
|
// Add conditional headers if we have them
|
||||||
|
if feed.ETag != "" {
|
||||||
|
req.Header.Set("If-None-Match", feed.ETag)
|
||||||
|
}
|
||||||
|
if feed.LastModified != "" {
|
||||||
|
req.Header.Set("If-Modified-Since", feed.LastModified)
|
||||||
|
}
|
||||||
|
|
||||||
|
resp, err = c.client.Do(req)
|
||||||
|
if err == nil {
|
||||||
|
successURL = tryURL
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
_ = successURL // May be used later for logging/debugging
|
||||||
|
|
||||||
|
// If no request succeeded, resp will be nil
|
||||||
|
if resp == nil {
|
||||||
|
if err == nil {
|
||||||
|
err = fmt.Errorf("all URL variants failed")
|
||||||
|
}
|
||||||
|
now := time.Now()
|
||||||
|
feed.LastCrawledAt = now
|
||||||
|
feed.ErrorCount++
|
||||||
|
feed.NoUpdate++
|
||||||
|
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
|
||||||
|
feed.LastError = err.Error()
|
||||||
|
feed.LastErrorAt = now
|
||||||
|
feed.Status = "error"
|
||||||
|
c.saveFeed(feed)
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
defer resp.Body.Close()
|
||||||
|
|
||||||
|
now := time.Now()
|
||||||
|
feed.LastCrawledAt = now
|
||||||
|
|
||||||
|
// 304 Not Modified - feed hasn't changed
|
||||||
|
if resp.StatusCode == http.StatusNotModified {
|
||||||
|
feed.NoUpdate++
|
||||||
|
// Adaptive backoff: 100s base + 100s per consecutive no-change
|
||||||
|
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
|
||||||
|
feed.ErrorCount = 0
|
||||||
|
feed.LastError = ""
|
||||||
|
feed.Status = "active"
|
||||||
|
c.saveFeed(feed)
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Non-200 response
|
||||||
|
if resp.StatusCode != http.StatusOK {
|
||||||
|
feed.ErrorCount++
|
||||||
|
feed.NoUpdate++
|
||||||
|
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
|
||||||
|
feed.LastError = resp.Status
|
||||||
|
feed.LastErrorAt = now
|
||||||
|
if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusGone {
|
||||||
|
feed.Status = "dead"
|
||||||
|
} else {
|
||||||
|
feed.Status = "error"
|
||||||
|
}
|
||||||
|
c.saveFeed(feed)
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// 200 OK - feed has new content
|
||||||
|
bodyBytes, err := io.ReadAll(resp.Body)
|
||||||
|
if err != nil {
|
||||||
|
feed.ErrorCount++
|
||||||
|
feed.NoUpdate++
|
||||||
|
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
|
||||||
|
feed.LastError = err.Error()
|
||||||
|
feed.LastErrorAt = now
|
||||||
|
feed.Status = "error"
|
||||||
|
c.saveFeed(feed)
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
|
||||||
|
body := string(bodyBytes)
|
||||||
|
|
||||||
|
// Update cache headers
|
||||||
|
feed.ETag = resp.Header.Get("ETag")
|
||||||
|
feed.LastModified = resp.Header.Get("Last-Modified")
|
||||||
|
|
||||||
|
// Re-detect type and parse metadata
|
||||||
|
feedType := c.detectFeedType(body)
|
||||||
|
feed.Type = feedType
|
||||||
|
|
||||||
|
var items []*Item
|
||||||
|
switch feedType {
|
||||||
|
case "rss":
|
||||||
|
items = c.parseRSSMetadata(body, feed)
|
||||||
|
case "atom":
|
||||||
|
items = c.parseAtomMetadata(body, feed)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Content changed - reset backoff
|
||||||
|
feed.NoUpdate = 0
|
||||||
|
feed.NextCrawlAt = now.Add(100 * time.Second)
|
||||||
|
feed.ErrorCount = 0
|
||||||
|
feed.LastError = ""
|
||||||
|
feed.Status = "active"
|
||||||
|
c.saveFeed(feed)
|
||||||
|
|
||||||
|
// Save items
|
||||||
|
if len(items) > 0 {
|
||||||
|
c.saveItems(items)
|
||||||
|
}
|
||||||
|
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
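The adaptive interval used throughout CheckFeed works out to (100 + 100*NoUpdate) seconds, with NoUpdate incremented before scheduling on every unchanged or failed check and reset to zero when new content arrives. A small helper makes the resulting schedule easy to read; the function name is illustrative only:

  func nextCheckDelay(noUpdate int) time.Duration {
      return time.Duration(100+100*noUpdate) * time.Second
  }

  // After one unchanged check:  nextCheckDelay(1) = 200s
  // After six unchanged checks: nextCheckDelay(6) = 700s
  // After new content:          NoUpdate resets and the next check runs 100s later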
|
|||||||
@@ -6,7 +6,13 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
crawler, err := NewCrawler("feeds.db")
|
// Ensure feeds directory exists
|
||||||
|
if err := os.MkdirAll("feeds", 0755); err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "Error creating feeds directory: %v\n", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
crawler, err := NewCrawler("feeds/feeds.db")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Fprintf(os.Stderr, "Error initializing crawler: %v\n", err)
|
fmt.Fprintf(os.Stderr, "Error initializing crawler: %v\n", err)
|
||||||
os.Exit(1)
|
os.Exit(1)
|
||||||
@@ -20,11 +26,24 @@ func main() {
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
// Import domains from vertices file (only adds new ones as "uncrawled")
|
// Initialize stats in background (can be slow with large DBs)
|
||||||
crawler.ImportDomainsFromFile("vertices.txt.gz", 0)
|
go crawler.UpdateStats()
|
||||||
|
|
||||||
// Crawl all uncrawled domains (runs continuously)
|
// Start all loops independently
|
||||||
for {
|
fmt.Println("Starting import, crawl, check, and stats loops...")
|
||||||
crawler.CrawlUncrawledDomains()
|
|
||||||
}
|
// Import loop (background)
|
||||||
|
go crawler.ImportDomainsInBackground("vertices.txt.gz")
|
||||||
|
|
||||||
|
// Check loop (background)
|
||||||
|
go crawler.StartCheckLoop()
|
||||||
|
|
||||||
|
// Stats loop (background) - updates once per minute
|
||||||
|
go crawler.StartStatsLoop()
|
||||||
|
|
||||||
|
// Cleanup loop (background) - removes old items once per hour
|
||||||
|
go crawler.StartCleanupLoop()
|
||||||
|
|
||||||
|
// Crawl loop (foreground - blocks forever)
|
||||||
|
crawler.StartCrawlLoop()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,9 +26,14 @@ type RSSChannel struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type RSSItem struct {
|
type RSSItem struct {
|
||||||
Title string `xml:"title"`
|
Title string `xml:"title"`
|
||||||
Link string `xml:"link"`
|
Link string `xml:"link"`
|
||||||
PubDate string `xml:"pubDate"`
|
GUID string `xml:"guid"`
|
||||||
|
Description string `xml:"description"`
|
||||||
|
Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
|
||||||
|
Author string `xml:"author"`
|
||||||
|
Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
|
||||||
|
PubDate string `xml:"pubDate"`
|
||||||
}
|
}
|
||||||
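The namespace-qualified tags added above use encoding/xml's "namespace-URL local-name" form, so <content:encoded> and <dc:creator> elements land in Content and Creator. An illustrative, made-up item for reference (parseRSSMetadata below copies Creator into Item.Author only when <author> is absent):

  const sampleRSSItem = `<item
      xmlns:content="http://purl.org/rss/1.0/modules/content/"
      xmlns:dc="http://purl.org/dc/elements/1.1/">
    <title>Hello world</title>
    <guid>https://example.com/hello</guid>
    <dc:creator>Jane Doe</dc:creator>
    <content:encoded><![CDATA[<p>Body</p>]]></content:encoded>
    <pubDate>Mon, 02 Jan 2006 15:04:05 MST</pubDate>
  </item>`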
|
|
||||||
// Atom structs for parsing
|
// Atom structs for parsing
|
||||||
@@ -40,10 +45,23 @@ type AtomFeed struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type AtomEntry struct {
|
type AtomEntry struct {
|
||||||
Title string `xml:"title"`
|
ID string `xml:"id"`
|
||||||
Links []AtomLink `xml:"link"`
|
Title string `xml:"title"`
|
||||||
Updated string `xml:"updated"`
|
Links []AtomLink `xml:"link"`
|
||||||
Published string `xml:"published"`
|
Summary string `xml:"summary"`
|
||||||
|
Content AtomContent `xml:"content"`
|
||||||
|
Author AtomAuthor `xml:"author"`
|
||||||
|
Updated string `xml:"updated"`
|
||||||
|
Published string `xml:"published"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type AtomContent struct {
|
||||||
|
Type string `xml:"type,attr"`
|
||||||
|
Value string `xml:",chardata"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type AtomAuthor struct {
|
||||||
|
Name string `xml:"name"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type AtomLink struct {
|
type AtomLink struct {
|
||||||
@@ -52,10 +70,10 @@ type AtomLink struct {
|
|||||||
Type string `xml:"type,attr"`
|
Type string `xml:"type,attr"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Crawler) parseRSSMetadata(body string, feed *Feed) {
|
func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
|
||||||
var rss RSS
|
var rss RSS
|
||||||
if err := xml.Unmarshal([]byte(body), &rss); err != nil {
|
if err := xml.Unmarshal([]byte(body), &rss); err != nil {
|
||||||
return
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
ch := rss.Channel
|
ch := rss.Channel
|
||||||
@@ -75,16 +93,47 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Analyze item dates
|
// Parse items
|
||||||
|
now := time.Now()
|
||||||
|
var items []*Item
|
||||||
var dates []time.Time
|
var dates []time.Time
|
||||||
for _, item := range ch.Items {
|
|
||||||
if item.PubDate != "" {
|
for _, rssItem := range ch.Items {
|
||||||
if t, err := parseRSSDate(item.PubDate); err == nil {
|
item := &Item{
|
||||||
|
FeedURL: feed.URL,
|
||||||
|
Title: rssItem.Title,
|
||||||
|
Link: rssItem.Link,
|
||||||
|
Description: rssItem.Description,
|
||||||
|
Content: rssItem.Content,
|
||||||
|
DiscoveredAt: now,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use GUID if available, otherwise use link
|
||||||
|
if rssItem.GUID != "" {
|
||||||
|
item.GUID = rssItem.GUID
|
||||||
|
} else if rssItem.Link != "" {
|
||||||
|
item.GUID = rssItem.Link
|
||||||
|
}
|
||||||
|
|
||||||
|
// Author: prefer author, fall back to dc:creator
|
||||||
|
if rssItem.Author != "" {
|
||||||
|
item.Author = rssItem.Author
|
||||||
|
} else if rssItem.Creator != "" {
|
||||||
|
item.Author = rssItem.Creator
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse pubDate
|
||||||
|
if rssItem.PubDate != "" {
|
||||||
|
if t, err := parseRSSDate(rssItem.PubDate); err == nil {
|
||||||
|
item.PubDate = t
|
||||||
dates = append(dates, t)
|
dates = append(dates, t)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
items = append(items, item)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Calculate date stats
|
||||||
if len(dates) > 0 {
|
if len(dates) > 0 {
|
||||||
oldest, newest := dates[0], dates[0]
|
oldest, newest := dates[0], dates[0]
|
||||||
for _, d := range dates {
|
for _, d := range dates {
|
||||||
@@ -103,12 +152,14 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) {
|
|||||||
feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
|
feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return items
|
||||||
}
|
}
|
||||||
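As a worked example of the frequency figure above, assuming totalHours is the span between the newest and oldest item dates (the line computing it falls outside this hunk): a feed whose seven dated items cover 72 hours ends up with AvgPostFreqHrs = 72 / (7 - 1) = 12, roughly one post every 12 hours.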
|
|
||||||
func (c *Crawler) parseAtomMetadata(body string, feed *Feed) {
|
func (c *Crawler) parseAtomMetadata(body string, feed *Feed) []*Item {
|
||||||
var atom AtomFeed
|
var atom AtomFeed
|
||||||
if err := xml.Unmarshal([]byte(body), &atom); err != nil {
|
if err := xml.Unmarshal([]byte(body), &atom); err != nil {
|
||||||
return
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
feed.Title = atom.Title
|
feed.Title = atom.Title
|
||||||
@@ -131,20 +182,60 @@ func (c *Crawler) parseAtomMetadata(body string, feed *Feed) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Analyze entry dates
|
// Parse entries
|
||||||
|
now := time.Now()
|
||||||
|
var items []*Item
|
||||||
var dates []time.Time
|
var dates []time.Time
|
||||||
|
|
||||||
for _, entry := range atom.Entries {
|
for _, entry := range atom.Entries {
|
||||||
|
item := &Item{
|
||||||
|
FeedURL: feed.URL,
|
||||||
|
Title: entry.Title,
|
||||||
|
Author: entry.Author.Name,
|
||||||
|
DiscoveredAt: now,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use ID as GUID
|
||||||
|
if entry.ID != "" {
|
||||||
|
item.GUID = entry.ID
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get link (prefer alternate, fall back to first link)
|
||||||
|
for _, link := range entry.Links {
|
||||||
|
if link.Rel == "" || link.Rel == "alternate" {
|
||||||
|
item.Link = link.Href
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if item.Link == "" && len(entry.Links) > 0 {
|
||||||
|
item.Link = entry.Links[0].Href
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use ID as GUID fallback if not set
|
||||||
|
if item.GUID == "" && item.Link != "" {
|
||||||
|
item.GUID = item.Link
|
||||||
|
}
|
||||||
|
|
||||||
|
// Summary/Content
|
||||||
|
item.Description = entry.Summary
|
||||||
|
item.Content = entry.Content.Value
|
||||||
|
|
||||||
|
// Parse dates
|
||||||
dateStr := entry.Updated
|
dateStr := entry.Updated
|
||||||
if dateStr == "" {
|
if dateStr == "" {
|
||||||
dateStr = entry.Published
|
dateStr = entry.Published
|
||||||
}
|
}
|
||||||
if dateStr != "" {
|
if dateStr != "" {
|
||||||
if t, err := time.Parse(time.RFC3339, dateStr); err == nil {
|
if t, err := time.Parse(time.RFC3339, dateStr); err == nil {
|
||||||
|
item.PubDate = t
|
||||||
dates = append(dates, t)
|
dates = append(dates, t)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
items = append(items, item)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Calculate date stats
|
||||||
if len(dates) > 0 {
|
if len(dates) > 0 {
|
||||||
oldest, newest := dates[0], dates[0]
|
oldest, newest := dates[0], dates[0]
|
||||||
for _, d := range dates {
|
for _, d := range dates {
|
||||||
@@ -163,6 +254,8 @@ func (c *Crawler) parseAtomMetadata(body string, feed *Feed) {
|
|||||||
feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
|
feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return items
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseRSSDate attempts to parse various RSS date formats
|
// parseRSSDate attempts to parse various RSS date formats
|
||||||
|
|||||||
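
The body of parseRSSDate sits outside this hunk. As a rough, minimal sketch of what a multi-format RSS date parser of this kind typically looks like (the layout list, the helper name parseRSSDateSketch, and the error text are assumptions, not the project's actual implementation):

// Illustrative sketch only; not the committed parseRSSDate.
// The layouts are an assumed, typical set for RSS <pubDate> values.
package main

import (
    "fmt"
    "strings"
    "time"
)

func parseRSSDateSketch(s string) (time.Time, error) {
    layouts := []string{
        time.RFC1123Z, // Mon, 02 Jan 2006 15:04:05 -0700
        time.RFC1123,  // Mon, 02 Jan 2006 15:04:05 MST
        time.RFC822Z,
        time.RFC822,
        time.RFC3339,
    }
    s = strings.TrimSpace(s)
    for _, layout := range layouts {
        if t, err := time.Parse(layout, s); err == nil {
            return t, nil // first layout that parses wins
        }
    }
    return time.Time{}, fmt.Errorf("unrecognized date format: %q", s)
}

func main() {
    t, err := parseRSSDateSketch("Mon, 02 Jan 2006 15:04:05 -0700")
    fmt.Println(t, err)
}
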
@@ -0,0 +1,55 @@
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, monospace;
    background: #0a0a0a;
    color: #ffffff;
    padding: 20px;
    line-height: 1.6;
}
h1 { color: #ffffff; margin-bottom: 20px; font-size: 24px; }
h2 { color: #ffffff; margin: 20px 0 10px; font-size: 14px; text-transform: uppercase; letter-spacing: 1px; }
.grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin-bottom: 20px; }
.card {
    background: #151515;
    border: 1px solid #252525;
    border-radius: 8px;
    padding: 15px;
}
.stat-value { font-size: 32px; font-weight: bold; color: #ffffff; }
.stat-label { font-size: 12px; color: #ffffff; text-transform: uppercase; }
.stat-row { display: flex; justify-content: space-between; padding: 5px 0; border-bottom: 1px solid #202020; color: #ffffff; }
.stat-row:last-child { border-bottom: none; }
.progress-bar {
    background: #202020;
    border-radius: 4px;
    height: 8px;
    margin-top: 10px;
    overflow: hidden;
}
.progress-fill {
    background: linear-gradient(90deg, #00aa55, #00cc66);
    height: 100%;
    transition: width 0.3s;
}
table { width: 100%; border-collapse: collapse; color: #ffffff; }
th, td { text-align: left; padding: 8px; border-bottom: 1px solid #202020; }
th { color: #ffffff; font-size: 11px; text-transform: uppercase; }
td { font-size: 13px; color: #ffffff; }
.type-rss { color: #f90; }
.type-atom { color: #09f; }
.type-unknown { color: #ffffff; }
.url {
    max-width: 400px;
    overflow: hidden;
    text-overflow: ellipsis;
    white-space: nowrap;
    color: #4a9eff;
}
.time { color: #ffffff; font-size: 12px; }
.updated { color: #ffffff; font-size: 11px; text-align: right; margin-top: 20px; }

/* Search */
#searchInput:focus { outline: none; border-color: #0af; }
#searchInput::placeholder { color: #555; }
.search-host { margin-bottom: 10px; }
.search-feed:hover { background: #1a1a1a; }
@@ -0,0 +1,519 @@
function initDashboard() {
  function commaFormat(n) {
    return n.toString().replace(/\B(?=(\d{3})+(?!\d))/g, ',');
  }

  function escapeHtml(text) {
    if (text == null) return '';
    const div = document.createElement('div');
    div.textContent = text;
    return div.innerHTML;
  }

  // All domains state
  let allDomainsOffset = 0;
  let allDomainsLoading = false;
  let allDomainsEnd = false;
  let expandedDomain = null;
  let expandedFeed = null;
  const PAGE_SIZE = 100;
  const PREFETCH_THRESHOLD = 100; // Prefetch when within 100 domains of bottom

  // Search state
  let searchTimeout = null;
  let isSearching = false;

  async function loadMoreDomains() {
    if (allDomainsLoading || allDomainsEnd) return;

    allDomainsLoading = true;
    const loadingEl = document.getElementById('allDomainsLoading');
    loadingEl.style.display = 'block';

    try {
      const response = await fetch('/api/allDomains?offset=' + allDomainsOffset + '&limit=' + PAGE_SIZE);
      const domains = await response.json();

      if (!domains || domains.length === 0) {
        allDomainsEnd = true;
        loadingEl.style.display = 'none';
        return;
      }

      const container = document.getElementById('allDomains');
      domains.forEach(d => {
        const row = document.createElement('div');
        row.className = 'domain-row';
        row.innerHTML =
          '<div class="stat-row" style="cursor: pointer;">' +
          '<span>' + escapeHtml(d.host) + '</span>' +
          '<span>' + commaFormat(d.feeds_found) + '</span>' +
          '</div>' +
          '<div class="domain-feeds" style="display: none;"></div>';

        row.querySelector('.stat-row').addEventListener('click', () => toggleDomainFeeds(d.host, row));
        container.appendChild(row);
      });

      allDomainsOffset += domains.length;
      loadingEl.style.display = 'none';

      // If we got fewer than PAGE_SIZE, we've reached the end
      if (domains.length < PAGE_SIZE) {
        allDomainsEnd = true;
      }
    } catch (err) {
      console.error('Failed to load domains:', err);
    } finally {
      allDomainsLoading = false;
    }
  }
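
loadMoreDomains pages through /api/allDomains with offset and limit query parameters, and treats an empty or short page as the end of the list. A minimal Go sketch of a handler that satisfies that paging contract; the two JSON keys (host, feeds_found) come from the JavaScript above, while the struct name, handler name, and everything else here are assumptions rather than the project's actual server code:

package main

import (
    "encoding/json"
    "net/http"
    "strconv"
)

// domainStatSketch mirrors the fields the dashboard reads from each array element.
type domainStatSketch struct {
    Host       string `json:"host"`
    FeedsFound int    `json:"feeds_found"`
}

// allDomainsHandlerSketch serves one page of domains; a page shorter than the
// requested limit (or an empty array) tells the client it has reached the end.
func allDomainsHandlerSketch(all []domainStatSketch) http.HandlerFunc {
    return func(w http.ResponseWriter, r *http.Request) {
        offset, _ := strconv.Atoi(r.URL.Query().Get("offset"))
        limit, _ := strconv.Atoi(r.URL.Query().Get("limit"))
        if offset < 0 {
            offset = 0
        }
        if limit <= 0 {
            limit = 100
        }
        if offset > len(all) {
            offset = len(all)
        }
        end := offset + limit
        if end > len(all) {
            end = len(all)
        }
        w.Header().Set("Content-Type", "application/json")
        json.NewEncoder(w).Encode(all[offset:end])
    }
}

func main() {
    sample := []domainStatSketch{{Host: "example.com", FeedsFound: 3}}
    http.Handle("/api/allDomains", allDomainsHandlerSketch(sample))
    // http.ListenAndServe(":4321", nil) // left out; the real dashboard wires up its own server
}
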
  async function toggleDomainFeeds(host, rowEl) {
    const feedsDiv = rowEl.querySelector('.domain-feeds');

    // Close previously expanded domain
    if (expandedDomain && expandedDomain !== rowEl) {
      expandedDomain.querySelector('.domain-feeds').style.display = 'none';
    }

    // Toggle current
    if (feedsDiv.style.display === 'none') {
      feedsDiv.style.display = 'block';
      feedsDiv.innerHTML = '<div style="padding: 10px; color: #666;">Loading feeds...</div>';
      expandedDomain = rowEl;

      try {
        const response = await fetch('/api/domainFeeds?host=' + encodeURIComponent(host));
        const feeds = await response.json();

        if (!feeds || feeds.length === 0) {
          feedsDiv.innerHTML = '<div style="padding: 10px; color: #666;">No feeds found</div>';
        } else {
          feedsDiv.innerHTML = '';
          feeds.forEach(f => {
            const feedItem = document.createElement('div');
            feedItem.className = 'feed-item';
            feedItem.style.cssText = 'padding: 5px 10px; border-top: 1px solid #333; cursor: pointer;';
            feedItem.innerHTML =
              '<div class="feed-header">' +
              '<div style="color: #0af;">' + escapeHtml(f.url) + '</div>' +
              (f.title ? '<div style="color: #888; font-size: 0.9em;">' + escapeHtml(f.title) + '</div>' : '') +
              '<div style="color: #666; font-size: 0.8em;">' + (f.type || 'unknown') + '</div>' +
              '</div>' +
              '<div class="feed-details" style="display: none;"></div>';

            feedItem.querySelector('.feed-header').addEventListener('click', (e) => {
              e.stopPropagation();
              toggleFeedInfo(f.url, feedItem);
            });
            feedsDiv.appendChild(feedItem);
          });
        }
      } catch (err) {
        feedsDiv.innerHTML = '<div style="padding: 10px; color: #f66;">Error loading feeds</div>';
      }
    } else {
      feedsDiv.style.display = 'none';
      expandedDomain = null;
    }
  }

  async function toggleFeedInfo(feedUrl, feedItemEl) {
    const detailsDiv = feedItemEl.querySelector('.feed-details');

    // Close previously expanded feed
    if (expandedFeed && expandedFeed !== feedItemEl) {
      expandedFeed.querySelector('.feed-details').style.display = 'none';
    }

    // Toggle current
    if (detailsDiv.style.display === 'none') {
      detailsDiv.style.display = 'block';
      detailsDiv.innerHTML = '<div style="padding: 10px; color: #666;">Loading feed info...</div>';
      expandedFeed = feedItemEl;

      // Scroll the feed item to the top of the viewport
      feedItemEl.scrollIntoView({ behavior: 'smooth', block: 'start' });

      try {
        // Fetch feed info and items in parallel
        const [infoResponse, itemsResponse] = await Promise.all([
          fetch('/api/feedInfo?url=' + encodeURIComponent(feedUrl)),
          fetch('/api/feedItems?url=' + encodeURIComponent(feedUrl) + '&limit=50')
        ]);
        const info = await infoResponse.json();
        const items = await itemsResponse.json();

        let html = '<div style="padding: 10px; background: #1a1a1a; margin-top: 5px; border-radius: 4px; font-size: 0.85em;">';

        if (info.description) {
          html += '<div style="margin-bottom: 8px; color: #aaa;">' + escapeHtml(info.description) + '</div>';
        }

        html += '<table style="width: 100%; color: #888;">';

        if (info.siteUrl) {
          html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Site</td><td>' + escapeHtml(info.siteUrl) + '</td></tr>';
        }
        if (info.language) {
          html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Language</td><td>' + escapeHtml(info.language) + '</td></tr>';
        }
        if (info.status) {
          html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Status</td><td>' + escapeHtml(info.status) + '</td></tr>';
        }
        if (info.itemCount) {
          html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Items</td><td>' + commaFormat(info.itemCount) + '</td></tr>';
        }
        if (info.avgPostFreqHrs) {
          html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Avg Post Freq</td><td>' + info.avgPostFreqHrs.toFixed(1) + ' hrs</td></tr>';
        }
        if (info.ttlMinutes) {
          html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">TTL</td><td>' + info.ttlMinutes + ' min</td></tr>';
        }
        if (info.updatePeriod) {
          let updateStr = info.updatePeriod;
          if (info.updateFreq) updateStr += ' (' + info.updateFreq + ')';
          html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Update</td><td>' + escapeHtml(updateStr) + '</td></tr>';
        }
        if (info.lastBuildDate) {
          html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Last Build</td><td>' + escapeHtml(info.lastBuildDate) + '</td></tr>';
        }
        if (info.newestItemDate) {
          html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Newest Item</td><td>' + escapeHtml(info.newestItemDate) + '</td></tr>';
        }
        if (info.oldestItemDate) {
          html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Oldest Item</td><td>' + escapeHtml(info.oldestItemDate) + '</td></tr>';
        }
        if (info.discoveredAt) {
          html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Discovered</td><td>' + escapeHtml(info.discoveredAt) + '</td></tr>';
        }
        if (info.lastCrawledAt) {
          html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Last Crawled</td><td>' + escapeHtml(info.lastCrawledAt) + '</td></tr>';
        }
        if (info.errorCount > 0) {
          html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Errors</td><td style="color: #f66;">' + info.errorCount + '</td></tr>';
        }
        if (info.lastError) {
          html += '<tr><td style="padding: 2px 8px 2px 0; color: #666;">Last Error</td><td style="color: #f66;">' + escapeHtml(info.lastError) + '</td></tr>';
        }

        html += '</table>';

        // Display items
        if (items && items.length > 0) {
          html += '<div style="margin-top: 12px; border-top: 1px solid #333; padding-top: 8px;">';
          html += '<div style="color: #666; margin-bottom: 6px; font-weight: bold;">Recent Items (' + items.length + ')</div>';

          items.forEach(item => {
            html += '<div style="padding: 6px 0; border-bottom: 1px solid #222;">';

            // Title with link
            if (item.title) {
              if (item.link) {
                html += '<div><a href="' + escapeHtml(item.link) + '" target="_blank" style="color: #0af; text-decoration: none;">' + escapeHtml(item.title) + '</a></div>';
              } else {
                html += '<div style="color: #ccc;">' + escapeHtml(item.title) + '</div>';
              }
            } else if (item.link) {
              html += '<div><a href="' + escapeHtml(item.link) + '" target="_blank" style="color: #0af; text-decoration: none;">' + escapeHtml(item.link) + '</a></div>';
            }

            // Metadata line (date, author)
            let meta = [];
            if (item.pub_date) {
              const date = new Date(item.pub_date);
              meta.push(date.toLocaleDateString() + ' ' + date.toLocaleTimeString());
            }
            if (item.author) {
              meta.push(escapeHtml(item.author));
            }
            if (meta.length > 0) {
              html += '<div style="color: #666; font-size: 0.85em;">' + meta.join(' • ') + '</div>';
            }

            html += '</div>';
          });

          html += '</div>';
        }

        html += '</div>';

        detailsDiv.innerHTML = html;
      } catch (err) {
        detailsDiv.innerHTML = '<div style="padding: 10px; color: #f66;">Error loading feed info</div>';
      }
    } else {
      detailsDiv.style.display = 'none';
      expandedFeed = null;
    }
  }
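
toggleFeedInfo simply renders whatever /api/feedInfo and /api/feedItems return. A sketch of the JSON shapes the code above appears to assume, written as Go structs: the JSON keys are taken from the JavaScript, while the struct names and field types are assumptions.

package main

import (
    "encoding/json"
    "fmt"
)

// feedInfoSketch: one object per feed, as read by toggleFeedInfo above.
type feedInfoSketch struct {
    Description    string  `json:"description,omitempty"`
    SiteURL        string  `json:"siteUrl,omitempty"`
    Language       string  `json:"language,omitempty"`
    Status         string  `json:"status,omitempty"`
    ItemCount      int     `json:"itemCount,omitempty"`
    AvgPostFreqHrs float64 `json:"avgPostFreqHrs,omitempty"`
    TTLMinutes     int     `json:"ttlMinutes,omitempty"`
    UpdatePeriod   string  `json:"updatePeriod,omitempty"`
    UpdateFreq     int     `json:"updateFreq,omitempty"`
    LastBuildDate  string  `json:"lastBuildDate,omitempty"`
    NewestItemDate string  `json:"newestItemDate,omitempty"`
    OldestItemDate string  `json:"oldestItemDate,omitempty"`
    DiscoveredAt   string  `json:"discoveredAt,omitempty"`
    LastCrawledAt  string  `json:"lastCrawledAt,omitempty"`
    ErrorCount     int     `json:"errorCount,omitempty"`
    LastError      string  `json:"lastError,omitempty"`
}

// feedItemSketch: one element of the /api/feedItems array.
type feedItemSketch struct {
    Title   string `json:"title,omitempty"`
    Link    string `json:"link,omitempty"`
    Author  string `json:"author,omitempty"`
    PubDate string `json:"pub_date,omitempty"`
}

func main() {
    b, _ := json.MarshalIndent(feedInfoSketch{SiteURL: "https://example.com", ItemCount: 12}, "", "  ")
    fmt.Println(string(b))
}
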
  // Infinite scroll handler with prefetch (uses window scroll)
  function setupInfiniteScroll() {
    window.addEventListener('scroll', () => {
      // Check if we're near the bottom of the page
      const scrollBottom = window.scrollY + window.innerHeight;
      const docHeight = document.documentElement.scrollHeight;
      const remainingPixels = docHeight - scrollBottom;

      // Prefetch when within 500px of the bottom
      if (remainingPixels < 500) {
        loadMoreDomains();
      }
    });
  }

  // Search functionality
  function setupSearch() {
    const searchInput = document.getElementById('searchInput');
    const searchResults = document.getElementById('searchResults');
    const domainsContainer = document.getElementById('allDomainsContainer');

    if (!searchInput || !searchResults || !domainsContainer) {
      console.error('Search elements not found');
      return;
    }

    searchInput.addEventListener('input', (e) => {
      const query = e.target.value.trim();

      // Clear previous timeout
      if (searchTimeout) {
        clearTimeout(searchTimeout);
      }

      // If empty, show domains list
      if (!query) {
        searchResults.style.display = 'none';
        domainsContainer.style.display = 'block';
        isSearching = false;
        return;
      }

      // Debounce search
      searchTimeout = setTimeout(() => performSearch(query), 300);
    });

    // Handle Enter key
    searchInput.addEventListener('keydown', (e) => {
      if (e.key === 'Enter') {
        const query = e.target.value.trim();
        if (query) {
          if (searchTimeout) clearTimeout(searchTimeout);
          performSearch(query);
        }
      }
    });
  }

  async function performSearch(query) {
    const searchResults = document.getElementById('searchResults');
    const domainsContainer = document.getElementById('allDomainsContainer');

    isSearching = true;
    domainsContainer.style.display = 'none';
    searchResults.style.display = 'block';
    searchResults.innerHTML = '<div style="padding: 20px; color: #666; text-align: center;">Searching...</div>';

    try {
      const response = await fetch('/api/search?q=' + encodeURIComponent(query) + '&limit=200');
      const results = await response.json();

      if (!results || results.length === 0) {
        searchResults.innerHTML = '<div style="padding: 20px; color: #666; text-align: center;">No results found</div>';
        return;
      }

      // Group results by host
      const byHost = {};
      results.forEach(r => {
        const host = r.feed.source_host || 'unknown';
        if (!byHost[host]) {
          byHost[host] = [];
        }
        byHost[host].push(r);
      });

      // Render results
      searchResults.innerHTML = '';

      Object.keys(byHost).sort().forEach(host => {
        const hostDiv = document.createElement('div');
        hostDiv.className = 'search-host';

        // Host header
        const hostHeader = document.createElement('div');
        hostHeader.className = 'stat-row';
        hostHeader.style.cssText = 'cursor: pointer; background: #1a1a1a; padding: 8px; margin-bottom: 2px;';
        hostHeader.innerHTML = '<span style="color: #0af;">' + escapeHtml(host) + '</span><span style="color: #666;">' + byHost[host].length + ' feed(s)</span>';

        const feedsContainer = document.createElement('div');
        feedsContainer.style.display = 'block';

        byHost[host].forEach(result => {
          const feedDiv = document.createElement('div');
          feedDiv.className = 'search-feed';
          feedDiv.style.cssText = 'padding: 8px 8px 8px 20px; border-bottom: 1px solid #222;';

          // Feed header
          let feedHtml = '<div style="color: #0af; cursor: pointer;" class="feed-url">' + escapeHtml(result.feed.url) + '</div>';
          if (result.feed.title) {
            feedHtml += '<div style="color: #aaa; font-size: 0.9em;">' + escapeHtml(result.feed.title) + '</div>';
          }
          if (result.feed.description) {
            feedHtml += '<div style="color: #666; font-size: 0.85em; margin-top: 2px;">' + escapeHtml(result.feed.description.substring(0, 200)) + '</div>';
          }

          // Items
          if (result.items && result.items.length > 0) {
            feedHtml += '<div class="search-items" style="margin-top: 8px; padding-left: 10px; border-left: 2px solid #333;">';
            result.items.forEach(item => {
              feedHtml += '<div style="padding: 4px 0; border-bottom: 1px solid #1a1a1a;">';
              if (item.title) {
                if (item.link) {
                  feedHtml += '<a href="' + escapeHtml(item.link) + '" target="_blank" style="color: #6cf; text-decoration: none;">' + escapeHtml(item.title) + '</a>';
                } else {
                  feedHtml += '<span style="color: #ccc;">' + escapeHtml(item.title) + '</span>';
                }
              }
              let meta = [];
              if (item.pub_date) {
                meta.push(item.pub_date.substring(0, 10));
              }
              if (item.author) {
                meta.push(escapeHtml(item.author));
              }
              if (meta.length > 0) {
                feedHtml += '<div style="color: #555; font-size: 0.8em;">' + meta.join(' • ') + '</div>';
              }
              feedHtml += '</div>';
            });
            feedHtml += '</div>';
          }

          feedDiv.innerHTML = feedHtml;

          // Click on feed URL to toggle full feed info
          feedDiv.querySelector('.feed-url').addEventListener('click', () => {
            toggleSearchFeedInfo(result.feed.url, feedDiv);
          });

          feedsContainer.appendChild(feedDiv);
        });

        hostHeader.addEventListener('click', () => {
          feedsContainer.style.display = feedsContainer.style.display === 'none' ? 'block' : 'none';
        });

        hostDiv.appendChild(hostHeader);
        hostDiv.appendChild(feedsContainer);
        searchResults.appendChild(hostDiv);
      });

    } catch (err) {
      console.error('Search failed:', err);
      searchResults.innerHTML = '<div style="padding: 20px; color: #f66; text-align: center;">Search failed: ' + escapeHtml(err.message) + '</div>';
    }
  }

  async function toggleSearchFeedInfo(feedUrl, feedDiv) {
    let detailsDiv = feedDiv.querySelector('.feed-details-expanded');

    if (detailsDiv) {
      detailsDiv.remove();
      return;
    }

    detailsDiv = document.createElement('div');
    detailsDiv.className = 'feed-details-expanded';
    detailsDiv.style.cssText = 'padding: 10px; background: #111; margin-top: 8px; border-radius: 4px;';
    detailsDiv.innerHTML = '<div style="color: #666;">Loading feed info...</div>';
    feedDiv.appendChild(detailsDiv);

    try {
      const [infoResponse, itemsResponse] = await Promise.all([
        fetch('/api/feedInfo?url=' + encodeURIComponent(feedUrl)),
        fetch('/api/feedItems?url=' + encodeURIComponent(feedUrl) + '&limit=20')
      ]);
      const info = await infoResponse.json();
      const items = await itemsResponse.json();

      let html = '<table style="width: 100%; color: #888; font-size: 0.85em;">';
      if (info.siteUrl) html += '<tr><td style="color: #555; padding: 2px 8px 2px 0;">Site</td><td>' + escapeHtml(info.siteUrl) + '</td></tr>';
      if (info.language) html += '<tr><td style="color: #555; padding: 2px 8px 2px 0;">Language</td><td>' + escapeHtml(info.language) + '</td></tr>';
      if (info.status) html += '<tr><td style="color: #555; padding: 2px 8px 2px 0;">Status</td><td>' + escapeHtml(info.status) + '</td></tr>';
      if (info.itemCount) html += '<tr><td style="color: #555; padding: 2px 8px 2px 0;">Items</td><td>' + commaFormat(info.itemCount) + '</td></tr>';
      if (info.avgPostFreqHrs) html += '<tr><td style="color: #555; padding: 2px 8px 2px 0;">Avg Freq</td><td>' + info.avgPostFreqHrs.toFixed(1) + ' hrs</td></tr>';
      if (info.newestItemDate) html += '<tr><td style="color: #555; padding: 2px 8px 2px 0;">Newest</td><td>' + escapeHtml(info.newestItemDate) + '</td></tr>';
      html += '</table>';

      if (items && items.length > 0) {
        html += '<div style="margin-top: 10px; border-top: 1px solid #222; padding-top: 8px;">';
        html += '<div style="color: #555; margin-bottom: 4px;">All Items (' + items.length + ')</div>';
        items.forEach(item => {
          html += '<div style="padding: 3px 0; border-bottom: 1px solid #1a1a1a;">';
          if (item.title && item.link) {
            html += '<a href="' + escapeHtml(item.link) + '" target="_blank" style="color: #0af; text-decoration: none; font-size: 0.9em;">' + escapeHtml(item.title) + '</a>';
          } else if (item.title) {
            html += '<span style="color: #aaa; font-size: 0.9em;">' + escapeHtml(item.title) + '</span>';
          }
          html += '</div>';
        });
        html += '</div>';
      }

      detailsDiv.innerHTML = html;
    } catch (err) {
      detailsDiv.innerHTML = '<div style="color: #f66;">Failed to load feed info</div>';
    }
  }

  async function updateStats() {
    try {
      const response = await fetch('/api/stats');
      const stats = await response.json();

      // Update domain stats
      document.getElementById('totalDomains').textContent = commaFormat(stats.total_domains);
      document.getElementById('checkedDomains').textContent = commaFormat(stats.checked_domains);
      document.getElementById('uncheckedDomains').textContent = commaFormat(stats.unchecked_domains);
      document.getElementById('crawlRate').textContent = commaFormat(stats.crawl_rate);
      document.getElementById('checkRate').textContent = commaFormat(stats.check_rate);

      // Update progress bar
      const progress = stats.total_domains > 0
        ? (stats.checked_domains * 100 / stats.total_domains).toFixed(1)
        : 0;
      document.getElementById('crawlProgress').style.width = progress + '%';

      // Update feed stats
      document.getElementById('totalFeeds').textContent = commaFormat(stats.total_feeds);
      document.getElementById('rssFeeds').textContent = commaFormat(stats.rss_feeds);
      document.getElementById('atomFeeds').textContent = commaFormat(stats.atom_feeds);
      document.getElementById('unknownFeeds').textContent = commaFormat(stats.unknown_feeds);

      // Update timestamp
      const updatedAt = new Date(stats.updated_at);
      document.getElementById('updatedAt').textContent = 'Last updated: ' +
        updatedAt.toISOString().replace('T', ' ').substring(0, 19);

    } catch (err) {
      console.error('Failed to update stats:', err);
    }
  }
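
updateStats polls /api/stats once per second and reads the keys below. A sketch of that payload as a Go struct: the json tags match the fields the dashboard reads, while the struct name and field types are assumptions (the crawler declares a DashboardStats type, but its definition is not part of this diff).

package main

import (
    "encoding/json"
    "fmt"
    "time"
)

// dashboardStatsSketch mirrors the /api/stats payload consumed by updateStats.
type dashboardStatsSketch struct {
    TotalDomains     int       `json:"total_domains"`
    CheckedDomains   int       `json:"checked_domains"`
    UncheckedDomains int       `json:"unchecked_domains"`
    CrawlRate        int       `json:"crawl_rate"`
    CheckRate        int       `json:"check_rate"`
    TotalFeeds       int       `json:"total_feeds"`
    RSSFeeds         int       `json:"rss_feeds"`
    AtomFeeds        int       `json:"atom_feeds"`
    UnknownFeeds     int       `json:"unknown_feeds"`
    UpdatedAt        time.Time `json:"updated_at"` // RFC 3339; parsed by new Date(...) on the client
}

func main() {
    b, _ := json.Marshal(dashboardStatsSketch{TotalDomains: 1000, CheckedDomains: 250, UpdatedAt: time.Now()})
    fmt.Println(string(b))
}
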
  // Initialize
  try {
    setupSearch();
  } catch (e) {
    console.error('setupSearch failed:', e);
  }
  setupInfiniteScroll();
  loadMoreDomains();
  updateStats();
  setInterval(updateStats, 1000);
}

window.onload = initDashboard;