Restore working codebase with all methods
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
@@ -6,3 +6,4 @@ feeds/
 .gitignore
 .claude
 CLAUDE.md
+.launch.sh
@@ -2,7 +2,7 @@

 This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

-> **Note:** Always run applications in containers via `docker compose up -d --build` when possible. This ensures proper networking between services (database, traefik, etc.) and matches the production environment.
+> **IMPORTANT:** Always use `./.launch.sh` to deploy changes. This script updates version numbers in static files (CSS/JS cache busting) before running `docker compose up -d --build`. Never use `docker compose` directly.

 ## Build & Run

@@ -84,17 +84,23 @@ PostgreSQL with pgx driver, using connection pooling:

 Column naming: snake_case (e.g., `source_host`, `pub_date`, `item_count`)

-### Crawl Logic
+### Processing Terminology

-1. Domains import as `pass` by default (auto-crawled)
-2. Crawl loop picks up domains where `last_crawled_at IS NULL`
-3. Full recursive crawl (HTTPS, fallback HTTP) up to MaxDepth=10, MaxPagesPerHost=10
+- **domain_check**: DNS lookup to verify domain is live
+- **feed_crawl**: Crawl a live domain to discover RSS/Atom feeds
+- **feed_check**: Check a known feed for new items
+
+### Domain Processing Flow
+
+1. Domains import as `pass` by default
+2. Domain loop runs **domain_check** (DNS lookup) for unchecked domains
+3. Domain loop runs **feed_crawl** for checked domains (recursive crawl up to MaxDepth=10, MaxPagesPerHost=10)
 4. Extract `<link rel="alternate">` and anchor hrefs containing rss/atom/feed
-5. Parse discovered feeds for metadata, save with next_crawl_at
+5. Parse discovered feeds for metadata, save with `next_check_at`

 ### Feed Checking

-Uses conditional HTTP (ETag, If-Modified-Since). Adaptive backoff: base 100s + 100s per consecutive no-change. Respects RSS `<ttl>` and Syndication namespace hints.
+**feed_check** uses conditional HTTP (ETag, If-Modified-Since). Adaptive backoff: base 100s + 100s per consecutive no-change. Respects RSS `<ttl>` and Syndication namespace hints.

 ### Publishing

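The adaptive backoff described in the Feed Checking section above can be condensed into a short sketch. This is illustrative only and not code from this commit; the function name, the `noChangeStreak` parameter, and the idea of treating a feed-supplied TTL hint as a minimum delay are assumptions.

```go
package main

import (
	"fmt"
	"time"
)

// nextCheckDelay sketches the feed_check backoff described above: a 100s base
// plus 100s per consecutive no-change check, with a feed-supplied TTL hint
// treated here (as an assumption) as a minimum delay.
func nextCheckDelay(noChangeStreak int, ttlHint time.Duration) time.Duration {
	delay := 100*time.Second + time.Duration(noChangeStreak)*100*time.Second
	if ttlHint > delay {
		delay = ttlHint
	}
	return delay
}

func main() {
	// After 5 consecutive unchanged checks the feed is rechecked in ~10 minutes.
	fmt.Println(nextCheckDelay(5, 0))              // 10m0s
	fmt.Println(nextCheckDelay(0, 30*time.Minute)) // 30m0s (TTL hint wins)
}
```

Under these assumptions, five consecutive unchanged checks push `next_check_at` out to roughly ten minutes.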
+4 -4
@@ -1,9 +1,9 @@
-FROM golang:1.24-alpine AS builder
+FROM golang:latest AS builder

 WORKDIR /app

 # Install build dependencies
-RUN apk add --no-cache gcc musl-dev
+RUN apt-get update && apt-get install -y gcc && rm -rf /var/lib/apt/lists/*

 # Copy go mod files first for layer caching
 COPY go.mod go.sum ./
@@ -17,12 +17,12 @@ COPY static/ ./static/
 RUN CGO_ENABLED=1 go build -o 1440.news .

 # Runtime stage
-FROM alpine:latest
+FROM ubuntu:latest

 WORKDIR /app

 # Install runtime dependencies
-RUN apk add --no-cache ca-certificates tzdata
+RUN apt-get update && apt-get install -y ca-certificates tzdata && rm -rf /var/lib/apt/lists/*

 # Copy binary from builder
 COPY --from=builder /app/1440.news .
+9 -9
@@ -26,8 +26,8 @@ func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) {
 Language string `json:"language,omitempty"`
 SiteURL string `json:"siteUrl,omitempty"`
 DiscoveredAt string `json:"discoveredAt,omitempty"`
-LastCrawledAt string `json:"lastCrawledAt,omitempty"`
+LastCheckedAt string `json:"lastCheckedAt,omitempty"`
-NextCrawlAt string `json:"nextCrawlAt,omitempty"`
+NextCheckAt string `json:"nextCheckAt,omitempty"`
 LastBuildDate string `json:"lastBuildDate,omitempty"`
 Status string `json:"status,omitempty"`
 LastError string `json:"lastError,omitempty"`
@@ -40,7 +40,7 @@ func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) {

 var f FeedDetails
 var category, title, description, language, siteUrl *string
-var lastCrawledAt, nextCrawlAt, lastBuildDate *time.Time
+var lastCheckedAt, nextCheckAt, lastBuildDate *time.Time
 var status, lastError *string
 var oldestItemDate, newestItemDate *time.Time
 var itemCount *int
@@ -49,7 +49,7 @@ func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) {

 err := c.db.QueryRow(`
 SELECT url, type, category, title, description, language, site_url,
-discovered_at, last_crawled_at, next_crawl_at, last_build_date,
+discovered_at, last_checked_at, next_check_at, last_build_date,
 status, last_error,
 (SELECT COUNT(*) FROM items WHERE feed_url = feeds.url) as item_count,
 oldest_item_date, newest_item_date,
@@ -57,7 +57,7 @@ func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) {
 FROM feeds WHERE url = $1
 `, feedURL).Scan(
 &f.URL, &f.Type, &category, &title, &description, &language, &siteUrl,
-&discoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
+&discoveredAt, &lastCheckedAt, &nextCheckAt, &lastBuildDate,
 &status, &lastError,
 &itemCount, &oldestItemDate, &newestItemDate,
 &publishStatus, &publishAccount,
@@ -78,11 +78,11 @@ func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) {
 f.Language = StringValue(language)
 f.SiteURL = StringValue(siteUrl)
 f.DiscoveredAt = discoveredAt.Format(time.RFC3339)
-if lastCrawledAt != nil {
+if lastCheckedAt != nil {
-f.LastCrawledAt = lastCrawledAt.Format(time.RFC3339)
+f.LastCheckedAt = lastCheckedAt.Format(time.RFC3339)
 }
-if nextCrawlAt != nil {
+if nextCheckAt != nil {
-f.NextCrawlAt = nextCrawlAt.Format(time.RFC3339)
+f.NextCheckAt = nextCheckAt.Format(time.RFC3339)
 }
 if lastBuildDate != nil {
 f.LastBuildDate = lastBuildDate.Format(time.RFC3339)
+30 -27
@@ -4,6 +4,7 @@ import (
 "encoding/json"
 "fmt"
 "net/http"
+"strings"
 "time"

 "github.com/jackc/pgx/v5"
@@ -16,16 +17,16 @@ type SearchResult struct {
 }

 type SearchFeed struct {
 URL string `json:"url"`
 Type string `json:"type"`
 Category string `json:"category"`
 Title string `json:"title"`
 Description string `json:"description"`
 Language string `json:"language"`
 SiteURL string `json:"site_url"`
 DiscoveredAt string `json:"discovered_at"`
-LastCrawledAt string `json:"last_crawled_at"`
+LastCheckedAt string `json:"last_checked_at"`
-NextCrawlAt string `json:"next_crawl_at"`
+NextCheckAt string `json:"next_check_at"`
 LastBuildDate string `json:"last_build_date"`
 Status string `json:"status"`
 LastError string `json:"last_error"`
@@ -76,7 +77,7 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
 var url string
 var feedType, category, title, description, language, siteUrl *string
 var discoveredAt time.Time
-var lastCrawledAt, nextCrawlAt, lastBuildDate *time.Time
+var lastCheckedAt, nextCheckAt, lastBuildDate *time.Time
 var itemCount *int
 var status, lastError *string
 var lastErrorAt *time.Time
@@ -85,7 +86,7 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
 var noUpdate *bool

 if err := rows.Scan(&url, &feedType, &category, &title, &description, &language, &siteUrl,
-&discoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
+&discoveredAt, &lastCheckedAt, &nextCheckAt, &lastBuildDate,
 &status, &lastError, &lastErrorAt,
 &sourceUrl, &sourceHost, &tld,
 &itemCount, &oldestItemDate, &newestItemDate, &noUpdate); err != nil {
@@ -110,11 +111,11 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
 SourceHost: StringValue(sourceHost),
 TLD: StringValue(tld),
 }
-if lastCrawledAt != nil {
+if lastCheckedAt != nil {
-sf.LastCrawledAt = lastCrawledAt.Format(time.RFC3339)
+sf.LastCheckedAt = lastCheckedAt.Format(time.RFC3339)
 }
-if nextCrawlAt != nil {
+if nextCheckAt != nil {
-sf.NextCrawlAt = nextCrawlAt.Format(time.RFC3339)
+sf.NextCheckAt = nextCheckAt.Format(time.RFC3339)
 }
 if lastBuildDate != nil {
 sf.LastBuildDate = lastBuildDate.Format(time.RFC3339)
@@ -138,16 +139,18 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
 }

 // Search feeds by source_host (LIKE search for domain matching)
+// Use LOWER() to leverage trigram index
+lowerPattern := "%" + strings.ToLower(query) + "%"
 hostRows, err := c.db.Query(`
 SELECT url, type, category, title, description, language, site_url,
-discovered_at, last_crawled_at, next_crawl_at, last_build_date,
+discovered_at, last_checked_at, next_check_at, last_build_date,
 status, last_error, last_error_at,
 source_url, source_host, tld,
 item_count, oldest_item_date, newest_item_date, no_update
 FROM feeds
-WHERE source_host ILIKE $1 OR url ILIKE $1
+WHERE LOWER(source_host) LIKE $1 OR LOWER(url) LIKE $1
 LIMIT $2
-`, "%"+query+"%", limit)
+`, lowerPattern, limit)
 if err == nil {
 defer hostRows.Close()
 for hostRows.Next() {
@@ -163,7 +166,7 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
 tsQuery := ToSearchQuery(query)
 feedRows, err := c.db.Query(`
 SELECT url, type, category, title, description, language, site_url,
-discovered_at, last_crawled_at, next_crawl_at, last_build_date,
+discovered_at, last_checked_at, next_check_at, last_build_date,
 status, last_error, last_error_at,
 source_url, source_host, tld,
 item_count, oldest_item_date, newest_item_date, no_update
@@ -228,7 +231,7 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
 // Fetch feed info for this item's feed
 var fType, fCategory, fTitle, fDesc, fLang, fSiteUrl *string
 var fDiscoveredAt time.Time
-var fLastCrawledAt, fNextCrawlAt, fLastBuildDate *time.Time
+var fLastCheckedAt, fNextCheckAt, fLastBuildDate *time.Time
 var fItemCount *int
 var fStatus, fLastError *string
 var fLastErrorAt *time.Time
@@ -238,13 +241,13 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {

 c.db.QueryRow(`
 SELECT type, category, title, description, language, site_url,
-discovered_at, last_crawled_at, next_crawl_at, last_build_date,
+discovered_at, last_checked_at, next_check_at, last_build_date,
 status, last_error, last_error_at,
 source_url, source_host, tld,
 item_count, oldest_item_date, newest_item_date, no_update
 FROM feeds WHERE url = $1
 `, feedUrl).Scan(&fType, &fCategory, &fTitle, &fDesc, &fLang, &fSiteUrl,
-&fDiscoveredAt, &fLastCrawledAt, &fNextCrawlAt, &fLastBuildDate,
+&fDiscoveredAt, &fLastCheckedAt, &fNextCheckAt, &fLastBuildDate,
 &fStatus, &fLastError, &fLastErrorAt,
 &fSourceUrl, &fSourceHost, &fTLD,
 &fItemCount, &fOldestItemDate, &fNewestItemDate, &fNoUpdate)
@@ -268,11 +271,11 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
 SourceHost: StringValue(fSourceHost),
 TLD: StringValue(fTLD),
 }
-if fLastCrawledAt != nil {
+if fLastCheckedAt != nil {
-sf.LastCrawledAt = fLastCrawledAt.Format(time.RFC3339)
+sf.LastCheckedAt = fLastCheckedAt.Format(time.RFC3339)
 }
-if fNextCrawlAt != nil {
+if fNextCheckAt != nil {
-sf.NextCrawlAt = fNextCrawlAt.Format(time.RFC3339)
+sf.NextCheckAt = fNextCheckAt.Format(time.RFC3339)
 }
 if fLastBuildDate != nil {
 sf.LastBuildDate = fLastBuildDate.Format(time.RFC3339)
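The comment added above ("Use LOWER() to leverage trigram index") explains why the ILIKE predicates became `LOWER(...) LIKE`. The supporting index is not part of this commit; the sketch below shows roughly what shape such an index would need (pg_trgm expression indexes on the lowercased columns) for the planner to use it. Index and column names here are assumptions, not taken from the repository.

```go
package main

import "fmt"

// assumedTrigramIndexDDL is a hypothetical migration matching the rewritten
// LOWER(source_host) LIKE / LOWER(url) LIKE predicates. The real schema may differ.
const assumedTrigramIndexDDL = `
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE INDEX IF NOT EXISTS feeds_source_host_lower_trgm
    ON feeds USING gin (LOWER(source_host) gin_trgm_ops);
CREATE INDEX IF NOT EXISTS feeds_url_lower_trgm
    ON feeds USING gin (LOWER(url) gin_trgm_ops);
`

func main() {
	fmt.Print(assumedTrigramIndexDDL)
}
```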
+91 -41
@@ -1,9 +1,11 @@
 package main

 import (
+"context"
 "encoding/json"
 "fmt"
 "io"
+"net"
 "net/http"
 "os"
 "strings"
@@ -15,23 +17,22 @@ import (
 )

 type Crawler struct {
 MaxDepth int
 MaxPagesPerHost int
 Timeout time.Duration
 UserAgent string
 visited sync.Map
 feedsMu sync.Mutex
 client *http.Client
-hostsProcessed int32
+domainsCrawled int32 // feed_crawl: domains crawled for feed discovery
-feedsChecked int32
+domainsChecked int32 // domain_check: domains checked for liveness
-startTime time.Time
+feedsChecked int32 // feed_check: feeds checked for new items
-db *DB
+startTime time.Time
-displayedCrawlRate int
+db *DB
-displayedCheckRate int
+domainsImported int32
-domainsImported int32
+cachedStats *DashboardStats
-cachedStats *DashboardStats
+cachedAllDomains []DomainStat
-cachedAllDomains []DomainStat
+statsMu sync.RWMutex
-statsMu sync.RWMutex
 }

 func NewCrawler(connString string) (*Crawler, error) {
@@ -467,43 +468,92 @@ func (c *Crawler) GetAllUnpublishedItems(limit int) ([]Item, error) {
 return items, nil
 }

-// StartCrawlLoop runs the domain crawling loop independently
+// dnsResolver uses local caching DNS (infra-dns) with fallback to system
-func (c *Crawler) StartCrawlLoop() {
+var dnsResolver = &net.Resolver{
-numWorkers := 100
+PreferGo: true,
+Dial: func(ctx context.Context, network, address string) (net.Conn, error) {
+d := net.Dialer{Timeout: 2 * time.Second}
+// Try local caching DNS first (CoreDNS on proxy network)
+conn, err := d.DialContext(ctx, "udp", "infra-dns:53")
+if err == nil {
+return conn, nil
+}
+// Fallback to system DNS
+return d.DialContext(ctx, network, address)
+},
+}
+
+// domainCheck performs a DNS lookup to check if a domain resolves
+func (c *Crawler) domainCheck(host string) error {
+ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+defer cancel()
+_, err := dnsResolver.LookupHost(ctx, host)
+return err
+}
+
+// StartDomainLoop runs the domain processing loop (domain_check + feed_crawl)
+func (c *Crawler) StartDomainLoop() {
+numWorkers := 1000

 // Buffered channel for domain work
-workChan := make(chan *Domain, 100)
+workChan := make(chan *Domain, 1000)

 // Start workers
 for i := 0; i < numWorkers; i++ {
 go func() {
 for domain := range workChan {
-feedsFound, crawlErr := c.crawlHost(domain.Host)
+fh := domain.FullHost()
-errStr := ""
+if domain.CrawledAt.Equal(DomainStateUnchecked) {
-if crawlErr != nil {
+// domain_check: DNS lookup for liveness
-errStr = crawlErr.Error()
+err := c.domainCheck(fh)
-}
+errStr := ""
-if err := c.markDomainCrawled(domain.Host, feedsFound, errStr); err != nil {
+if err != nil {
-fmt.Printf("Error marking domain %s as crawled: %v\n", domain.Host, err)
+errStr = err.Error()
+}
+if err := c.markDomainChecked(domain.Host, domain.TLD, errStr); err != nil {
+fmt.Printf("Error marking domain %s as checked: %v\n", fh, err)
+}
+atomic.AddInt32(&c.domainsChecked, 1)
+} else {
+// feed_crawl: crawl domain to discover feeds
+feedsFound, crawlErr := c.feedCrawl(fh)
+errStr := ""
+if crawlErr != nil {
+errStr = crawlErr.Error()
+}
+if err := c.markDomainCrawled(domain.Host, domain.TLD, feedsFound, errStr); err != nil {
+fmt.Printf("Error marking domain %s as crawled: %v\n", fh, err)
+}
+atomic.AddInt32(&c.domainsCrawled, 1)
 }
 }
 }()
 }

-const fetchSize = 100
+const fetchSize = 1000
 for {
-domains, err := c.GetDomainsToCrawl(fetchSize)
+domains, err := c.GetDomainsToProcess(fetchSize)
 if err != nil {
-fmt.Printf("Error fetching domains to crawl: %v\n", err)
+fmt.Printf("Error fetching domains to process: %v\n", err)
 }

 if len(domains) == 0 {
-c.displayedCrawlRate = 0
 time.Sleep(1 * time.Second)
 continue
 }

-fmt.Printf("%s crawl: %d domains to crawl\n", time.Now().Format("15:04:05"), len(domains))
+// Count unchecked vs checked for logging
+unchecked := 0
+for _, d := range domains {
+if d.CrawledAt.Equal(DomainStateUnchecked) {
+unchecked++
+}
+}
+checked := len(domains) - unchecked
+
+if unchecked > 0 || checked > 0 {
+fmt.Printf("%s domain: %d domain_check, %d feed_crawl\n", time.Now().Format("15:04:05"), unchecked, checked)
+}

 for _, domain := range domains {
 workChan <- domain
@@ -513,12 +563,12 @@ func (c *Crawler) StartCrawlLoop() {
 }
 }

-// StartCheckLoop runs the feed checking loop independently
+// StartFeedCheckLoop runs the feed_check loop (checking feeds for new items)
-func (c *Crawler) StartCheckLoop() {
+func (c *Crawler) StartFeedCheckLoop() {
-numWorkers := 100
+numWorkers := 1000

 // Buffered channel for feed work
-workChan := make(chan *Feed, 100)
+workChan := make(chan *Feed, 1000)

 // Start workers
 for i := 0; i < numWorkers; i++ {
@@ -537,12 +587,11 @@ func (c *Crawler) StartCheckLoop() {
 }

 if len(feeds) == 0 {
-c.displayedCheckRate = 0
 time.Sleep(1 * time.Second)
 continue
 }

-fmt.Printf("%s check: %d feeds to check\n", time.Now().Format("15:04:05"), len(feeds))
+fmt.Printf("%s feed_check: %d feeds\n", time.Now().Format("15:04:05"), len(feeds))

 for _, feed := range feeds {
 workChan <- feed
@@ -552,8 +601,9 @@ func (c *Crawler) StartCheckLoop() {
 }
 }

-func (c *Crawler) crawlHost(host string) (feedsFound int, err error) {
+// feedCrawl crawls a domain to discover RSS/Atom feeds
-atomic.AddInt32(&c.hostsProcessed, 1)
+func (c *Crawler) feedCrawl(host string) (feedsFound int, err error) {
+atomic.AddInt32(&c.domainsCrawled, 1)

 localVisited := make(map[string]bool)
 pagesVisited := 0
+55 -22
@@ -12,17 +12,26 @@ type DashboardStats struct {
 HoldDomains int `json:"hold_domains"`
 PassDomains int `json:"pass_domains"`
 SkipDomains int `json:"skip_domains"`
+DeadDomains int `json:"dead_domains"`

 // Feed stats
 TotalFeeds int `json:"total_feeds"`
+AliveFeeds int `json:"alive_feeds"` // status='pass' (healthy feeds)
+PublishFeeds int `json:"publish_feeds"` // publish_status='pass' (approved for publishing)
+SkipFeeds int `json:"skip_feeds"`
+HoldFeeds int `json:"hold_feeds"`
+DeadFeeds int `json:"dead_feeds"`
+EmptyFeeds int `json:"empty_feeds"`
 RSSFeeds int `json:"rss_feeds"`
 AtomFeeds int `json:"atom_feeds"`
+JSONFeeds int `json:"json_feeds"`
 UnknownFeeds int `json:"unknown_feeds"`

-// Crawl progress
+// Processing rates (per minute)
-HostsProcessed int32 `json:"hosts_processed"`
+DomainsCrawled int32 `json:"domains_crawled"` // feed_crawl count
-CrawlRate int `json:"crawl_rate"` // crawls per minute
+DomainCheckRate int `json:"domain_check_rate"` // domain_check per minute
-CheckRate int `json:"check_rate"` // feed checks per minute
+FeedCrawlRate int `json:"feed_crawl_rate"` // feed_crawl per minute
+FeedCheckRate int `json:"feed_check_rate"` // feed_check per minute

 // Timing
 UpdatedAt time.Time `json:"updated_at"`
@@ -122,28 +131,15 @@ func (c *Crawler) GetDashboardStats() (*DashboardStats, error) {
 func (c *Crawler) calculateStats() (*DashboardStats, error) {
 stats := &DashboardStats{
 UpdatedAt: time.Now(),
-HostsProcessed: c.hostsProcessed,
+DomainsCrawled: c.domainsCrawled,
 }

-// Calculate crawl rate (crawls per minute), smoothed by +/-1 per update
+// Calculate rates (per minute)
 elapsed := time.Since(c.startTime).Minutes()
 if elapsed > 0 {
-actualRate := int(float64(c.hostsProcessed) / elapsed)
+stats.DomainCheckRate = int(float64(c.domainsChecked) / elapsed)
-if actualRate > c.displayedCrawlRate {
+stats.FeedCrawlRate = int(float64(c.domainsCrawled) / elapsed)
-c.displayedCrawlRate++
+stats.FeedCheckRate = int(float64(c.feedsChecked) / elapsed)
-} else if actualRate < c.displayedCrawlRate {
-c.displayedCrawlRate--
-}
-stats.CrawlRate = c.displayedCrawlRate
-
-// Calculate check rate (feed checks per minute), smoothed by +/-1 per update
-actualCheckRate := int(float64(c.feedsChecked) / elapsed)
-if actualCheckRate > c.displayedCheckRate {
-c.displayedCheckRate++
-} else if actualCheckRate < c.displayedCheckRate {
-c.displayedCheckRate--
-}
-stats.CheckRate = c.displayedCheckRate
 }

 // Get domain stats
@@ -186,6 +182,8 @@ func (c *Crawler) collectDomainStats(stats *DashboardStats) error {
 stats.PassDomains = count
 case "skip":
 stats.SkipDomains = count
+case "dead":
+stats.DeadDomains = count
 }
 }
 if err := rows.Err(); err != nil {
@@ -202,6 +200,39 @@ func (c *Crawler) collectFeedStats(stats *DashboardStats) error {
 return err
 }

+// Get status counts
+statusRows, err := c.db.Query("SELECT status, COUNT(*) FROM feeds GROUP BY status")
+if err != nil {
+return err
+}
+defer statusRows.Close()
+
+for statusRows.Next() {
+var status *string
+var count int
+if err := statusRows.Scan(&status, &count); err != nil {
+continue
+}
+if status != nil {
+switch *status {
+case "pass":
+stats.AliveFeeds = count
+case "skip":
+stats.SkipFeeds = count
+case "hold":
+stats.HoldFeeds = count
+case "dead":
+stats.DeadFeeds = count
+}
+}
+}
+
+// Count feeds approved for publishing (publish_status='pass')
+c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE publish_status = 'pass'").Scan(&stats.PublishFeeds)
+
+// Count empty feeds (item_count = 0 or NULL)
+c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE item_count IS NULL OR item_count = 0").Scan(&stats.EmptyFeeds)
+
 // Single query to get all type counts (one index scan instead of three)
 rows, err := c.db.Query("SELECT type, COUNT(*) FROM feeds GROUP BY type")
 if err != nil {
@@ -223,6 +254,8 @@ func (c *Crawler) collectFeedStats(stats *DashboardStats) error {
 stats.RSSFeeds = count
 case "atom":
 stats.AtomFeeds = count
+case "json":
+stats.JSONFeeds = count
 default:
 stats.UnknownFeeds += count
 }
+4 -3
@@ -1,14 +1,15 @@
 services:
 app-1440-news:
 build: .
-container_name: app-1440-news
+image: atproto-1440news-app
+container_name: atproto-1440news-app
 restart: unless-stopped
 stop_grace_period: 30s
 env_file:
 - pds.env
 - oauth.env
 environment:
-DB_HOST: atproto-postgres
+DB_HOST: infra-postgres
 DB_PORT: 5432
 DB_USER: news_1440
 DB_PASSWORD_FILE: /run/secrets/db_password
@@ -54,7 +55,7 @@ services:

 secrets:
 db_password:
-file: ../postgres/secrets/news_1440_password.txt
+file: ../../../infra/postgres/secrets/news_1440_password.txt

 networks:
 proxy:
@@ -14,19 +14,38 @@ import (
 "github.com/jackc/pgx/v5"
 )

-// Domain represents a host to be crawled for feeds
+// Domain represents a host to process for feeds
 // Status: hold (pending review), pass (approved), skip (not processing)
+// CrawledAt: zero time = needs domain_check, +1 sec = needs feed_crawl, real time = done
 type Domain struct {
 Host string `json:"host"`
 Status string `json:"status"`
-DiscoveredAt time.Time `json:"discovered_at"`
+CrawledAt time.Time `json:"crawled_at"`
-LastCheckedAt time.Time `json:"last_checked_at,omitempty"`
+FeedsFound int `json:"feeds_found,omitempty"`
-LastCrawledAt time.Time `json:"last_crawled_at,omitempty"`
+LastError string `json:"last_error,omitempty"`
-FeedsFound int `json:"feeds_found,omitempty"`
+TLD string `json:"tld,omitempty"`
-LastError string `json:"last_error,omitempty"`
+MissCount int `json:"miss_count,omitempty"`
-TLD string `json:"tld,omitempty"`
 }

+// MissCountThreshold is the number of consecutive errors before setting status to hold
+const MissCountThreshold = 100
+
+// ErrorRetryDelay is how long to wait before retrying a domain with errors (1 hour minimum)
+// At 100 seconds actual rate due to queue, 100 misses = ~2.8 hours
+// At 1 hour minimum delay, 100 misses = ~4+ days in practice
+var ErrorRetryDelay = 1 * time.Hour
+
+// FullHost returns the complete hostname (host + tld)
+func (d *Domain) FullHost() string {
+return fullHost(d.Host, d.TLD)
+}
+
+// Sentinel values for domain processing state
+var (
+DomainStateUnchecked = time.Time{} // 0001-01-01 00:00:00 - needs domain_check
+DomainStateChecked = time.Time{}.Add(time.Second) // 0001-01-01 00:00:01 - needs feed_crawl
+)
+
 // shouldAutoSkipDomain checks if a domain should be auto-skipped based on patterns
 func shouldAutoSkipDomain(host string) bool {
 // Never skip our own domain
@@ -51,62 +70,63 @@ func shouldAutoSkipDomain(host string) bool {
 // saveDomain stores a domain in PostgreSQL
 func (c *Crawler) saveDomain(domain *Domain) error {
 // Auto-skip domains matching spam patterns
+fh := domain.FullHost()
 status := domain.Status
-if shouldAutoSkipDomain(domain.Host) {
+if shouldAutoSkipDomain(fh) {
 status = "skip"
 }

 _, err := c.db.Exec(`
-INSERT INTO domains (host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld)
+INSERT INTO domains (host, status, crawled_at, feeds_found, last_error, tld)
-VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
+VALUES ($1, $2, $3, $4, $5, $6)
-ON CONFLICT(host) DO UPDATE SET
+ON CONFLICT(host, tld) DO UPDATE SET
 status = EXCLUDED.status,
-last_checked_at = EXCLUDED.last_checked_at,
+crawled_at = EXCLUDED.crawled_at,
-last_crawled_at = EXCLUDED.last_crawled_at,
 feeds_found = EXCLUDED.feeds_found,
-last_error = EXCLUDED.last_error,
+last_error = EXCLUDED.last_error
-tld = EXCLUDED.tld
+`, stripTLD(fh), status, domain.CrawledAt,
-`, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCheckedAt),
+domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
-NullableTime(domain.LastCrawledAt), domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
 return err
 }

 // saveDomainTx stores a domain using a transaction
 func (c *Crawler) saveDomainTx(tx pgx.Tx, domain *Domain) error {
 // Auto-skip domains matching spam patterns
+fh := domain.FullHost()
 status := domain.Status
-if shouldAutoSkipDomain(domain.Host) {
+if shouldAutoSkipDomain(fh) {
 status = "skip"
 }

 _, err := tx.Exec(context.Background(), `
-INSERT INTO domains (host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld)
+INSERT INTO domains (host, status, crawled_at, feeds_found, last_error, tld)
-VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
+VALUES ($1, $2, $3, $4, $5, $6)
-ON CONFLICT(host) DO NOTHING
+ON CONFLICT(host, tld) DO NOTHING
-`, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCheckedAt),
+`, stripTLD(fh), status, domain.CrawledAt,
-NullableTime(domain.LastCrawledAt), domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
+domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
 return err
 }

 // domainExists checks if a domain already exists in the database
 func (c *Crawler) domainExists(host string) bool {
+host = normalizeHost(host)
 var exists bool
-err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM domains WHERE host = $1)", normalizeHost(host)).Scan(&exists)
+err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM domains WHERE host = $1 AND tld = $2)", stripTLD(host), getTLD(host)).Scan(&exists)
 return err == nil && exists
 }

 // getDomain retrieves a domain from PostgreSQL
 func (c *Crawler) getDomain(host string) (*Domain, error) {
+host = normalizeHost(host)
 domain := &Domain{}
-var lastCheckedAt, lastCrawledAt *time.Time
 var lastError *string

 err := c.db.QueryRow(`
-SELECT host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld
+SELECT host, tld, status, crawled_at, feeds_found, last_error
-FROM domains WHERE host = $1
+FROM domains WHERE host = $1 AND tld = $2
-`, normalizeHost(host)).Scan(
+`, stripTLD(host), getTLD(host)).Scan(
-&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCheckedAt, &lastCrawledAt,
+&domain.Host, &domain.TLD, &domain.Status, &domain.CrawledAt,
-&domain.FeedsFound, &lastError, &domain.TLD,
+&domain.FeedsFound, &lastError,
 )

 if err == pgx.ErrNoRows {
@@ -116,21 +136,26 @@ func (c *Crawler) getDomain(host string) (*Domain, error) {
 return nil, err
 }

-domain.LastCheckedAt = TimeValue(lastCheckedAt)
-domain.LastCrawledAt = TimeValue(lastCrawledAt)
 domain.LastError = StringValue(lastError)

 return domain, nil
 }

-// GetDomainsToCrawl returns domains ready for crawling (status='pass', not yet crawled)
+// GetDomainsToProcess returns domains needing processing (domain_check or feed_crawl)
-func (c *Crawler) GetDomainsToCrawl(limit int) ([]*Domain, error) {
+// crawled_at = zero time means needs domain_check, +1 sec means needs feed_crawl
+// Domains with errors are retried when crawled_at < now (scheduled by ErrorRetryDelay)
+func (c *Crawler) GetDomainsToProcess(limit int) ([]*Domain, error) {
+now := time.Now()
 rows, err := c.db.Query(`
-SELECT host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld
+SELECT host, status, crawled_at, feeds_found, last_error, tld
-FROM domains WHERE status = 'pass' AND last_crawled_at IS NULL
+FROM domains
-ORDER BY discovered_at DESC
+WHERE status = 'pass' AND (
-LIMIT $1
+(crawled_at < '0001-01-02' AND last_error IS NULL) -- new domains
-`, limit)
+OR (crawled_at < $1 AND last_error IS NOT NULL) -- retry errors after delay
+)
+ORDER BY crawled_at ASC
+LIMIT $2
+`, now, limit)
 if err != nil {
 return nil, err
 }
@@ -139,23 +164,45 @@ func (c *Crawler) GetDomainsToCrawl(limit int) ([]*Domain, error) {
 return c.scanDomains(rows)
 }

+// markDomainChecked updates a domain after domain_check (sets to +1 sec for feed_crawl)
+// host parameter should be the stripped host (without TLD)
+func (c *Crawler) markDomainChecked(host, tld, lastError string) error {
+if lastError != "" {
+// Increment miss_count, set to 'hold' only at threshold
+// Schedule retry after ErrorRetryDelay
+retryAt := time.Now().Add(ErrorRetryDelay)
+_, err := c.db.Exec(`
+UPDATE domains SET
+crawled_at = $1,
+last_error = $2,
+miss_count = miss_count + 1,
+status = CASE WHEN miss_count + 1 >= $3 THEN 'hold' ELSE status END
+WHERE host = $4 AND tld = $5
+`, retryAt, lastError, MissCountThreshold, host, tld)
+return err
+}
+// Success - reset miss_count
+_, err := c.db.Exec(`
+UPDATE domains SET crawled_at = $1, last_error = NULL, miss_count = 0
+WHERE host = $2 AND tld = $3
+`, DomainStateChecked, host, tld)
+return err
+}
+
 // scanDomains is a helper to scan multiple domain rows
 func (c *Crawler) scanDomains(rows pgx.Rows) ([]*Domain, error) {
 var domains []*Domain
 for rows.Next() {
 domain := &Domain{}
-var lastCheckedAt, lastCrawledAt *time.Time
 var lastError *string

 if err := rows.Scan(
-&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCheckedAt, &lastCrawledAt,
+&domain.Host, &domain.Status, &domain.CrawledAt,
 &domain.FeedsFound, &lastError, &domain.TLD,
 ); err != nil {
 continue
 }

-domain.LastCheckedAt = TimeValue(lastCheckedAt)
-domain.LastCrawledAt = TimeValue(lastCrawledAt)
 domain.LastError = StringValue(lastError)

 domains = append(domains, domain)
@@ -164,20 +211,30 @@ func (c *Crawler) scanDomains(rows pgx.Rows) ([]*Domain, error) {
 return domains, rows.Err()
 }

-// markDomainCrawled updates a domain after the crawl stage
+// markDomainCrawled updates a domain after feed_crawl (sets to NOW())
-func (c *Crawler) markDomainCrawled(host string, feedsFound int, lastError string) error {
+// host parameter should be the stripped host (without TLD)
-now := time.Now()
+func (c *Crawler) markDomainCrawled(host, tld string, feedsFound int, lastError string) error {
 if lastError != "" {
+// Increment miss_count, set to 'hold' only at threshold
+// Schedule retry after ErrorRetryDelay
+retryAt := time.Now().Add(ErrorRetryDelay)
 _, err := c.db.Exec(`
-UPDATE domains SET last_crawled_at = $1, feeds_found = $2, last_error = $3
+UPDATE domains SET
-WHERE host = $4
+crawled_at = $1,
-`, now, feedsFound, lastError, normalizeHost(host))
+feeds_found = $2,
+last_error = $3,
+miss_count = miss_count + 1,
+status = CASE WHEN miss_count + 1 >= $4 THEN 'hold' ELSE status END
+WHERE host = $5 AND tld = $6
+`, retryAt, feedsFound, lastError, MissCountThreshold, host, tld)
 return err
 }
+// Success - reset miss_count
+now := time.Now()
 _, err := c.db.Exec(`
-UPDATE domains SET last_crawled_at = $1, feeds_found = $2, last_error = NULL
+UPDATE domains SET crawled_at = $1, feeds_found = $2, last_error = NULL, miss_count = 0
-WHERE host = $3
+WHERE host = $3 AND tld = $4
-`, now, feedsFound, normalizeHost(host))
+`, now, feedsFound, host, tld)
 return err
 }

@@ -193,13 +250,13 @@ func (c *Crawler) GetDomainCount() (total int, hold int, err error) {

 // ImportTestDomains adds a list of specific domains for testing
 func (c *Crawler) ImportTestDomains(domains []string) {
-now := time.Now()
 for _, host := range domains {
+host = normalizeHost(host)
 _, err := c.db.Exec(`
-INSERT INTO domains (host, status, discovered_at, tld)
+INSERT INTO domains (host, status, tld)
-VALUES ($1, 'pass', $2, $3)
+VALUES ($1, 'pass', $2)
-ON CONFLICT(host) DO NOTHING
+ON CONFLICT(host, tld) DO NOTHING
-`, host, now, getTLD(host))
+`, stripTLD(host), getTLD(host))
 if err != nil {
 fmt.Printf("Error adding test domain %s: %v\n", host, err)
 } else {
@@ -255,7 +312,6 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
 scanner.Buffer(buf, 1024*1024)

 const batchSize = 100
-now := time.Now()
 totalImported := 0
 batchCount := 0

@@ -299,14 +355,14 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
 if shouldAutoSkipDomain(d.host) {
 status = "skip"
 }
-rows[i] = []interface{}{d.host, status, now, d.tld}
+rows[i] = []interface{}{stripTLD(d.host), status, d.tld}
 }

 // Use CopyFrom for bulk insert
 imported, err := conn.CopyFrom(
 ctx,
 pgx.Identifier{"domains"},
-[]string{"host", "status", "discovered_at", "tld"},
+[]string{"host", "status", "tld"},
 pgx.CopyFromRows(rows),
 )
 conn.Release()
@@ -319,10 +375,10 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
 status = "skip"
 }
 c.db.Exec(`
-INSERT INTO domains (host, status, discovered_at, tld)
+INSERT INTO domains (host, status, tld)
-VALUES ($1, $2, $3, $4)
+VALUES ($1, $2, $3)
-ON CONFLICT(host) DO NOTHING
+ON CONFLICT(host, tld) DO NOTHING
-`, d.host, status, now, d.tld)
+`, stripTLD(d.host), status, d.tld)
 }
 imported = int64(len(domains))
 }
@@ -369,7 +425,6 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
 buf := make([]byte, 0, 64*1024)
 scanner.Buffer(buf, 1024*1024)

-now := time.Now()
 count := 0
 const batchSize = 100

@@ -408,10 +463,10 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
 status = "skip"
 }
 result, err := c.db.Exec(`
-INSERT INTO domains (host, status, discovered_at, tld)
+INSERT INTO domains (host, status, tld)
-VALUES ($1, $2, $3, $4)
+VALUES ($1, $2, $3)
-ON CONFLICT(host) DO NOTHING
+ON CONFLICT(host, tld) DO NOTHING
-`, d.host, status, now, d.tld)
+`, stripTLD(d.host), status, d.tld)
 if err != nil {
 skipped++
 } else if result > 0 {
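For orientation, the new `crawled_at` sentinels and the `(host, tld)` split used throughout the queries above can be condensed into a small sketch. The stage dispatch mirrors what this commit's `StartDomainLoop` does; the assumption that `stripTLD`/`getTLD` split `example.com` into `example` and `com` is mine, since those helpers are not shown in the diff.

```go
package main

import (
	"fmt"
	"strings"
	"time"
)

// Sentinels as introduced in this commit: zero time means the domain still
// needs domain_check; zero time + 1s means it passed DNS and needs feed_crawl;
// any later timestamp means feed_crawl already ran (or a retry is scheduled).
var (
	domainStateUnchecked = time.Time{}
	domainStateChecked   = time.Time{}.Add(time.Second)
)

// nextStage reports which processing stage a crawled_at value implies.
func nextStage(crawledAt time.Time) string {
	switch {
	case crawledAt.Equal(domainStateUnchecked):
		return "domain_check" // DNS liveness lookup
	case crawledAt.Equal(domainStateChecked):
		return "feed_crawl" // recursive crawl to discover feeds
	default:
		return "done or scheduled retry"
	}
}

// splitHost illustrates the assumed (host, tld) storage split behind the
// new composite key; the real stripTLD/getTLD helpers may differ.
func splitHost(fullHost string) (host, tld string) {
	if i := strings.LastIndex(fullHost, "."); i > 0 {
		return fullHost[:i], fullHost[i+1:]
	}
	return fullHost, ""
}

func main() {
	h, t := splitHost("example.com")
	fmt.Println(h, t)                            // example com
	fmt.Println(nextStage(domainStateUnchecked)) // domain_check
	fmt.Println(nextStage(domainStateChecked))   // feed_crawl
}
```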
@@ -101,8 +101,8 @@ type Feed struct {
|
|||||||
|
|
||||||
// Timing
|
// Timing
|
||||||
DiscoveredAt time.Time `json:"discovered_at"`
|
DiscoveredAt time.Time `json:"discovered_at"`
|
||||||
LastCrawledAt time.Time `json:"last_crawled_at,omitempty"`
|
LastCheckedAt time.Time `json:"last_checked_at,omitempty"` // feed_check: when last checked
|
||||||
NextCrawlAt time.Time `json:"next_crawl_at,omitempty"`
|
NextCheckAt time.Time `json:"next_check_at,omitempty"` // feed_check: when to next check
|
||||||
LastBuildDate time.Time `json:"last_build_date,omitempty"` // From feed's lastBuildDate/updated
|
LastBuildDate time.Time `json:"last_build_date,omitempty"` // From feed's lastBuildDate/updated
|
||||||
|
|
||||||
// Cache headers for conditional requests
|
// Cache headers for conditional requests
|
||||||
@@ -120,7 +120,7 @@ type Feed struct {
|
|||||||
TLD string `json:"tld,omitempty"`
|
TLD string `json:"tld,omitempty"`
|
||||||
|
|
||||||
// Content stats
|
// Content stats
|
||||||
ItemCount int `json:"item_count,omitempty"` // Number of items in last crawl
|
ItemCount int `json:"item_count,omitempty"` // Number of items in last feed_check
|
||||||
OldestItemDate time.Time `json:"oldest_item_date,omitempty"`
|
OldestItemDate time.Time `json:"oldest_item_date,omitempty"`
|
||||||
NewestItemDate time.Time `json:"newest_item_date,omitempty"`
|
NewestItemDate time.Time `json:"newest_item_date,omitempty"`
|
||||||
|
|
||||||
@@ -153,7 +153,7 @@ func (c *Crawler) saveFeed(feed *Feed) error {
|
|||||||
_, err := c.db.Exec(`
|
_, err := c.db.Exec(`
|
||||||
INSERT INTO feeds (
|
INSERT INTO feeds (
|
||||||
url, type, category, title, description, language, site_url,
|
url, type, category, title, description, language, site_url,
|
||||||
discovered_at, last_crawled_at, next_crawl_at, last_build_date,
|
discovered_at, last_checked_at, next_check_at, last_build_date,
|
||||||
etag, last_modified,
|
etag, last_modified,
|
||||||
status, last_error, last_error_at,
|
status, last_error, last_error_at,
|
||||||
source_url, source_host, tld,
|
source_url, source_host, tld,
|
||||||
@@ -168,8 +168,8 @@ func (c *Crawler) saveFeed(feed *Feed) error {
|
|||||||
description = EXCLUDED.description,
|
description = EXCLUDED.description,
|
||||||
language = EXCLUDED.language,
|
language = EXCLUDED.language,
|
||||||
site_url = EXCLUDED.site_url,
|
site_url = EXCLUDED.site_url,
|
||||||
last_crawled_at = EXCLUDED.last_crawled_at,
|
last_checked_at = EXCLUDED.last_checked_at,
|
||||||
next_crawl_at = EXCLUDED.next_crawl_at,
|
next_check_at = EXCLUDED.next_check_at,
|
||||||
last_build_date = EXCLUDED.last_build_date,
|
last_build_date = EXCLUDED.last_build_date,
|
||||||
etag = EXCLUDED.etag,
|
etag = EXCLUDED.etag,
|
||||||
last_modified = EXCLUDED.last_modified,
|
last_modified = EXCLUDED.last_modified,
|
||||||
@@ -185,7 +185,7 @@ func (c *Crawler) saveFeed(feed *Feed) error {
|
|||||||
`,
|
`,
|
||||||
feed.URL, feed.Type, feed.Category, NullableString(feed.Title), NullableString(feed.Description),
|
feed.URL, feed.Type, feed.Category, NullableString(feed.Title), NullableString(feed.Description),
|
||||||
NullableString(feed.Language), NullableString(feed.SiteURL),
|
 NullableString(feed.Language), NullableString(feed.SiteURL),
-feed.DiscoveredAt, NullableTime(feed.LastCrawledAt), NullableTime(feed.NextCrawlAt), NullableTime(feed.LastBuildDate),
+feed.DiscoveredAt, NullableTime(feed.LastCheckedAt), NullableTime(feed.NextCheckAt), NullableTime(feed.LastBuildDate),
 NullableString(feed.ETag), NullableString(feed.LastModified),
 feed.Status, NullableString(feed.LastError), NullableTime(feed.LastErrorAt),
 NullableString(feed.SourceURL), NullableString(feed.SourceHost), NullableString(feed.TLD),
@@ -200,14 +200,14 @@ func (c *Crawler) saveFeed(feed *Feed) error {
 func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
 feed := &Feed{}
 var category, title, description, language, siteURL *string
-var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time
+var lastCheckedAt, nextCheckAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time
 var etag, lastModified, lastError, sourceURL, sourceHost, tld *string
 var publishStatus, publishAccount *string
 var itemCount, noUpdate *int

 err := c.db.QueryRow(`
 SELECT url, type, category, title, description, language, site_url,
-discovered_at, last_crawled_at, next_crawl_at, last_build_date,
+discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
 source_url, source_host, tld,
@@ -217,7 +217,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
 FROM feeds WHERE url = $1
 `, normalizeURL(feedURL)).Scan(
 &feed.URL, &feed.Type, &category, &title, &description, &language, &siteURL,
-&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
+&feed.DiscoveredAt, &lastCheckedAt, &nextCheckAt, &lastBuildDate,
 &etag, &lastModified,
 &feed.Status, &lastError, &lastErrorAt,
 &sourceURL, &sourceHost, &tld,
@@ -243,8 +243,8 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
 feed.Description = StringValue(description)
 feed.Language = StringValue(language)
 feed.SiteURL = StringValue(siteURL)
-feed.LastCrawledAt = TimeValue(lastCrawledAt)
-feed.NextCrawlAt = TimeValue(nextCrawlAt)
+feed.LastCheckedAt = TimeValue(lastCheckedAt)
+feed.NextCheckAt = TimeValue(nextCheckAt)
 feed.LastBuildDate = TimeValue(lastBuildDate)
 feed.ETag = StringValue(etag)
 feed.LastModified = StringValue(lastModified)
@@ -282,7 +282,7 @@ func (c *Crawler) feedExists(feedURL string) bool {
 func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
 rows, err := c.db.Query(`
 SELECT url, type, category, title, description, language, site_url,
-discovered_at, last_crawled_at, next_crawl_at, last_build_date,
+discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
 source_url, source_host, tld,
@@ -313,11 +313,11 @@ func (c *Crawler) GetFeedCountByHost(host string) (int, error) {
 return count, err
 }

-// GetFeedsDueForCheck returns feeds where next_crawl_at <= now, ordered by no_update desc (prioritize infrequent feeds)
+// GetFeedsDueForCheck returns feeds for feed_check, ordered by last_checked_at ASC (oldest first)
 func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
 rows, err := c.db.Query(`
 SELECT url, type, category, title, description, language, site_url,
-discovered_at, last_crawled_at, next_crawl_at, last_build_date,
+discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
 source_url, source_host, tld,
@@ -325,8 +325,8 @@ func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
 no_update,
 publish_status, publish_account
 FROM feeds
-WHERE next_crawl_at <= NOW() AND status = 'pass'
-ORDER BY no_update DESC
+WHERE last_checked_at > '0001-01-01 00:00:00' AND status = 'pass'
+ORDER BY last_checked_at ASC
 LIMIT $1
 `, limit)
 if err != nil {
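For context, the reworked query above makes feed_check a simple round-robin over live feeds: the oldest `last_checked_at` rows are returned first. A minimal Go sketch of a worker loop that would consume this query is shown below; the loop itself, its batch size, sleep intervals, and log messages are illustrative assumptions and not part of this commit — only `GetFeedsDueForCheck` and `CheckFeed` are the repository's own methods.

```go
// Hypothetical feed_check worker (assumes "log" and "time" are imported and
// that it lives in the same package as Crawler and Feed). It repeatedly pulls
// the least-recently-checked feeds and runs CheckFeed on each one.
func runFeedCheckLoop(c *Crawler) {
	for {
		feeds, err := c.GetFeedsDueForCheck(100) // oldest last_checked_at first
		if err != nil {
			log.Printf("feed_check query failed: %v", err)
			time.Sleep(10 * time.Second)
			continue
		}
		for _, feed := range feeds {
			if _, err := c.CheckFeed(feed); err != nil {
				log.Printf("feed_check %s: %v", feed.URL, err)
			}
		}
		time.Sleep(1 * time.Second) // assumed pacing between batches
	}
}
```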
@@ -341,7 +341,7 @@ func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
 func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
 rows, err := c.db.Query(`
 SELECT url, type, category, title, description, language, site_url,
-discovered_at, last_crawled_at, next_crawl_at, last_build_date,
+discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
 source_url, source_host, tld,
@@ -363,7 +363,7 @@ func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) {
 tsquery := ToSearchQuery(query)
 rows, err := c.db.Query(`
 SELECT url, type, category, title, description, language, site_url,
-discovered_at, last_crawled_at, next_crawl_at, last_build_date,
+discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
 source_url, source_host, tld,
@@ -389,7 +389,7 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
 for rows.Next() {
 feed := &Feed{}
 var feedType, category, title, description, language, siteURL *string
-var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time
+var lastCheckedAt, nextCheckAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time
 var etag, lastModified, lastError, sourceURL, sourceHost, tld *string
 var itemCount, noUpdate *int
 var status *string
@@ -397,7 +397,7 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {

 if err := rows.Scan(
 &feed.URL, &feedType, &category, &title, &description, &language, &siteURL,
-&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
+&feed.DiscoveredAt, &lastCheckedAt, &nextCheckAt, &lastBuildDate,
 &etag, &lastModified,
 &status, &lastError, &lastErrorAt,
 &sourceURL, &sourceHost, &tld,
@@ -419,8 +419,8 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
 feed.Description = StringValue(description)
 feed.Language = StringValue(language)
 feed.SiteURL = StringValue(siteURL)
-feed.LastCrawledAt = TimeValue(lastCrawledAt)
-feed.NextCrawlAt = TimeValue(nextCrawlAt)
+feed.LastCheckedAt = TimeValue(lastCheckedAt)
+feed.NextCheckAt = TimeValue(nextCheckAt)
 feed.LastBuildDate = TimeValue(lastBuildDate)
 feed.ETag = StringValue(etag)
 feed.LastModified = StringValue(lastModified)
@@ -471,7 +471,7 @@ func (c *Crawler) SetPublishStatus(feedURL, status, account string) error {
 func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) {
 rows, err := c.db.Query(`
 SELECT url, type, category, title, description, language, site_url,
-discovered_at, last_crawled_at, next_crawl_at, last_build_date,
+discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
 source_url, source_host, tld,
@@ -493,7 +493,7 @@ func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) {
 func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
 rows, err := c.db.Query(`
 SELECT url, type, category, title, description, language, site_url,
-discovered_at, last_crawled_at, next_crawl_at, last_build_date,
+discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
 source_url, source_host, tld,
+11 -11
@@ -31,7 +31,7 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
 Type: feedType,
 Category: classifyFeed(feedURL),
 DiscoveredAt: now,
-LastCrawledAt: now,
+LastCheckedAt: now,
 Status: "pass",
 SourceHost: sourceHost,
 TLD: getTLD(sourceHost),
@@ -53,8 +53,8 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
 // Refine category based on parsed title (e.g., "Comments on:")
 feed.Category = classifyFeedByTitle(feed.Title, feed.Category)

-// Calculate next crawl time
-feed.NextCrawlAt = c.calculateNextCrawl(feed)
+// Calculate next feed_check time
+feed.NextCheckAt = c.calculateNextCheck(feed)

 if err := c.saveFeed(feed); err != nil {
 return
@@ -92,7 +92,7 @@ func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
 SourceURL: normalizeURL(sourceURL),
 SourceHost: sourceHost,
 TLD: getTLD(sourceHost),
-NextCrawlAt: now, // Should be crawled immediately
+NextCheckAt: now, // Should be crawled immediately
 }

 if err := c.saveFeed(feed); err != nil {
@@ -148,9 +148,9 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
 err = fmt.Errorf("all URL variants failed")
 }
 now := time.Now()
-feed.LastCrawledAt = now
+feed.LastCheckedAt = now
 feed.NoUpdate++
-feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
+feed.NextCheckAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
 feed.LastError = err.Error()
 feed.LastErrorAt = now
 feed.Status = "hold"
@@ -165,13 +165,13 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
 defer resp.Body.Close()

 now := time.Now()
-feed.LastCrawledAt = now
+feed.LastCheckedAt = now

 // 304 Not Modified - feed hasn't changed
 if resp.StatusCode == http.StatusNotModified {
 feed.NoUpdate++
 // Adaptive backoff: 100s base + 100s per consecutive no-change
-feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
+feed.NextCheckAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
 feed.LastError = ""
 feed.Status = "pass"
 // Auto-hold feeds after 1000 consecutive no-changes
@@ -186,7 +186,7 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
 // Non-200 response
 if resp.StatusCode != http.StatusOK {
 feed.NoUpdate++
-feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
+feed.NextCheckAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
 feed.LastError = resp.Status
 feed.LastErrorAt = now
 feed.Status = "hold"
@@ -203,7 +203,7 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
 bodyBytes, err := io.ReadAll(resp.Body)
 if err != nil {
 feed.NoUpdate++
-feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
+feed.NextCheckAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
 feed.LastError = err.Error()
 feed.LastErrorAt = now
 feed.Status = "hold"
@@ -238,7 +238,7 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {

 // Content changed - reset backoff
 feed.NoUpdate = 0
-feed.NextCrawlAt = now.Add(100 * time.Second)
+feed.NextCheckAt = now.Add(100 * time.Second)
 feed.LastError = ""
 feed.Status = "pass"
 c.saveFeed(feed)
@@ -386,8 +386,8 @@ func parseRSSDate(s string) (time.Time, error) {
 return time.Time{}, fmt.Errorf("unable to parse date: %s", s)
 }

-// calculateNextCrawl determines when to next crawl this feed
-func (c *Crawler) calculateNextCrawl(feed *Feed) time.Time {
+// calculateNextCheck determines when to next check this feed (feed_check)
+func (c *Crawler) calculateNextCheck(feed *Feed) time.Time {
 // Adaptive backoff: 100s base + 100s per consecutive no-change
 return time.Now().Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
 }
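The renamed calculateNextCheck keeps the same adaptive backoff as before: every consecutive unchanged check adds another 100 seconds to the polling interval, and NoUpdate resets to zero once content changes. The standalone Go sketch below only prints the resulting schedule to make that growth explicit; it is an illustration added here, not code from the commit.

```go
package main

import (
	"fmt"
	"time"
)

// nextCheckDelay mirrors the feed_check backoff shown above:
// 100s base + 100s per consecutive no-change.
func nextCheckDelay(noUpdate int) time.Duration {
	return time.Duration(100+100*noUpdate) * time.Second
}

func main() {
	for noUpdate := 0; noUpdate <= 5; noUpdate++ {
		fmt.Printf("no_update=%d -> next check in %v\n", noUpdate, nextCheckDelay(noUpdate))
	}
	// Prints 1m40s, 3m20s, 5m0s, ... ; per the diff, feeds are auto-held
	// after 1000 consecutive no-changes.
}
```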
@@ -109,6 +109,9 @@ func (c *Crawler) StartDashboard(addr string) error {
 http.HandleFunc("/api/tlds", withAuth(func(w http.ResponseWriter, r *http.Request) {
 c.handleAPITLDs(w, r)
 }))
+http.HandleFunc("/api/searchStats", withAuth(func(w http.ResponseWriter, r *http.Request) {
+c.handleAPISearchStats(w, r)
+}))
 http.HandleFunc("/api/tldDomains", withAuth(func(w http.ResponseWriter, r *http.Request) {
 c.handleAPITLDDomains(w, r)
 }))
@@ -1,42 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# Deploy script - increments version, commits, pushes, and relaunches container
|
|
||||||
# Usage: ./scripts/deploy.sh [optional commit message]
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
cd "$(dirname "$0")/.."
|
|
||||||
|
|
||||||
# Extract current version number from templates.go
|
|
||||||
CURRENT=$(grep -o '>v[0-9]*<' templates.go | grep -o '[0-9]*' | head -1)
|
|
||||||
|
|
||||||
if [ -z "$CURRENT" ]; then
|
|
||||||
echo "Could not find version number in templates.go"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Increment version
|
|
||||||
NEW=$((CURRENT + 1))
|
|
||||||
|
|
||||||
# Update templates.go
|
|
||||||
sed -i '' "s/>v${CURRENT}</>v${NEW}</" templates.go
|
|
||||||
|
|
||||||
echo "Version: v${CURRENT} -> v${NEW}"
|
|
||||||
|
|
||||||
# Build commit message
|
|
||||||
if [ -n "$1" ]; then
|
|
||||||
COMMIT_MSG="v${NEW}: $1"
|
|
||||||
else
|
|
||||||
COMMIT_MSG="v${NEW}"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Commit and push
|
|
||||||
git add -A
|
|
||||||
git commit -m "$COMMIT_MSG"
|
|
||||||
git push
|
|
||||||
|
|
||||||
echo "Committed: $COMMIT_MSG"
|
|
||||||
|
|
||||||
# Rebuild and relaunch
|
|
||||||
docker compose up -d --build
|
|
||||||
|
|
||||||
echo "Deployed v${NEW}"
|
|
||||||
+451 -65
@@ -14,6 +14,142 @@ function initDashboard() {
 let infiniteScrollState = null;
 let isLoadingMore = false;
 let searchQuery = '';
+let domainFilter = 'all'; // all, pass, skip, hold, dead
+// Feed filter: multi-select with ALL as exclusion toggle
+// When allSelected=true, selected items are EXCLUDED; when false, selected items are INCLUDED
+let feedFilter = { allSelected: false, statuses: [], types: [] };
+let currentOpenTLD = null; // Track which TLD is currently open
+
+// Smart sticky header - scroll normally, show fixed on scroll up
+let lastScrollY = 0;
+const topSection = document.getElementById('topSection');
+const spacer = document.getElementById('topSectionSpacer');
+let headerHeight = topSection.offsetHeight;
+let isFixed = false;
+
+window.addEventListener('scroll', () => {
+const currentScrollY = window.scrollY;
+
+// If at top, return to normal flow
+if (currentScrollY <= 0) {
+topSection.classList.remove('fixed', 'hidden');
+spacer.classList.remove('active');
+isFixed = false;
+lastScrollY = currentScrollY;
+return;
+}
+
+// Only activate fixed mode after scrolling past the header
+if (currentScrollY > headerHeight) {
+if (currentScrollY < lastScrollY) {
+// Scrolling up - show fixed header
+if (!isFixed) {
+spacer.style.height = headerHeight + 'px';
+spacer.classList.add('active');
+topSection.classList.add('fixed');
+// Start hidden, then show
+topSection.classList.add('hidden');
+requestAnimationFrame(() => {
+topSection.classList.remove('hidden');
+});
+isFixed = true;
+} else {
+topSection.classList.remove('hidden');
+}
+} else if (currentScrollY > lastScrollY && isFixed) {
+// Scrolling down while fixed - hide it
+topSection.classList.add('hidden');
+}
+}
+
+lastScrollY = currentScrollY;
+}, { passive: true });
+
+// Stat card click handler
+document.addEventListener('click', (e) => {
+const card = e.target.closest('.card.clickable');
+if (!card) return;
+
+const filterType = card.dataset.filter;
+const status = card.dataset.status;
+const type = card.dataset.type;
+
+if (filterType === 'domain') {
+// Remove active from domain cards only
+document.querySelectorAll('.card.clickable[data-filter="domain"]').forEach(c => c.classList.remove('active'));
+card.classList.add('active');
+domainFilter = status || 'all';
+
+// Update placeholder
+const searchInput = document.getElementById('searchInput');
+searchInput.placeholder = domainFilter === 'all' ? 'Search domains...' : `Showing ${domainFilter} domains...`;
+
+// Reload TLD list with new filter
+loadFeeds(searchQuery);
+} else if (filterType === 'feed') {
+const wasActive = card.classList.contains('active');
+
+if (status === 'all') {
+// ALL card toggles exclusion mode
+if (wasActive) {
+card.classList.remove('active');
+feedFilter.allSelected = false;
+} else {
+card.classList.add('active');
+feedFilter.allSelected = true;
+}
+} else if (status) {
+// Status card (pass, skip, hold, dead) - multi-select
+if (wasActive) {
+card.classList.remove('active');
+feedFilter.statuses = feedFilter.statuses.filter(s => s !== status);
+} else {
+card.classList.add('active');
+feedFilter.statuses.push(status);
+}
+} else if (type) {
+// Type card (rss, atom, json, unknown, empty) - multi-select
+if (wasActive) {
+card.classList.remove('active');
+feedFilter.types = feedFilter.types.filter(t => t !== type);
+} else {
+card.classList.add('active');
+feedFilter.types.push(type);
+}
+}
+
+// Reload TLD list with feed filter
+loadFeeds(searchQuery);
+}
+});
+
+// Refresh only expanded TLD sections with new domain filter
+function refreshExpandedTLDs() {
+const expandedContainer = document.getElementById('expandedTLDContent');
+if (expandedContainer && expandedContainer.style.display !== 'none' && expandedContainer.dataset.tld) {
+// Mark as needing reload and reload
+expandedContainer.dataset.loaded = 'false';
+loadTLDDomains(expandedContainer, searchQuery);
+}
+}
+
+// Apply feed filter to currently visible feeds
+function applyFeedFilter() {
+document.querySelectorAll('.inline-feed-block').forEach(block => {
+const feedStatus = block.dataset.status || 'hold';
+const feedType = block.dataset.type || 'unknown';
+
+let show = true;
+if (feedFilter.status !== 'all' && feedStatus !== feedFilter.status) {
+show = false;
+}
+if (feedFilter.type && feedType !== feedFilter.type) {
+show = false;
+}
+
+block.style.display = show ? 'block' : 'none';
+});
+}

 // Event delegation for domain-spacer clicks (toggle feeds)
 document.addEventListener('click', (e) => {
@@ -96,8 +232,8 @@ function initDashboard() {
 ['Oldest Item', f.oldestItemDate],
 ['Newest Item', f.newestItemDate],
 ['Discovered', f.discoveredAt],
-['Last Crawled', f.lastCrawledAt],
-['Next Crawl', f.nextCrawlAt],
+['Last Checked', f.lastCheckedAt],
+['Next Check', f.nextCheckAt],
 ['Publish Status', f.publishStatus],
 ['Publish Account', f.publishAccount],
 ];
@@ -122,7 +258,8 @@ function initDashboard() {
 const items = await resp.json();

 if (!items || items.length === 0) {
-itemsDiv.innerHTML = '<span style="color: #666;">No items</span>';
+// Just clear the items area, keep the feed visible
+itemsDiv.innerHTML = '';
 return;
 }

@@ -173,7 +310,6 @@ function initDashboard() {
 function renderTLDHeader(tld) {
 return `<div class="tld-section" data-tld="${escapeHtml(tld)}">
 <div class="tld-header" style="display: flex; align-items: center; padding: 10px; background: #1a1a1a; border-bottom: 1px solid #333; cursor: pointer; user-select: none;">
-<span class="tld-toggle" style="color: #666; margin-right: 10px;">▼</span>
 <span style="color: #0af; font-weight: bold; font-size: 1.1em;">.${escapeHtml(tld)}</span>
 </div>
 <div class="tld-content" style="display: block;">`;
@@ -192,45 +328,163 @@ function initDashboard() {
 }
 }

-// Event delegation for TLD header/footer clicks (toggle section)
+// Event delegation for TLD clicks (toggle section)
 document.addEventListener('click', (e) => {
 const tldHeader = e.target.closest('.tld-header');
 const tldFooter = e.target.closest('.tld-footer');
+const expandedContainer = document.getElementById('expandedTLDContent');
+
+// Handle clicks in expanded container header
+if (tldHeader && tldHeader.closest('#expandedTLDContent')) {
+// Close the expanded content
+const currentSection = document.querySelector('.tld-section.expanded');
+if (currentSection) {
+currentSection.classList.remove('expanded');
+}
+expandedContainer.style.display = 'none';
+expandedContainer.innerHTML = '';
+currentOpenTLD = null;
+// Show TLD list again
+const domainList = document.querySelector('.domain-list');
+if (domainList) domainList.style.display = '';
+updateStats(); // Revert to search or all stats
+return;
+}
+
+// Handle clicks on TLD cards
 if (tldHeader || tldFooter) {
 const section = (tldHeader || tldFooter).closest('.tld-section');
 if (section) {
-const content = section.querySelector('.tld-content');
-const toggle = section.querySelector('.tld-toggle');
-if (content) {
-const isVisible = content.style.display !== 'none';
-content.style.display = isVisible ? 'none' : 'block';
-if (toggle) toggle.textContent = isVisible ? '▶' : '▼';
-
-if (isVisible) {
-// Closing - scroll to next TLD section
-const nextSection = section.nextElementSibling;
-if (nextSection && nextSection.classList.contains('tld-section')) {
-nextSection.scrollIntoView({ behavior: 'smooth', block: 'start' });
-}
-} else {
-// Opening - load domains if not already loaded
-if (section.dataset.loaded === 'false') {
-loadTLDDomains(section, searchQuery);
-}
-}
+const tld = section.dataset.tld;
+const isExpanded = section.classList.contains('expanded');
+
+if (isExpanded) {
+// Closing this TLD
+section.classList.remove('expanded');
+expandedContainer.style.display = 'none';
+expandedContainer.innerHTML = '';
+currentOpenTLD = null;
+// Show TLD list again
+const domainList = document.querySelector('.domain-list');
+if (domainList) domainList.style.display = '';
+updateStats(); // Revert to search or all stats
+} else {
+// Close any other open TLD first
+document.querySelectorAll('.tld-section.expanded').forEach(s => {
+s.classList.remove('expanded');
+});
+
+// Opening this TLD
+section.classList.add('expanded');
+currentOpenTLD = tld;
+// Hide TLD list
+const domainList = document.querySelector('.domain-list');
+if (domainList) domainList.style.display = 'none';
+// Show TLD stats (filtered by search if active)
+const currentSearch = document.getElementById('searchInput').value.trim();
+updateStatsForTLD(tld, currentSearch);
+
+// Set up expanded container with header
+expandedContainer.innerHTML = `
+<div class="tld-header">
+<span class="tld-name">.${escapeHtml(tld)}</span>
+</div>
+<div class="tld-content">
+<div class="tld-loading" style="padding: 10px; color: #666;">Loading...</div>
+</div>
+`;
+expandedContainer.style.display = 'block';
+expandedContainer.dataset.tld = tld;
+expandedContainer.dataset.loaded = 'false';
+
+// Load domains
+loadTLDDomains(expandedContainer, searchQuery);
+
+// Scroll to expanded container
+expandedContainer.scrollIntoView({ behavior: 'smooth', block: 'start' });
+}
 }
 }
 });
+
+// Update stats for a specific TLD (optionally filtered by search)
+async function updateStatsForTLD(tld, search = '') {
+try {
+let url = `/api/tldStats?tld=${encodeURIComponent(tld)}`;
+if (search) {
+url += `&search=${encodeURIComponent(search)}`;
+}
+const resp = await fetch(url);
+if (!resp.ok) return;
+const stats = await resp.json();
+
+document.getElementById('totalDomains').textContent = commaFormat(stats.total_domains || 0);
+document.getElementById('passDomains').textContent = commaFormat(stats.pass_domains || 0);
+document.getElementById('skipDomains').textContent = commaFormat(stats.skip_domains || 0);
+document.getElementById('holdDomains').textContent = commaFormat(stats.hold_domains || 0);
+document.getElementById('deadDomains').textContent = commaFormat(stats.dead_domains || 0);
+
+document.getElementById('totalFeeds').textContent = commaFormat(stats.total_feeds || 0);
+document.getElementById('aliveFeeds').textContent = commaFormat(stats.alive_feeds || 0);
+document.getElementById('publishFeeds').textContent = commaFormat(stats.publish_feeds || 0);
+document.getElementById('skipFeeds').textContent = commaFormat(stats.skip_feeds || 0);
+document.getElementById('holdFeeds').textContent = commaFormat(stats.hold_feeds || 0);
+document.getElementById('deadFeeds').textContent = commaFormat(stats.dead_feeds || 0);
+document.getElementById('emptyFeeds').textContent = commaFormat(stats.empty_feeds || 0);
+document.getElementById('rssFeeds').textContent = commaFormat(stats.rss_feeds || 0);
+document.getElementById('atomFeeds').textContent = commaFormat(stats.atom_feeds || 0);
+document.getElementById('jsonFeeds').textContent = commaFormat(stats.json_feeds || 0);
+document.getElementById('unknownFeeds').textContent = commaFormat(stats.unknown_feeds || 0);
+
+document.getElementById('updatedAt').textContent = search ? `Search "${search}" in .${tld}` : `Stats for .${tld}`;
+} catch (err) {
+console.error('TLD stats update failed:', err);
+}
+}
+
+// Update stats for search results
+async function updateStatsForSearch(query) {
+try {
+const resp = await fetch(`/api/searchStats?search=${encodeURIComponent(query)}`);
+if (!resp.ok) {
+console.error('Search stats failed:', resp.status);
+return;
+}
+const stats = await resp.json();
+
+document.getElementById('totalDomains').textContent = commaFormat(stats.total_domains || 0);
+document.getElementById('passDomains').textContent = commaFormat(stats.pass_domains || 0);
+document.getElementById('skipDomains').textContent = commaFormat(stats.skip_domains || 0);
+document.getElementById('holdDomains').textContent = commaFormat(stats.hold_domains || 0);
+document.getElementById('deadDomains').textContent = commaFormat(stats.dead_domains || 0);
+
+document.getElementById('totalFeeds').textContent = commaFormat(stats.total_feeds || 0);
+document.getElementById('aliveFeeds').textContent = commaFormat(stats.alive_feeds || 0);
+document.getElementById('publishFeeds').textContent = commaFormat(stats.publish_feeds || 0);
+document.getElementById('skipFeeds').textContent = commaFormat(stats.skip_feeds || 0);
+document.getElementById('holdFeeds').textContent = commaFormat(stats.hold_feeds || 0);
+document.getElementById('deadFeeds').textContent = commaFormat(stats.dead_feeds || 0);
+document.getElementById('emptyFeeds').textContent = commaFormat(stats.empty_feeds || 0);
+document.getElementById('rssFeeds').textContent = commaFormat(stats.rss_feeds || 0);
+document.getElementById('atomFeeds').textContent = commaFormat(stats.atom_feeds || 0);
+document.getElementById('jsonFeeds').textContent = commaFormat(stats.json_feeds || 0);
+document.getElementById('unknownFeeds').textContent = commaFormat(stats.unknown_feeds || 0);
+
+document.getElementById('updatedAt').textContent = `Search: "${query}"`;
+} catch (err) {
+console.error('Search stats update failed:', err);
+}
+}

 // Render domain row with feeds
 function renderDomainRow(d) {
 const status = d.status || 'hold';

-let html = `<div class="domain-block" data-host="${escapeHtml(d.host)}" data-status="${status}">`;
+const fullDomain = d.tld ? d.host + '.' + d.tld : d.host;
+let html = `<div class="domain-block" data-host="${escapeHtml(fullDomain)}" data-status="${status}">`;
 html += `<div class="domain-row" style="display: flex; align-items: center; padding: 8px 10px; border-bottom: 1px solid #202020;">`;
-html += renderStatusBtns(status, 'domain', d.host);
-html += `<a class="domain-name" href="https://${escapeHtml(d.host)}" target="_blank" style="color: #0af; text-decoration: none;">${escapeHtml(d.host)}</a>`;
+html += renderStatusBtns(status, 'domain', fullDomain);
+html += `<a class="domain-name" href="https://${escapeHtml(fullDomain)}" target="_blank" style="color: #0af; text-decoration: none;">${escapeHtml(fullDomain)}</a>`;

 if (d.last_error) {
 html += `<span class="domain-spacer" style="color: #f66; margin-left: 10px; flex: 1; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; cursor: pointer;" title="${escapeHtml(d.last_error)}">${escapeHtml(d.last_error)}</span>`;
@@ -244,7 +498,8 @@ function initDashboard() {
 html += '<div class="domain-feeds" style="display: block; margin-left: 10px; border-left: 2px solid #333; padding-left: 6px;">';
 d.feeds.forEach(f => {
 const feedStatus = f.publish_status || 'hold';
-html += `<div class="inline-feed-block" data-url="${escapeHtml(f.url)}" data-status="${feedStatus}">`;
+const feedType = f.type || 'unknown';
+html += `<div class="inline-feed-block" data-url="${escapeHtml(f.url)}" data-status="${feedStatus}" data-type="${feedType}">`;
 html += `<div class="feed-row" style="display: flex; align-items: center; padding: 4px 0;">`;

 html += `<span style="width: 48px; flex-shrink: 0; white-space: nowrap; margin-right: 6px; color: #666; text-align: center;">${escapeHtml(f.language || '')} </span>`;
@@ -341,7 +596,7 @@ function initDashboard() {

 async function loadFeeds(query = '') {
 const output = document.getElementById('output');
-output.innerHTML = '<div class="domain-list"></div><div id="infiniteLoader" style="text-align: center; padding: 10px; color: #666;">Loading TLDs...</div>';
+output.innerHTML = '<div class="domain-list"></div><div id="expandedTLDContent" style="display: none;"></div><div id="infiniteLoader" style="text-align: center; padding: 10px; color: #666;">Loading TLDs...</div>';

 // Disconnect previous observer if any
 if (tldObserver) {
@@ -349,26 +604,59 @@ function initDashboard() {
 }

 try {
-// Fetch all TLDs first
-const tldsResp = await fetch('/api/tlds?has_feeds=true');
+// Fetch TLDs with optional domain status filter, feed filter, and search
+let tldsUrl = '/api/tlds';
+const params = [];
+if (domainFilter !== 'all') {
+params.push(`status=${domainFilter}`);
+}
+// Add feed filter params if any are selected
+if (feedFilter.allSelected || feedFilter.statuses.length > 0 || feedFilter.types.length > 0) {
+if (feedFilter.allSelected) {
+params.push('feedMode=exclude');
+} else {
+params.push('feedMode=include');
+}
+if (feedFilter.statuses.length > 0) {
+params.push(`feedStatuses=${feedFilter.statuses.join(',')}`);
+}
+if (feedFilter.types.length > 0) {
+params.push(`feedTypes=${feedFilter.types.join(',')}`);
+}
+}
+if (query) {
+params.push(`search=${encodeURIComponent(query)}`);
+}
+if (params.length > 0) {
+tldsUrl += '?' + params.join('&');
+}
+const tldsResp = await fetch(tldsUrl);
+if (!tldsResp.ok) {
+const errText = await tldsResp.text();
+throw new Error(`HTTP ${tldsResp.status}: ${errText}`);
+}
 const tlds = await tldsResp.json();

 if (!tlds || tlds.length === 0) {
-document.getElementById('infiniteLoader').textContent = 'No feeds found';
+// Update stats for empty results
+if (query) {
+await updateStatsForSearch(query);
+} else {
+await updateStats();
+}
+document.getElementById('infiniteLoader').textContent = query ? 'No matches found' : 'No feeds found';
 return;
 }

 const container = output.querySelector('.domain-list');

-// Render all TLD sections as collapsed placeholders
+// Render all TLD sections as card placeholders
 tlds.forEach(t => {
 const tld = t.tld || 'unknown';
 container.insertAdjacentHTML('beforeend', `
 <div class="tld-section" data-tld="${escapeHtml(tld)}" data-loaded="false">
-<div class="tld-header" style="display: flex; align-items: center; padding: 10px; background: #1a1a1a; border-bottom: 1px solid #333; cursor: pointer; user-select: none;">
-<span class="tld-toggle" style="color: #666; margin-right: 10px;">▶</span>
-<span style="color: #0af; font-weight: bold; font-size: 1.1em;">.${escapeHtml(tld)}</span>
-<span style="color: #666; margin-left: 10px; font-size: 0.9em;">(${t.domain_count} domains)</span>
+<div class="tld-header">
+<span class="tld-name">.${escapeHtml(tld)}</span>
 </div>
 <div class="tld-content" style="display: none;">
 <div class="tld-loading" style="padding: 10px; color: #666;">Loading...</div>
@@ -377,25 +665,49 @@ function initDashboard() {
 `);
 });

-document.getElementById('infiniteLoader').textContent = `${tlds.length} TLDs loaded`;
+document.getElementById('infiniteLoader').textContent = '';

-// Set up IntersectionObserver for lazy loading (loads even when collapsed)
-tldObserver = new IntersectionObserver((entries) => {
-entries.forEach(entry => {
-if (entry.isIntersecting) {
-const section = entry.target;
-if (section.dataset.loaded === 'false') {
-loadTLDDomains(section, query);
-tldObserver.unobserve(section);
-}
-}
-});
-}, { rootMargin: '500px' });
+// Auto-expand if single TLD match, otherwise update stats for search/all
+if (tlds.length === 1) {
+const tld = tlds[0].tld;
+const expandedContainer = document.getElementById('expandedTLDContent');
+const section = output.querySelector('.tld-section');

-// Observe all TLD sections
-container.querySelectorAll('.tld-section').forEach(section => {
-tldObserver.observe(section);
-});
+if (section && expandedContainer) {
+// Mark as expanded
+section.classList.add('expanded');
+currentOpenTLD = tld;
+// Hide TLD list
+const domainList = document.querySelector('.domain-list');
+if (domainList) domainList.style.display = 'none';
+
+// Set up expanded container
+expandedContainer.innerHTML = `
+<div class="tld-header">
+<span class="tld-name">.${escapeHtml(tld)}</span>
+</div>
+<div class="tld-content">
+<div class="tld-loading" style="padding: 10px; color: #666;">Loading...</div>
+</div>
+`;
+expandedContainer.style.display = 'block';
+expandedContainer.dataset.tld = tld;
+expandedContainer.dataset.loaded = 'false';
+
+// Load domains
+loadTLDDomains(expandedContainer, query);
+
+// Show TLD stats (filtered by search if active)
+await updateStatsForTLD(tld, query);
+}
+} else {
+// Multiple TLDs - show search or global stats
+if (query) {
+await updateStatsForSearch(query);
+} else {
+await updateStats();
+}
+}

 } catch (err) {
 document.getElementById('infiniteLoader').textContent = 'Error: ' + err.message;
@@ -408,12 +720,30 @@ function initDashboard() {
 section.dataset.loaded = 'loading';

 try {
-let url = `/api/domains?has_feeds=true&tld=${encodeURIComponent(tld)}&limit=500`;
+let url = `/api/domains?tld=${encodeURIComponent(tld)}&limit=500`;
+if (domainFilter !== 'all') {
+url += `&status=${domainFilter}`;
+}
 if (query) {
 url += `&search=${encodeURIComponent(query)}`;
 }
+// Apply feed filter if any feed cards are selected
+if (feedFilter.allSelected || feedFilter.statuses.length > 0 || feedFilter.types.length > 0) {
+if (feedFilter.allSelected) {
+url += '&feedMode=exclude';
+} else {
+url += '&feedMode=include';
+}
+if (feedFilter.statuses.length > 0) {
+url += `&feedStatuses=${feedFilter.statuses.join(',')}`;
+}
+if (feedFilter.types.length > 0) {
+url += `&feedTypes=${feedFilter.types.join(',')}`;
+}
+}

 const resp = await fetch(url);
+if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
 const domains = await resp.json();

 const content = section.querySelector('.tld-content');
@@ -449,34 +779,90 @@ function initDashboard() {

 // Search handler
 const searchInput = document.getElementById('searchInput');
-let searchTimeout;
-searchInput.addEventListener('input', () => {
-clearTimeout(searchTimeout);
-searchTimeout = setTimeout(() => {
-searchQuery = searchInput.value.trim();
-loadFeeds(searchQuery);
-}, 300);
+function doSearch() {
+searchQuery = searchInput.value.trim();
+loadFeeds(searchQuery);
+}
+
+// Search on button click
+document.getElementById('searchBtn').addEventListener('click', doSearch);
+
+// Clear button - clears search and resets all filters
+document.getElementById('clearBtn').addEventListener('click', () => {
+searchInput.value = '';
+searchQuery = '';
+// Reset filters to default
+domainFilter = 'all';
+feedFilter = { allSelected: false, statuses: [], types: [] };
+// Reset active card styling
+document.querySelectorAll('.card.clickable.active').forEach(c => c.classList.remove('active'));
+document.querySelector('.card.clickable[data-filter="domain"][data-status="all"]')?.classList.add('active');
+searchInput.placeholder = 'Search domains...';
+// Close any expanded TLD
+currentOpenTLD = null;
+const expandedContainer = document.getElementById('expandedTLDContent');
+if (expandedContainer) {
+expandedContainer.style.display = 'none';
+expandedContainer.innerHTML = '';
+}
+// Show TLD list if hidden
+const domainList = document.querySelector('.domain-list');
+if (domainList) domainList.style.display = '';
+// Reload and update stats
+loadFeeds();
 });

-// Initial load
+// Search on Enter key
+searchInput.addEventListener('keydown', (e) => {
+if (e.key === 'Enter') {
+e.preventDefault();
+doSearch();
+}
+});
+
+// Initial load - set default active cards and load
+document.querySelector('.card.clickable[data-filter="domain"][data-status="all"]')?.classList.add('active');
 loadFeeds();

 // Update stats periodically
 async function updateStats() {
+// Check actual input value for current search state
+const currentSearch = document.getElementById('searchInput')?.value.trim() || '';
+
+// Priority: open TLD > search query > all
+if (currentOpenTLD) {
+updateStatsForTLD(currentOpenTLD, currentSearch);
+return;
+}
+if (currentSearch) {
+updateStatsForSearch(currentSearch);
+return;
+}
+
 try {
 const resp = await fetch('/api/stats');
+if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
 const stats = await resp.json();
 document.getElementById('totalDomains').textContent = commaFormat(stats.total_domains);
 document.getElementById('holdDomains').textContent = commaFormat(stats.hold_domains);
 document.getElementById('passDomains').textContent = commaFormat(stats.pass_domains);
 document.getElementById('skipDomains').textContent = commaFormat(stats.skip_domains);
-document.getElementById('crawlRate').textContent = commaFormat(stats.crawl_rate);
-document.getElementById('checkRate').textContent = commaFormat(stats.check_rate);
+document.getElementById('deadDomains').textContent = commaFormat(stats.dead_domains);
+document.getElementById('domainCheckRate').textContent = commaFormat(stats.domain_check_rate);
+document.getElementById('feedCrawlRate').textContent = commaFormat(stats.feed_crawl_rate);
+document.getElementById('feedCheckRate').textContent = commaFormat(stats.feed_check_rate);
 document.getElementById('totalFeeds').textContent = commaFormat(stats.total_feeds);
+document.getElementById('aliveFeeds').textContent = commaFormat(stats.alive_feeds);
+document.getElementById('publishFeeds').textContent = commaFormat(stats.publish_feeds);
+document.getElementById('skipFeeds').textContent = commaFormat(stats.skip_feeds);
+document.getElementById('holdFeeds').textContent = commaFormat(stats.hold_feeds);
+document.getElementById('deadFeeds').textContent = commaFormat(stats.dead_feeds);
+document.getElementById('emptyFeeds').textContent = commaFormat(stats.empty_feeds);
 document.getElementById('rssFeeds').textContent = commaFormat(stats.rss_feeds);
 document.getElementById('atomFeeds').textContent = commaFormat(stats.atom_feeds);
+document.getElementById('jsonFeeds').textContent = commaFormat(stats.json_feeds);
 document.getElementById('unknownFeeds').textContent = commaFormat(stats.unknown_feeds);
-document.getElementById('updatedAt').textContent = 'Last updated: ' + new Date().toLocaleString();
+document.getElementById('updatedAt').textContent = 'All TLDs - ' + new Date().toLocaleTimeString();
 } catch (err) {
 console.error('Stats update failed:', err);
 }
+2 -2
@@ -445,8 +445,8 @@ const dashboardHTML = `<!DOCTYPE html>
 <title>1440.news Feed Crawler</title>
 <meta charset="utf-8">
 <meta name="viewport" content="width=device-width, initial-scale=1">
-<link rel="stylesheet" href="/static/dashboard.css?v=222">
-<script src="/static/dashboard.js?v=222"></script>
+<link rel="stylesheet" href="/static/dashboard.css?v=1769990750">
+<script src="/static/dashboard.js?v=1769990750"></script>
 </head>
 <body>
 <div id="topSection">
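The `?v=` values on the two static assets exist purely for cache busting: a browser treats a URL with a new query string as a new resource, so bumping the value on each deploy forces a fresh fetch of dashboard.css and dashboard.js. The Go sketch below illustrates the general idea of stamping asset URLs with a Unix-timestamp version; the function and variable names are assumptions for illustration, not code from this repository (the actual rewrite is done by the launch script before the container build).

```go
package main

import (
	"fmt"
	"time"
)

// assetVersion is set once (e.g. at startup or build time); every new
// deployment therefore serves the static files under a fresh URL.
var assetVersion = time.Now().Unix() // hypothetical

// assetURL appends the cache-busting version to a static asset path.
func assetURL(path string) string {
	return fmt.Sprintf("%s?v=%d", path, assetVersion)
}

func main() {
	fmt.Println(assetURL("/static/dashboard.css")) // e.g. /static/dashboard.css?v=1769990750
	fmt.Println(assetURL("/static/dashboard.js"))
}
```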