Refactor large Go files into focused modules
Split dashboard.go (3,528 lines) into:
- routes.go: HTTP route registration
- api_domains.go: Domain API handlers
- api_feeds.go: Feed API handlers
- api_publish.go: Publishing API handlers
- api_search.go: Search API handlers
- templates.go: HTML templates
- dashboard.go: Stats functions only (235 lines)

Split publisher.go (1,502 lines) into:
- pds_auth.go: Authentication and account management
- pds_records.go: Record operations (upload, update, delete)
- handle.go: Handle derivation from feed URLs
- image.go: Image processing and favicon fetching
- publisher.go: Core types and PublishItem (439 lines)

Split feed.go (1,137 lines) into:
- item.go: Item struct and DB operations
- feed_check.go: Feed checking and processing
- feed.go: Feed struct and DB operations (565 lines)

Also includes a domain import batch size increase (1k -> 100k).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
@@ -0,0 +1,256 @@
package main

import (
	"fmt"
	"io"
	"net/http"
	"sync/atomic"
	"time"
)

// processFeed parses and stores a feed with full metadata
func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
	// Fast path: check without lock
	if c.feedExists(feedURL) {
		return
	}

	c.feedsMu.Lock()
	defer c.feedsMu.Unlock()

	// Double-check after acquiring lock
	if c.feedExists(feedURL) {
		return
	}

	feedType := c.detectFeedType(body)
	now := time.Now()

	feed := &Feed{
		URL:           normalizeURL(feedURL),
		Type:          feedType,
		Category:      classifyFeed(feedURL),
		DiscoveredAt:  now,
		LastCrawledAt: now,
		Status:        "active",
		SourceHost:    sourceHost,
		TLD:           getTLD(sourceHost),
		ETag:          headers.Get("ETag"),
		LastModified:  headers.Get("Last-Modified"),
	}

	// Parse feed-specific metadata and items
	var items []*Item
	switch feedType {
	case "rss":
		items = c.parseRSSMetadata(body, feed)
	case "atom":
		items = c.parseAtomMetadata(body, feed)
	case "json":
		items = c.parseJSONFeedMetadata(body, feed)
	}

	// Refine category based on parsed title (e.g., "Comments on:")
	feed.Category = classifyFeedByTitle(feed.Title, feed.Category)

	// Calculate next crawl time
	feed.NextCrawlAt = c.calculateNextCrawl(feed)

	if err := c.saveFeed(feed); err != nil {
		return
	}

	// Save items
	if len(items) > 0 {
		c.saveItems(items)
	}
}

// addFeed adds a discovered feed URL (not yet fetched)
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
	// Fast path: check without lock
	if c.feedExists(feedURL) {
		return
	}

	c.feedsMu.Lock()
	defer c.feedsMu.Unlock()

	// Double-check after acquiring lock
	if c.feedExists(feedURL) {
		return
	}

	now := time.Now()
	normalizedURL := normalizeURL(feedURL)
	feed := &Feed{
		URL:          normalizedURL,
		Type:         feedType,
		Category:     classifyFeed(feedURL),
		DiscoveredAt: now,
		Status:       "active",
		SourceURL:    normalizeURL(sourceURL),
		SourceHost:   sourceHost,
		TLD:          getTLD(sourceHost),
		NextCrawlAt:  now, // Should be crawled immediately
	}

	if err := c.saveFeed(feed); err != nil {
		return
	}
}

// CheckFeed performs a conditional request to check if a feed has been updated
// Returns: changed (bool), error
func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
	atomic.AddInt32(&c.feedsChecked, 1)

	// Try different scheme/www combinations since we store URLs without scheme
	urlVariants := []string{
		"https://" + feed.URL,
		"http://" + feed.URL,
		"https://www." + feed.URL,
		"http://www." + feed.URL,
	}

	var resp *http.Response
	var err error
	var successURL string

	for _, tryURL := range urlVariants {
		req, reqErr := http.NewRequest("GET", tryURL, nil)
		if reqErr != nil {
			continue
		}

		req.Header.Set("User-Agent", c.UserAgent)

		// Add conditional headers if we have them
		if feed.ETag != "" {
			req.Header.Set("If-None-Match", feed.ETag)
		}
		if feed.LastModified != "" {
			req.Header.Set("If-Modified-Since", feed.LastModified)
		}

		resp, err = c.client.Do(req)
		if err == nil {
			successURL = tryURL
			break
		}
	}

	_ = successURL // May be used later for logging/debugging

	// If no request succeeded, resp will be nil
	if resp == nil {
		if err == nil {
			err = fmt.Errorf("all URL variants failed")
		}
		now := time.Now()
		feed.LastCrawledAt = now
		feed.ErrorCount++
		feed.NoUpdate++
		feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
		feed.LastError = err.Error()
		feed.LastErrorAt = now
		feed.Status = "error"
		// Auto-hold feeds that fail 100+ times
		if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
			feed.PublishStatus = "hold"
			fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
		}
		c.saveFeed(feed)
		return false, err
	}
	defer resp.Body.Close()

	now := time.Now()
	feed.LastCrawledAt = now

	// 304 Not Modified - feed hasn't changed
	if resp.StatusCode == http.StatusNotModified {
		feed.NoUpdate++
		// Adaptive backoff: 100s base + 100s per consecutive no-change
		feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
		feed.ErrorCount = 0
		feed.LastError = ""
		feed.Status = "active"
		c.saveFeed(feed)
		return false, nil
	}

	// Non-200 response
	if resp.StatusCode != http.StatusOK {
		feed.ErrorCount++
		feed.NoUpdate++
		feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
		feed.LastError = resp.Status
		feed.LastErrorAt = now
		if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusGone {
			feed.Status = "dead"
		} else {
			feed.Status = "error"
		}
		// Auto-hold feeds that fail 100+ times
		if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
			feed.PublishStatus = "hold"
			fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
		}
		c.saveFeed(feed)
		return false, nil
	}

	// 200 OK - feed has new content
	bodyBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		feed.ErrorCount++
		feed.NoUpdate++
		feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
		feed.LastError = err.Error()
		feed.LastErrorAt = now
		feed.Status = "error"
		// Auto-hold feeds that fail 100+ times
		if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
			feed.PublishStatus = "hold"
			fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
		}
		c.saveFeed(feed)
		return false, err
	}

	body := string(bodyBytes)

	// Update cache headers
	feed.ETag = resp.Header.Get("ETag")
	feed.LastModified = resp.Header.Get("Last-Modified")

	// Re-detect type and parse metadata
	feedType := c.detectFeedType(body)
	feed.Type = feedType

	var items []*Item
	switch feedType {
	case "rss":
		items = c.parseRSSMetadata(body, feed)
	case "atom":
		items = c.parseAtomMetadata(body, feed)
	case "json":
		items = c.parseJSONFeedMetadata(body, feed)
	}

	// Content changed - reset backoff
	feed.NoUpdate = 0
	feed.NextCrawlAt = now.Add(100 * time.Second)
	feed.ErrorCount = 0
	feed.LastError = ""
	feed.Status = "active"
	c.saveFeed(feed)

	// Save items
	if len(items) > 0 {
		c.saveItems(items)
	}

	return true, nil
}
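For context, here is a minimal sketch of how CheckFeed might be driven from a crawl loop. Crawler, Feed, and CheckFeed come from the file above; runFeedChecker, feedsDueForCheck, and the worker/sleep parameters are hypothetical illustrations, not part of this commit, and the sketch assumes it lives in the same package so it can reuse the imports above.

// runFeedChecker is a hypothetical driver loop (not part of this commit):
// it repeatedly pulls feeds whose NextCrawlAt has passed and checks them
// on a small pool of workers.
func (c *Crawler) runFeedChecker(workers int) {
	due := make(chan *Feed)

	for i := 0; i < workers; i++ {
		go func() {
			for feed := range due {
				// CheckFeed updates NextCrawlAt, ErrorCount, etc. and persists the feed.
				if changed, err := c.CheckFeed(feed); err == nil && changed {
					fmt.Printf("Feed updated: %s\n", feed.URL)
				}
			}
		}()
	}

	for {
		// feedsDueForCheck is assumed to return feeds with NextCrawlAt <= now.
		for _, feed := range c.feedsDueForCheck(time.Now()) {
			due <- feed
		}
		time.Sleep(10 * time.Second)
	}
}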