Refactor large Go files into focused modules
Split dashboard.go (3,528 lines) into:
- routes.go: HTTP route registration
- api_domains.go: Domain API handlers
- api_feeds.go: Feed API handlers
- api_publish.go: Publishing API handlers
- api_search.go: Search API handlers
- templates.go: HTML templates
- dashboard.go: Stats functions only (235 lines)

Split publisher.go (1,502 lines) into:
- pds_auth.go: Authentication and account management
- pds_records.go: Record operations (upload, update, delete)
- handle.go: Handle derivation from feed URLs
- image.go: Image processing and favicon fetching
- publisher.go: Core types and PublishItem (439 lines)

Split feed.go (1,137 lines) into:
- item.go: Item struct and DB operations
- feed_check.go: Feed checking and processing
- feed.go: Feed struct and DB operations (565 lines)

Also includes a domain import batch size increase (1k -> 100k).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
@@ -0,0 +1,256 @@
package main

import (
	"fmt"
	"io"
	"net/http"
	"sync/atomic"
	"time"
)

// processFeed parses and stores a feed with full metadata
func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
	// Fast path: check without lock
	if c.feedExists(feedURL) {
		return
	}

	c.feedsMu.Lock()
	defer c.feedsMu.Unlock()

	// Double-check after acquiring lock
	if c.feedExists(feedURL) {
		return
	}

	feedType := c.detectFeedType(body)
	now := time.Now()

	feed := &Feed{
		URL:           normalizeURL(feedURL),
		Type:          feedType,
		Category:      classifyFeed(feedURL),
		DiscoveredAt:  now,
		LastCrawledAt: now,
		Status:        "active",
		SourceHost:    sourceHost,
		TLD:           getTLD(sourceHost),
		ETag:          headers.Get("ETag"),
		LastModified:  headers.Get("Last-Modified"),
	}

	// Parse feed-specific metadata and items
	var items []*Item
	switch feedType {
	case "rss":
		items = c.parseRSSMetadata(body, feed)
	case "atom":
		items = c.parseAtomMetadata(body, feed)
	case "json":
		items = c.parseJSONFeedMetadata(body, feed)
	}

	// Refine category based on parsed title (e.g., "Comments on:")
	feed.Category = classifyFeedByTitle(feed.Title, feed.Category)

	// Calculate next crawl time
	feed.NextCrawlAt = c.calculateNextCrawl(feed)

	if err := c.saveFeed(feed); err != nil {
		return
	}

	// Save items
	if len(items) > 0 {
		c.saveItems(items)
	}
}

// addFeed adds a discovered feed URL (not yet fetched)
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
	// Fast path: check without lock
	if c.feedExists(feedURL) {
		return
	}

	c.feedsMu.Lock()
	defer c.feedsMu.Unlock()

	// Double-check after acquiring lock
	if c.feedExists(feedURL) {
		return
	}

	now := time.Now()
	normalizedURL := normalizeURL(feedURL)
	feed := &Feed{
		URL:          normalizedURL,
		Type:         feedType,
		Category:     classifyFeed(feedURL),
		DiscoveredAt: now,
		Status:       "active",
		SourceURL:    normalizeURL(sourceURL),
		SourceHost:   sourceHost,
		TLD:          getTLD(sourceHost),
		NextCrawlAt:  now, // Should be crawled immediately
	}

	if err := c.saveFeed(feed); err != nil {
		return
	}
}

// CheckFeed performs a conditional request to check if a feed has been updated
// Returns: changed (bool), error
func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
	atomic.AddInt32(&c.feedsChecked, 1)

	// Try different scheme/www combinations since we store URLs without scheme
	urlVariants := []string{
		"https://" + feed.URL,
		"http://" + feed.URL,
		"https://www." + feed.URL,
		"http://www." + feed.URL,
	}

	var resp *http.Response
	var err error
	var successURL string

	for _, tryURL := range urlVariants {
		req, reqErr := http.NewRequest("GET", tryURL, nil)
		if reqErr != nil {
			continue
		}

		req.Header.Set("User-Agent", c.UserAgent)

		// Add conditional headers if we have them
		if feed.ETag != "" {
			req.Header.Set("If-None-Match", feed.ETag)
		}
		if feed.LastModified != "" {
			req.Header.Set("If-Modified-Since", feed.LastModified)
		}

		resp, err = c.client.Do(req)
		if err == nil {
			successURL = tryURL
			break
		}
	}

	_ = successURL // May be used later for logging/debugging

	// If no request succeeded, resp will be nil
	if resp == nil {
		if err == nil {
			err = fmt.Errorf("all URL variants failed")
		}
		now := time.Now()
		feed.LastCrawledAt = now
		feed.ErrorCount++
		feed.NoUpdate++
		feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
		feed.LastError = err.Error()
		feed.LastErrorAt = now
		feed.Status = "error"
		// Auto-hold feeds that fail 100+ times
		if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
			feed.PublishStatus = "hold"
			fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
		}
		c.saveFeed(feed)
		return false, err
	}
	defer resp.Body.Close()

	now := time.Now()
	feed.LastCrawledAt = now

	// 304 Not Modified - feed hasn't changed
	if resp.StatusCode == http.StatusNotModified {
		feed.NoUpdate++
		// Adaptive backoff: 100s base + 100s per consecutive no-change
		feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
		feed.ErrorCount = 0
		feed.LastError = ""
		feed.Status = "active"
		c.saveFeed(feed)
		return false, nil
	}

	// Non-200 response
	if resp.StatusCode != http.StatusOK {
		feed.ErrorCount++
		feed.NoUpdate++
		feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
		feed.LastError = resp.Status
		feed.LastErrorAt = now
		if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusGone {
			feed.Status = "dead"
		} else {
			feed.Status = "error"
		}
		// Auto-hold feeds that fail 100+ times
		if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
			feed.PublishStatus = "hold"
			fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
		}
		c.saveFeed(feed)
		return false, nil
	}

	// 200 OK - feed has new content
	bodyBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		feed.ErrorCount++
		feed.NoUpdate++
		feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
		feed.LastError = err.Error()
		feed.LastErrorAt = now
		feed.Status = "error"
		// Auto-hold feeds that fail 100+ times
		if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
			feed.PublishStatus = "hold"
			fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
		}
		c.saveFeed(feed)
		return false, err
	}

	body := string(bodyBytes)

	// Update cache headers
	feed.ETag = resp.Header.Get("ETag")
	feed.LastModified = resp.Header.Get("Last-Modified")

	// Re-detect type and parse metadata
	feedType := c.detectFeedType(body)
	feed.Type = feedType

	var items []*Item
	switch feedType {
	case "rss":
		items = c.parseRSSMetadata(body, feed)
	case "atom":
		items = c.parseAtomMetadata(body, feed)
	case "json":
		items = c.parseJSONFeedMetadata(body, feed)
	}

	// Content changed - reset backoff
	feed.NoUpdate = 0
	feed.NextCrawlAt = now.Add(100 * time.Second)
	feed.ErrorCount = 0
	feed.LastError = ""
	feed.Status = "active"
	c.saveFeed(feed)

	// Save items
	if len(items) > 0 {
		c.saveItems(items)
	}

	return true, nil
}
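For context, here is a minimal sketch of how CheckFeed might be driven from a crawl loop. Crawler, Feed, and CheckFeed come from the file above; runFeedChecker, feedsDueForCheck, and the worker/sleep parameters are hypothetical illustrations, not part of this commit, and the sketch assumes it lives in the same package so it can reuse the imports above.

// runFeedChecker is a hypothetical driver loop (not part of this commit):
// it repeatedly pulls feeds whose NextCrawlAt has passed and checks them
// on a small pool of workers.
func (c *Crawler) runFeedChecker(workers int) {
	due := make(chan *Feed)

	for i := 0; i < workers; i++ {
		go func() {
			for feed := range due {
				// CheckFeed updates NextCrawlAt, ErrorCount, etc. and persists the feed.
				if changed, err := c.CheckFeed(feed); err == nil && changed {
					fmt.Printf("Feed updated: %s\n", feed.URL)
				}
			}
		}()
	}

	for {
		// feedsDueForCheck is assumed to return feeds with NextCrawlAt <= now.
		for _, feed := range c.feedsDueForCheck(time.Now()) {
			due <- feed
		}
		time.Sleep(10 * time.Second)
	}
}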