Files
crawler/html.go
primal 3b1b12ff70 Simplify crawler: use CDX for feed discovery, remove unused loops
- Replace StartCDXImportLoop with StartCDXMonthlyLoop (runs on 1st of month)
- Enable StartFeedCheckLoop, StartCleanupLoop, StartMaintenanceLoop
- Remove domain check/crawl loops (CDX provides verified feeds)
- Remove vertices.txt import functions (CDX is now sole feed source)
- Remove HTML extraction functions (extractFeedLinks, extractAnchorFeeds, etc.)
- Remove unused helpers (shouldCrawl, makeAbsoluteURL, GetFeedCountByHost)
- Simplify Crawler struct (remove MaxDepth, visited, domain counters)
- Add CDX progress tracking to database and dashboard
- Net removal: ~720 lines of unused code

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 22:53:23 -05:00

21 lines
418 B
Go

package main
import (
"strings"
)
// detectFeedType classifies a fetched feed body as "rss", "atom",
// "json", or "unknown" using cheap substring heuristics.
//
// XML formats are detected by the presence of their opening tag
// anywhere in the body, which tolerates XML declarations, comments,
// and leading whitespace. A JSON Feed is detected by the trimmed
// body starting with "{" and referencing jsonfeed.org (a JSON Feed's
// version field is a jsonfeed.org URL). Checks are case-sensitive,
// matching the canonical lowercase tags of both formats.
func (c *Crawler) detectFeedType(body string) string {
	if strings.Contains(body, "<rss") {
		return "rss"
	}
	if strings.Contains(body, "<feed") {
		return "atom"
	}
	// Strip a UTF-8 BOM first: U+FEFF is not Unicode whitespace, so
	// TrimSpace alone leaves it in place and would defeat the "{"
	// prefix check for BOM-prefixed JSON feeds.
	body = strings.TrimSpace(strings.TrimPrefix(body, "\ufeff"))
	if strings.HasPrefix(body, "{") && strings.Contains(body, "jsonfeed.org") {
		return "json"
	}
	return "unknown"
}