- Replace StartCDXImportLoop with StartCDXMonthlyLoop (runs on 1st of month) - Enable StartFeedCheckLoop, StartCleanupLoop, StartMaintenanceLoop - Remove domain check/crawl loops (CDX provides verified feeds) - Remove vertices.txt import functions (CDX is now sole feed source) - Remove HTML extraction functions (extractFeedLinks, extractAnchorFeeds, etc.) - Remove unused helpers (shouldCrawl, makeAbsoluteURL, GetFeedCountByHost) - Simplify Crawler struct (remove MaxDepth, visited, domain counters) - Add CDX progress tracking to database and dashboard - Net removal: ~720 lines of unused code Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
21 lines
418 B
Go
21 lines
418 B
Go
package main
|
|
|
|
import (
|
|
"strings"
|
|
)
|
|
|
|
func (c *Crawler) detectFeedType(body string) string {
|
|
if strings.Contains(body, "<rss") {
|
|
return "rss"
|
|
}
|
|
if strings.Contains(body, "<feed") {
|
|
return "atom"
|
|
}
|
|
// Check for JSON Feed (version field contains jsonfeed.org URL)
|
|
body = strings.TrimSpace(body)
|
|
if strings.HasPrefix(body, "{") && strings.Contains(body, "jsonfeed.org") {
|
|
return "json"
|
|
}
|
|
return "unknown"
|
|
}
|