Add PebbleDB storage, domain tracking, and web dashboard

- Split main.go into separate files for better organization:
  crawler.go, domain.go, feed.go, parser.go, html.go, util.go
- Add PebbleDB for persistent storage of feeds and domains
- Store feeds with metadata: title, TTL, update frequency, ETag, etc.
- Track domains with crawl status (uncrawled/crawled/error)
- Normalize URLs by stripping the scheme and any leading www. prefix
  (see the sketch after this list)
- Add web dashboard on port 4321 with real-time stats:
  - Crawl progress with completion percentage
  - Feed counts by type (RSS/Atom)
  - Top TLDs and domains by feed count
  - Recent feeds table
- Filter out comment feeds from results
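
A minimal sketch of that normalization step, assuming a helper named
normalizeURL (the committed name and edge-case handling in util.go may
differ):

// normalizeURL reduces a URL to a canonical key so that
// http://www.example.com/feed and https://example.com/feed
// map to the same stored entry. Hypothetical helper; the real
// util.go may differ. Requires only the standard library ("strings").
func normalizeURL(raw string) string {
	u := strings.TrimPrefix(raw, "https://")
	u = strings.TrimPrefix(u, "http://")
	return strings.TrimPrefix(u, "www.")
}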

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
primal committed 2026-01-22 16:29:00 -05:00
parent 0dd612b7e1
commit 219b49352e
9 changed files with 1574 additions and 642 deletions
html.go (+122)
@@ -0,0 +1,122 @@
package main

import (
	"regexp"
	"strings"

	"golang.org/x/net/html"
)

// simpleFeed is a lightweight feed reference used during HTML extraction.
type simpleFeed struct {
	URL  string
	Type string
}
// isFeedContent reports whether a response looks like an RSS or Atom
// feed, first by Content-Type and then by sniffing the body.
func (c *Crawler) isFeedContent(body, contentType string) bool {
	// Explicit feed media types, plus the generic XML types many
	// servers use when serving feeds.
	if strings.Contains(contentType, "application/rss+xml") ||
		strings.Contains(contentType, "application/atom+xml") ||
		strings.Contains(contentType, "application/xml") ||
		strings.Contains(contentType, "text/xml") {
		return true
	}
	// Fall back to sniffing: an XML prolog followed by an <rss> or
	// <feed> root element.
	body = strings.TrimSpace(body)
	if strings.HasPrefix(body, "<?xml") {
		if strings.Contains(body, "<rss") || strings.Contains(body, "<feed") {
			return true
		}
	}
	return false
}
// detectFeedType distinguishes RSS from Atom by their root elements.
func (c *Crawler) detectFeedType(body string) string {
	if strings.Contains(body, "<rss") {
		return "rss"
	}
	if strings.Contains(body, "<feed") {
		return "atom"
	}
	return "unknown"
}
// extractFeedLinks walks the parsed HTML tree and collects feed
// references declared via <link rel="alternate" type="application/...">.
func (c *Crawler) extractFeedLinks(n *html.Node, baseURL string) []simpleFeed {
	feeds := make([]simpleFeed, 0)
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "link" {
			var rel, href, typeAttr string
			for _, attr := range n.Attr {
				switch attr.Key {
				case "rel":
					rel = attr.Val
				case "href":
					href = attr.Val
				case "type":
					typeAttr = attr.Val
				}
			}
			if rel == "alternate" && (typeAttr == "application/rss+xml" || typeAttr == "application/atom+xml") {
				// makeAbsoluteURL (util.go) resolves href against baseURL.
				absURL := makeAbsoluteURL(href, baseURL)
				feedType := "rss"
				if typeAttr == "application/atom+xml" {
					feedType = "atom"
				}
				feeds = append(feeds, simpleFeed{URL: absURL, Type: feedType})
			}
		}
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			f(child)
		}
	}
	f(n)
	return feeds
}
// feedPattern matches hrefs that look like feed URLs. Compiled once at
// package init rather than on every extractAnchorFeeds call.
var feedPattern = regexp.MustCompile(`(?i)(rss|atom|feed)`)

// extractAnchorFeeds scans anchor hrefs for feed-looking URLs, as a
// fallback when the page declares no <link rel="alternate"> tags.
func (c *Crawler) extractAnchorFeeds(n *html.Node, baseURL string) []simpleFeed {
	feeds := make([]simpleFeed, 0)
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, attr := range n.Attr {
				if attr.Key == "href" && feedPattern.MatchString(attr.Val) {
					absURL := makeAbsoluteURL(attr.Val, baseURL)
					feeds = append(feeds, simpleFeed{URL: absURL, Type: "unknown"})
				}
			}
		}
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			f(child)
		}
	}
	f(n)
	return feeds
}
// extractLinks collects every anchor href, resolved against baseURL,
// for the crawl queue.
func (c *Crawler) extractLinks(n *html.Node, baseURL string) []string {
	links := make([]string, 0)
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, attr := range n.Attr {
				if attr.Key == "href" {
					links = append(links, makeAbsoluteURL(attr.Val, baseURL))
				}
			}
		}
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			f(child)
		}
	}
	f(n)
	return links
}
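
For context, a sketch of how these helpers might be wired together by
the fetch path in crawler.go, which is not part of this hunk; the
discoverFeeds name, signature, and fallback order here are assumptions:

// discoverFeeds is a hypothetical call site; the real crawler.go in
// this commit may differ. html.Parse and strings are already imported
// by this file.
func (c *Crawler) discoverFeeds(pageURL, body string) ([]simpleFeed, []string, error) {
	doc, err := html.Parse(strings.NewReader(body))
	if err != nil {
		return nil, nil, err
	}
	// Prefer feeds the page declares explicitly in its <head>.
	feeds := c.extractFeedLinks(doc, pageURL)
	if len(feeds) == 0 {
		// Fall back to anchor scanning when no <link> tags declare feeds.
		feeds = c.extractAnchorFeeds(doc, pageURL)
	}
	// Every other link on the page goes back into the crawl frontier.
	links := c.extractLinks(doc, pageURL)
	return feeds, links, nil
}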