Add PebbleDB storage, domain tracking, and web dashboard
- Split main.go into separate files for better organization: crawler.go, domain.go, feed.go, parser.go, html.go, util.go
- Add PebbleDB for persistent storage of feeds and domains
- Store feeds with metadata: title, TTL, update frequency, ETag, etc.
- Track domains with crawl status (uncrawled/crawled/error)
- Normalize URLs by stripping scheme and www. prefix (see the sketch below)
- Add web dashboard on port 4321 with real-time stats:
  - Crawl progress with completion percentage
  - Feed counts by type (RSS/Atom)
  - Top TLDs and domains by feed count
  - Recent feeds table
- Filter out comment feeds from results

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
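The normalization helper itself is not part of the hunk below. As an illustration only, a minimal sketch of "strip the scheme and the www. prefix", assuming a hypothetical helper named normalizeURL (the actual code in util.go may differ):

package main

import "strings"

// normalizeURL is a hypothetical sketch of the normalization described in
// the commit message: drop the scheme and a leading "www." so that
// https://www.example.com/feed and http://example.com/feed share one key.
func normalizeURL(raw string) string {
	u := strings.TrimPrefix(raw, "https://")
	u = strings.TrimPrefix(u, "http://")
	return strings.TrimPrefix(u, "www.")
}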
@@ -0,0 +1,122 @@
package main

import (
	"regexp"
	"strings"

	"golang.org/x/net/html"
)

// simpleFeed is a lightweight feed reference used during HTML extraction
type simpleFeed struct {
	URL  string
	Type string
}

// isFeedContent reports whether a fetched response looks like an RSS or
// Atom feed, based on the Content-Type header and the body prefix.
func (c *Crawler) isFeedContent(body, contentType string) bool {
	if strings.Contains(contentType, "application/rss+xml") ||
		strings.Contains(contentType, "application/atom+xml") ||
		strings.Contains(contentType, "application/xml") ||
		strings.Contains(contentType, "text/xml") {
		return true
	}

	body = strings.TrimSpace(body)
	if strings.HasPrefix(body, "<?xml") {
		if strings.Contains(body, "<rss") || strings.Contains(body, "<feed") {
			return true
		}
	}
	return false
}

// detectFeedType classifies a feed body as RSS or Atom from its root element.
func (c *Crawler) detectFeedType(body string) string {
	if strings.Contains(body, "<rss") {
		return "rss"
	}
	if strings.Contains(body, "<feed") {
		return "atom"
	}
	return "unknown"
}

// extractFeedLinks walks the parsed HTML tree and collects feeds declared
// via <link rel="alternate"> tags with an RSS or Atom MIME type.
func (c *Crawler) extractFeedLinks(n *html.Node, baseURL string) []simpleFeed {
	feeds := make([]simpleFeed, 0)
	var f func(*html.Node)

	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "link" {
			var rel, href, typeAttr string
			for _, attr := range n.Attr {
				switch attr.Key {
				case "rel":
					rel = attr.Val
				case "href":
					href = attr.Val
				case "type":
					typeAttr = attr.Val
				}
			}

			if rel == "alternate" && (typeAttr == "application/rss+xml" || typeAttr == "application/atom+xml") {
				absURL := makeAbsoluteURL(href, baseURL)
				feedType := "rss"
				if typeAttr == "application/atom+xml" {
					feedType = "atom"
				}
				feeds = append(feeds, simpleFeed{URL: absURL, Type: feedType})
			}
		}
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			f(child)
		}
	}
	f(n)
	return feeds
}

// extractAnchorFeeds collects <a> hrefs that look feed-like (contain
// "rss", "atom", or "feed"); their actual type is resolved when fetched.
func (c *Crawler) extractAnchorFeeds(n *html.Node, baseURL string) []simpleFeed {
	feeds := make([]simpleFeed, 0)
	feedPattern := regexp.MustCompile(`(?i)(rss|atom|feed)`)

	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, attr := range n.Attr {
				if attr.Key == "href" {
					href := attr.Val
					if feedPattern.MatchString(href) {
						absURL := makeAbsoluteURL(href, baseURL)
						feeds = append(feeds, simpleFeed{URL: absURL, Type: "unknown"})
					}
				}
			}
		}
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			f(child)
		}
	}
	f(n)
	return feeds
}

// extractLinks returns every anchor href on the page, resolved against
// baseURL, for the crawl frontier.
func (c *Crawler) extractLinks(n *html.Node, baseURL string) []string {
	links := make([]string, 0)
	var f func(*html.Node)

	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, attr := range n.Attr {
				if attr.Key == "href" {
					link := makeAbsoluteURL(attr.Val, baseURL)
					links = append(links, link)
				}
			}
		}
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			f(child)
		}
	}
	f(n)
	return links
}
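The call side is not part of this file. A minimal usage sketch, assuming a hypothetical method discoverFeeds on the same Crawler (the fetch, error handling, and comment-feed filtering are elided):

// discoverFeeds is a hypothetical caller showing how the helpers above fit
// together; it is not part of the commit.
func (c *Crawler) discoverFeeds(pageURL, htmlBody string) ([]simpleFeed, []string, error) {
	doc, err := html.Parse(strings.NewReader(htmlBody))
	if err != nil {
		return nil, nil, err
	}

	// Prefer explicit <link rel="alternate"> declarations, then fall back
	// to anchors whose href merely looks feed-like.
	feeds := c.extractFeedLinks(doc, pageURL)
	if len(feeds) == 0 {
		feeds = c.extractAnchorFeeds(doc, pageURL)
	}

	// Every resolved anchor href is a candidate for further crawling.
	return feeds, c.extractLinks(doc, pageURL), nil
}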