Add PebbleDB storage, domain tracking, and web dashboard
- Split main.go into separate files for better organization: crawler.go, domain.go, feed.go, parser.go, html.go, util.go
- Add PebbleDB for persistent storage of feeds and domains
- Store feeds with metadata: title, TTL, update frequency, ETag, etc.
- Track domains with crawl status (uncrawled/crawled/error)
- Normalize URLs by stripping scheme and www. prefix
- Add web dashboard on port 4321 with real-time stats:
  - Crawl progress with completion percentage
  - Feed counts by type (RSS/Atom)
  - Top TLDs and domains by feed count
  - Recent feeds table
- Filter out comment feeds from results

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
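The hunk below only shows the rewritten main.go; the storage layer itself lives in the new crawler.go and domain.go files, which are not part of this diff. As a rough illustration of two of the changes listed above, normalizing URLs before using them as keys and persisting records in PebbleDB (github.com/cockroachdb/pebble), here is a minimal sketch. The normalizeURL helper, the FeedRecord fields, and the "feed:" key prefix are assumptions for illustration only, not the actual code from this commit.

package main

import (
	"encoding/json"
	"fmt"
	"strings"

	"github.com/cockroachdb/pebble"
)

// FeedRecord is a hypothetical shape for the stored feed metadata
// described in the commit message (title, ETag, ...).
type FeedRecord struct {
	URL   string `json:"url"`
	Type  string `json:"type"` // "rss" or "atom"
	Title string `json:"title"`
	ETag  string `json:"etag"`
}

// normalizeURL strips the scheme and a leading "www." so that
// http://www.example.com/feed and https://example.com/feed collapse
// to the same key, as the commit message describes.
func normalizeURL(raw string) string {
	s := strings.TrimPrefix(raw, "https://")
	s = strings.TrimPrefix(s, "http://")
	s = strings.TrimPrefix(s, "www.")
	return s
}

func main() {
	db, err := pebble.Open("feeds.db", &pebble.Options{})
	if err != nil {
		panic(err)
	}
	defer db.Close()

	rec := FeedRecord{URL: "https://www.example.com/feed.xml", Type: "rss", Title: "Example"}
	val, err := json.Marshal(rec)
	if err != nil {
		panic(err)
	}

	// Keys are namespaced by record kind; the "feed:" prefix is an assumption.
	key := []byte("feed:" + normalizeURL(rec.URL))
	if err := db.Set(key, val, pebble.Sync); err != nil {
		panic(err)
	}

	got, closer, err := db.Get(key)
	if err == nil {
		fmt.Println(string(got))
		closer.Close()
	}
}

A real schema would also carry the TTL, update frequency, and crawl-status fields mentioned above; the point here is only the normalize-then-key pattern and the Set/Get round trip (PebbleDB's Get returns an io.Closer that must be closed once the value bytes have been consumed).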
@@ -1,655 +1,30 @@
package main

import (
	"bufio"
	"compress/gzip"
	"encoding/xml"
	"fmt"
	"io"
	"math/rand"
	"net/http"
	"net/url"
	"os"
	"regexp"
	"runtime"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"golang.org/x/net/html"
)

type Feed struct {
	URL  string
	Type string // "rss" or "atom"
}

// RSS structs
type RSS struct {
	Channel Channel `xml:"channel"`
}

type Channel struct {
	Items []RSSItem `xml:"item"`
}

type RSSItem struct {
	Link string `xml:"link"`
}

// Atom structs
type AtomFeed struct {
	Entries []AtomEntry `xml:"entry"`
}

type AtomEntry struct {
	Links []AtomLink `xml:"link"`
}

type AtomLink struct {
	Href string `xml:"href,attr"`
	Rel  string `xml:"rel,attr"`
}

type Crawler struct {
	MaxDepth        int
	MaxPagesPerHost int
	Timeout         time.Duration
	UserAgent       string
	visited         sync.Map
	feeds           []Feed
	feedsMu         sync.Mutex
	client          *http.Client
	hostsProcessed  int32

	// TLD file management
	currentTLD string
	tldFile    *os.File
	tldFeeds   map[string]bool
	tldMu      sync.Mutex
}

// NewCrawler returns a Crawler with default limits and an HTTP client
// that gives up after 10 redirects.
func NewCrawler() *Crawler {
	return &Crawler{
		MaxDepth:        10,
		MaxPagesPerHost: 10,
		Timeout:         10 * time.Second,
		UserAgent:       "FeedCrawler/1.0",
		feeds:           make([]Feed, 0),
		tldFeeds:        make(map[string]bool),
		client: &http.Client{
			Timeout: 10 * time.Second,
			CheckRedirect: func(req *http.Request, via []*http.Request) error {
				if len(via) >= 10 {
					return fmt.Errorf("stopped after 10 redirects")
				}
				return nil
			},
		},
	}
}

// reverseHost converts a reverse domain notation back to normal
// e.g., "com.example.www" -> "www.example.com"
func reverseHost(reverseHost string) string {
	parts := strings.Split(reverseHost, ".")
	// Reverse the parts
	for i, j := 0, len(parts)-1; i < j; i, j = i+1, j-1 {
		parts[i], parts[j] = parts[j], parts[i]
	}
	return strings.Join(parts, ".")
}

// getTLD extracts the TLD from a hostname
func getTLD(host string) string {
	parts := strings.Split(host, ".")
	if len(parts) > 0 {
		return parts[len(parts)-1]
	}
	return ""
}

// GetCommonCrawlHostsFromFile loads hostnames from a Common Crawl vertices
// file and returns them in random order.
func (c *Crawler) GetCommonCrawlHostsFromFile(filename string, limit int) ([]string, error) {
	file, err := os.Open(filename)
	if err != nil {
		return nil, fmt.Errorf("failed to open file: %v", err)
	}
	defer file.Close()

	hosts, err := c.parseVerticesFile(file, limit)
	if err != nil {
		return nil, fmt.Errorf("failed to parse vertices: %v", err)
	}

	// Randomize the order
	rand.Shuffle(len(hosts), func(i, j int) {
		hosts[i], hosts[j] = hosts[j], hosts[i]
	})

	return hosts, nil
}

// parseVerticesFile reads a (possibly gzipped) vertices file and converts
// each reverse-notation hostname to its normal form.
func (c *Crawler) parseVerticesFile(reader io.Reader, limit int) ([]string, error) {
	// Try to detect if it's gzipped
	var bodyReader io.Reader

	// Create a buffered reader so we can peek
	bufReader := bufio.NewReader(reader)
	peekBytes, err := bufReader.Peek(2)
	if err != nil && err != io.EOF {
		return nil, fmt.Errorf("failed to peek at file: %v", err)
	}

	// Check for gzip magic number (0x1f 0x8b)
	if len(peekBytes) >= 2 && peekBytes[0] == 0x1f && peekBytes[1] == 0x8b {
		gzReader, err := gzip.NewReader(bufReader)
		if err != nil {
			return nil, fmt.Errorf("failed to create gzip reader: %v", err)
		}
		defer gzReader.Close()
		bodyReader = gzReader
	} else {
		bodyReader = bufReader
	}

	hosts := make([]string, 0)
	scanner := bufio.NewScanner(bodyReader)

	// Set a larger buffer for scanning
	buf := make([]byte, 0, 64*1024)
	scanner.Buffer(buf, 1024*1024)

	count := 0
	for scanner.Scan() {
		if limit > 0 && count >= limit {
			break
		}

		line := scanner.Text()
		// Vertices file format: line_number\treverse_hostname\tinteger
		// Example: 0\tcom.example\t42
		parts := strings.Split(line, "\t")
		if len(parts) >= 2 {
			reverseHostName := strings.TrimSpace(parts[1])
			if reverseHostName != "" {
				// Convert from reverse notation (com.example) to normal (example.com)
				normalHost := reverseHost(reverseHostName)
				hosts = append(hosts, normalHost)
				count++
			}
		}
	}

	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error reading file: %v", err)
	}

	return hosts, nil
}

// openTLDFile closes the current per-TLD output file (sorting and
// deduplicating it first) and opens feeds/<tld>.feed for appending.
func (c *Crawler) openTLDFile(tld string) error {
	// Close previous file if open
	if c.tldFile != nil {
		c.sortAndDeduplicateTLDFile()
		c.tldFile.Close()
		c.tldFile = nil
		c.tldFeeds = make(map[string]bool)
	}

	// Open new file
	if tld != "" {
		filename := "feeds/" + tld + ".feed"
		file, err := os.OpenFile(filename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
		if err != nil {
			return fmt.Errorf("failed to open TLD file %s: %v", filename, err)
		}
		c.tldFile = file
		c.currentTLD = tld
	}

	return nil
}

// sortAndDeduplicateTLDFile rewrites the current TLD file with its
// entries deduplicated and sorted.
func (c *Crawler) sortAndDeduplicateTLDFile() {
	if c.currentTLD == "" {
		return
	}

	filename := "feeds/" + c.currentTLD + ".feed"

	// Read all lines from the file
	file, err := os.Open(filename)
	if err != nil {
		return
	}

	feedSet := make(map[string]bool)
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line != "" {
			feedSet[line] = true
		}
	}
	file.Close()

	// Sort the unique feeds
	feeds := make([]string, 0, len(feedSet))
	for feed := range feedSet {
		feeds = append(feeds, feed)
	}
	sort.Strings(feeds)

	// Write back to file
	file, err = os.Create(filename)
	if err != nil {
		return
	}
	defer file.Close()

	writer := bufio.NewWriter(file)
	for _, feed := range feeds {
		writer.WriteString(feed + "\n")
	}
	writer.Flush()
}

// writeFeedToTLDFile appends a feed URL to the file for the host's TLD,
// switching output files when the TLD changes.
func (c *Crawler) writeFeedToTLDFile(feedURL, host string) {
	c.tldMu.Lock()
	defer c.tldMu.Unlock()

	tld := getTLD(host)

	// Check if TLD changed
	if tld != c.currentTLD {
		c.openTLDFile(tld)
	}

	// Write feed to file if not already written
	if c.tldFile != nil && !c.tldFeeds[feedURL] {
		c.tldFile.WriteString(feedURL + "\n")
		c.tldFeeds[feedURL] = true
	}
}

// Crawl crawls a single start URL and returns the feeds found so far.
func (c *Crawler) Crawl(startURL string) ([]Feed, error) {
	pagesVisited := 0
	c.crawlPage(startURL, 0, make(map[string]bool), &pagesVisited)
	return c.feeds, nil
}

// CrawlHosts crawls each host using a pool of workers sized to the
// number of available CPUs.
func (c *Crawler) CrawlHosts(hosts []string) ([]Feed, error) {
	numWorkers := runtime.NumCPU() - 1
	if numWorkers < 1 {
		numWorkers = 1
	}

	hostChan := make(chan string, numWorkers*2)
	var wg sync.WaitGroup

	// Start workers
	for i := 0; i < numWorkers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for host := range hostChan {
				c.crawlHost(host)
			}
		}()
	}

	// Send hosts to workers
	for _, host := range hosts {
		hostChan <- host
	}

	close(hostChan)
	wg.Wait()

	// Close final TLD file
	c.tldMu.Lock()
	c.openTLDFile("")
	c.tldMu.Unlock()

	return c.feeds, nil
}

// crawlHost crawls one host and writes any feeds found for it to the
// per-TLD output file.
func (c *Crawler) crawlHost(host string) {
	atomic.AddInt32(&c.hostsProcessed, 1)

	hostFeeds := make([]Feed, 0)
	localVisited := make(map[string]bool)
	pagesVisited := 0

	// Try both http and https
	urls := []string{
		"https://" + host,
		"http://" + host,
	}

	for _, url := range urls {
		c.crawlPage(url, 0, localVisited, &pagesVisited)
		break // Only the first (https) URL is attempted
	}

	// Collect feeds found for this host
	c.feedsMu.Lock()
	for _, feed := range c.feeds {
		// Check if feed belongs to this host
		feedHost := ""
		if u, err := url.Parse(feed.URL); err == nil {
			feedHost = u.Host
		}
		if feedHost == host || strings.HasSuffix(feedHost, "."+host) {
			hostFeeds = append(hostFeeds, feed)
		}
	}
	c.feedsMu.Unlock()

	// Print and write feeds found for this host
	if len(hostFeeds) > 0 {
		for _, feed := range hostFeeds {
			fmt.Printf("%s\n", feed.URL)
			c.writeFeedToTLDFile(feed.URL, host)
		}
	}
}

// crawlPage fetches a page, records it if it is itself a feed, collects
// feed links advertised in the HTML, and recurses into same-host links
// up to the configured depth and page limits.
func (c *Crawler) crawlPage(pageURL string, depth int, localVisited map[string]bool, pagesVisited *int) {
	if *pagesVisited >= c.MaxPagesPerHost || depth > c.MaxDepth {
		return
	}

	if localVisited[pageURL] {
		return
	}

	// Check global visited
	if _, visited := c.visited.LoadOrStore(pageURL, true); visited {
		return
	}

	localVisited[pageURL] = true
	*pagesVisited++

	body, contentType, err := c.fetchPage(pageURL)
	if err != nil {
		return
	}

	// Check if this page itself is a feed
	if c.isFeedContent(body, contentType) {
		feedType := c.detectFeedType(body)
		c.addFeed(pageURL, feedType)

		// Extract links from the feed and crawl them
		feedLinks := c.extractLinksFromFeed(body, feedType)

		for _, link := range feedLinks {
			c.crawlPage(link, depth+1, localVisited, pagesVisited)
		}
		return
	}

	// Parse HTML and look for feed links
	doc, err := html.Parse(strings.NewReader(body))
	if err != nil {
		return
	}

	// Find feed links in <link> tags
	feedLinks := c.extractFeedLinks(doc, pageURL)
	for _, feed := range feedLinks {
		c.addFeed(feed.URL, feed.Type)
	}

	// Find feed links in anchor tags
	anchorFeeds := c.extractAnchorFeeds(doc, pageURL)
	for _, feed := range anchorFeeds {
		c.addFeed(feed.URL, feed.Type)
	}

	// Extract all links for further crawling
	if depth < c.MaxDepth {
		links := c.extractLinks(doc, pageURL)
		for _, link := range links {
			if c.shouldCrawl(link, pageURL) {
				c.crawlPage(link, depth+1, localVisited, pagesVisited)
			}
		}
	}
}

// fetchPage performs a GET request and returns the response body and
// Content-Type header.
func (c *Crawler) fetchPage(pageURL string) (string, string, error) {
	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		return "", "", err
	}
	req.Header.Set("User-Agent", c.UserAgent)

	resp, err := c.client.Do(req)
	if err != nil {
		return "", "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", "", fmt.Errorf("status code: %d", resp.StatusCode)
	}

	bodyBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", "", err
	}

	contentType := resp.Header.Get("Content-Type")
	return string(bodyBytes), contentType, nil
}

// isFeedContent reports whether a response looks like an RSS or Atom feed,
// judging by the Content-Type header or the XML prologue.
func (c *Crawler) isFeedContent(body, contentType string) bool {
	if strings.Contains(contentType, "application/rss+xml") ||
		strings.Contains(contentType, "application/atom+xml") ||
		strings.Contains(contentType, "application/xml") ||
		strings.Contains(contentType, "text/xml") {
		return true
	}

	body = strings.TrimSpace(body)
	if strings.HasPrefix(body, "<?xml") {
		if strings.Contains(body, "<rss") || strings.Contains(body, "<feed") {
			return true
		}
	}
	return false
}

// detectFeedType returns "rss", "atom", or "unknown" based on the feed body.
func (c *Crawler) detectFeedType(body string) string {
	if strings.Contains(body, "<rss") {
		return "rss"
	}
	if strings.Contains(body, "<feed") && strings.Contains(body, "xmlns=\"http://www.w3.org/2005/Atom\"") {
		return "atom"
	}
	return "unknown"
}

// extractLinksFromFeed returns the item/entry links contained in an RSS or
// Atom document.
func (c *Crawler) extractLinksFromFeed(body, feedType string) []string {
	links := make([]string, 0)

	switch feedType {
	case "rss":
		var rss RSS
		if err := xml.Unmarshal([]byte(body), &rss); err != nil {
			return links
		}

		for _, item := range rss.Channel.Items {
			if item.Link != "" {
				links = append(links, item.Link)
			}
		}

	case "atom":
		var atom AtomFeed
		if err := xml.Unmarshal([]byte(body), &atom); err != nil {
			return links
		}

		for _, entry := range atom.Entries {
			for _, link := range entry.Links {
				if link.Rel == "" || link.Rel == "alternate" {
					if link.Href != "" {
						links = append(links, link.Href)
					}
				}
			}
		}
	}

	return links
}

// extractFeedLinks collects feeds advertised via <link rel="alternate"> tags.
func (c *Crawler) extractFeedLinks(n *html.Node, baseURL string) []Feed {
	feeds := make([]Feed, 0)
	var f func(*html.Node)

	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "link" {
			var rel, href, typeAttr string
			for _, attr := range n.Attr {
				switch attr.Key {
				case "rel":
					rel = attr.Val
				case "href":
					href = attr.Val
				case "type":
					typeAttr = attr.Val
				}
			}

			if rel == "alternate" && (typeAttr == "application/rss+xml" || typeAttr == "application/atom+xml") {
				absURL := c.makeAbsoluteURL(href, baseURL)
				feedType := "rss"
				if typeAttr == "application/atom+xml" {
					feedType = "atom"
				}
				feeds = append(feeds, Feed{URL: absURL, Type: feedType})
			}
		}
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			f(child)
		}
	}
	f(n)
	return feeds
}

// extractAnchorFeeds collects anchors whose href looks like a feed URL
// (contains "rss", "atom", or "feed").
func (c *Crawler) extractAnchorFeeds(n *html.Node, baseURL string) []Feed {
	feeds := make([]Feed, 0)
	feedPattern := regexp.MustCompile(`(?i)(rss|atom|feed)`)

	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, attr := range n.Attr {
				if attr.Key == "href" {
					href := attr.Val
					if feedPattern.MatchString(href) {
						absURL := c.makeAbsoluteURL(href, baseURL)
						feeds = append(feeds, Feed{URL: absURL, Type: "unknown"})
					}
				}
			}
		}
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			f(child)
		}
	}
	f(n)
	return feeds
}

// extractLinks returns all anchor hrefs resolved against the base URL.
func (c *Crawler) extractLinks(n *html.Node, baseURL string) []string {
	links := make([]string, 0)
	var f func(*html.Node)

	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, attr := range n.Attr {
				if attr.Key == "href" {
					link := c.makeAbsoluteURL(attr.Val, baseURL)
					links = append(links, link)
				}
			}
		}
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			f(child)
		}
	}
	f(n)
	return links
}

// makeAbsoluteURL resolves href against baseURL, returning href unchanged
// if either URL fails to parse.
func (c *Crawler) makeAbsoluteURL(href, baseURL string) string {
	base, err := url.Parse(baseURL)
	if err != nil {
		return href
	}

	link, err := url.Parse(href)
	if err != nil {
		return href
	}

	return base.ResolveReference(link).String()
}

// shouldCrawl reports whether link is on the same host as baseURL.
func (c *Crawler) shouldCrawl(link, baseURL string) bool {
	linkURL, err := url.Parse(link)
	if err != nil {
		return false
	}

	baseURLParsed, err := url.Parse(baseURL)
	if err != nil {
		return false
	}

	return linkURL.Host == baseURLParsed.Host
}

// addFeed records a feed URL if it has not been seen before.
func (c *Crawler) addFeed(feedURL, feedType string) {
	c.feedsMu.Lock()
	defer c.feedsMu.Unlock()

	for _, f := range c.feeds {
		if f.URL == feedURL {
			return
		}
	}

	feed := Feed{URL: feedURL, Type: feedType}
	c.feeds = append(c.feeds, feed)
}

 func main() {
-	rand.Seed(time.Now().UnixNano())
-
-	crawler := NewCrawler()
-
-	hosts, err := crawler.GetCommonCrawlHostsFromFile("vertices.txt.gz", 0)
+	crawler, err := NewCrawler("feeds.db")
 	if err != nil {
-		fmt.Printf("Error reading hosts from file: %v\n", err)
-		return
+		fmt.Fprintf(os.Stderr, "Error initializing crawler: %v\n", err)
+		os.Exit(1)
 	}
+	defer crawler.Close()

-	feeds, err := crawler.CrawlHosts(hosts)
-	if err != nil {
-		fmt.Printf("Error: %v\n", err)
-		return
+	// Start dashboard in background
+	go func() {
+		if err := crawler.StartDashboard("0.0.0.0:4321"); err != nil {
+			fmt.Fprintf(os.Stderr, "Dashboard error: %v\n", err)
+		}
+	}()
+
+	// Import domains from vertices file (only adds new ones as "uncrawled")
+	crawler.ImportDomainsFromFile("vertices.txt.gz", 0)
+
+	// Crawl all uncrawled domains (runs continuously)
+	for {
+		crawler.CrawlUncrawledDomains()
+	}
-
-	fmt.Printf("=== Total feeds found: %d ===\n", len(feeds))
 }