commit f4cae127cc5989d313087a83eca04f7161e71a7c
Author: primal
Date:   Thu Jan 22 15:15:30 2026 -0500

    Add feed crawler with documentation

    - main.go: RSS/Atom feed crawler using Common Crawl data
    - CLAUDE.md: Project documentation for Claude Code
    - .gitignore: Ignore binary and go.* files
    - Feed output now written to feed/ directory

    Co-Authored-By: Claude Opus 4.5

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7752a63
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+1440.news
+go.*
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..4c4dd88
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,51 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+1440.news is a Go-based web feed crawler that discovers and catalogs RSS and Atom feeds from websites. It processes hosts from Common Crawl data (vertices.txt.gz) and outputs discovered feeds organized by TLD into `.feed` files.
+
+## Build & Run Commands
+
+```bash
+# Build
+go build -o 1440.news main.go
+
+# Run (requires vertices.txt.gz in the working directory)
+./1440.news
+
+# Format code
+go fmt ./...
+
+# Static analysis
+go vet ./...
+```
+
+## Architecture
+
+**Single-file application** (`main.go`, ~656 lines) with these key components:
+
+- `Crawler` struct - Core engine managing HTTP client, concurrency, and state
+- `Feed` struct - Simple URL + Type (rss/atom) structure
+- RSS/Atom parsing structs for XML deserialization
+
+**Concurrency model:**
+- Worker pool pattern with `runtime.NumCPU() - 1` goroutines
+- `sync.Map` for thread-safe global URL deduplication
+- `sync.Mutex` for feed collection and TLD file operations
+
+**Key functions:**
+- `CrawlHosts()` - Main entry point, coordinates worker pool
+- `crawlHost()` - Processes a single host (tries HTTPS then HTTP)
+- `crawlPage()` - Recursive page crawler with depth/page limits
+- `extractFeedLinks()` - Finds `<link>` feed references
+- `extractAnchorFeeds()` - Finds anchor tags with rss/atom/feed in href
+
+**Configuration (hardcoded in `NewCrawler()`):**
+- MaxDepth: 10, MaxPagesPerHost: 10, Timeout: 10s
+- UserAgent: "FeedCrawler/1.0"
+- Max redirects: 10
+
+**Input:** Common Crawl vertices file (gzipped TSV with reverse domain notation)
+**Output:** TLD-specific `.feed` files written to the `feed/` directory (e.g., `com.feed`, `org.feed`), containing sorted, deduplicated feed URLs
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..c9a5782
--- /dev/null
+++ b/main.go
@@ -0,0 +1,655 @@
+package main
+
+import (
+    "bufio"
+    "compress/gzip"
+    "encoding/xml"
+    "fmt"
+    "io"
+    "math/rand"
+    "net/http"
+    "net/url"
+    "os"
+    "regexp"
+    "runtime"
+    "sort"
+    "strings"
+    "sync"
+    "sync/atomic"
+    "time"
+
+    "golang.org/x/net/html"
+)
+
+type Feed struct {
+    URL  string
+    Type string // "rss" or "atom"
+}
+
+// RSS structs
+type RSS struct {
+    Channel Channel `xml:"channel"`
+}
+
+type Channel struct {
+    Items []RSSItem `xml:"item"`
+}
+
+type RSSItem struct {
+    Link string `xml:"link"`
+}
+
+// Atom structs
+type AtomFeed struct {
+    Entries []AtomEntry `xml:"entry"`
+}
+
+type AtomEntry struct {
+    Links []AtomLink `xml:"link"`
+}
+
+type AtomLink struct {
+    Href string `xml:"href,attr"`
+    Rel  string `xml:"rel,attr"`
+}
+
+type Crawler struct {
+    MaxDepth        int
+    MaxPagesPerHost int
+    Timeout         time.Duration
+    UserAgent       string
+    visited         sync.Map
+    feeds           []Feed
+    feedsMu         sync.Mutex
+    client          *http.Client
+    hostsProcessed  int32
+
+    // TLD file management
+    currentTLD string
+    tldFile    *os.File
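+    // tldFeeds records feed URLs already appended to the currently open TLD
+    // file so writeFeedToTLDFile does not write the same feed twice.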
+    tldFeeds   map[string]bool
+    tldMu      sync.Mutex
+}
+
+func NewCrawler() *Crawler {
+    return &Crawler{
+        MaxDepth:        10,
+        MaxPagesPerHost: 10,
+        Timeout:         10 * time.Second,
+        UserAgent:       "FeedCrawler/1.0",
+        feeds:           make([]Feed, 0),
+        tldFeeds:        make(map[string]bool),
+        client: &http.Client{
+            Timeout: 10 * time.Second,
+            CheckRedirect: func(req *http.Request, via []*http.Request) error {
+                if len(via) >= 10 {
+                    return fmt.Errorf("stopped after 10 redirects")
+                }
+                return nil
+            },
+        },
+    }
+}
+
+// reverseHost converts a reverse domain notation back to normal
+// e.g., "com.example.www" -> "www.example.com"
+func reverseHost(reverseHost string) string {
+    parts := strings.Split(reverseHost, ".")
+    // Reverse the parts
+    for i, j := 0, len(parts)-1; i < j; i, j = i+1, j-1 {
+        parts[i], parts[j] = parts[j], parts[i]
+    }
+    return strings.Join(parts, ".")
+}
+
+// getTLD extracts the TLD from a hostname
+func getTLD(host string) string {
+    parts := strings.Split(host, ".")
+    if len(parts) > 0 {
+        return parts[len(parts)-1]
+    }
+    return ""
+}
+
+func (c *Crawler) GetCommonCrawlHostsFromFile(filename string, limit int) ([]string, error) {
+    file, err := os.Open(filename)
+    if err != nil {
+        return nil, fmt.Errorf("failed to open file: %v", err)
+    }
+    defer file.Close()
+
+    hosts, err := c.parseVerticesFile(file, limit)
+    if err != nil {
+        return nil, fmt.Errorf("failed to parse vertices: %v", err)
+    }
+
+    // Randomize the order
+    rand.Shuffle(len(hosts), func(i, j int) {
+        hosts[i], hosts[j] = hosts[j], hosts[i]
+    })
+
+    return hosts, nil
+}
+
+func (c *Crawler) parseVerticesFile(reader io.Reader, limit int) ([]string, error) {
+    // Try to detect if it's gzipped
+    var bodyReader io.Reader
+
+    // Create a buffered reader so we can peek
+    bufReader := bufio.NewReader(reader)
+    peekBytes, err := bufReader.Peek(2)
+    if err != nil && err != io.EOF {
+        return nil, fmt.Errorf("failed to peek at file: %v", err)
+    }
+
+    // Check for gzip magic number (0x1f 0x8b)
+    if len(peekBytes) >= 2 && peekBytes[0] == 0x1f && peekBytes[1] == 0x8b {
+        gzReader, err := gzip.NewReader(bufReader)
+        if err != nil {
+            return nil, fmt.Errorf("failed to create gzip reader: %v", err)
+        }
+        defer gzReader.Close()
+        bodyReader = gzReader
+    } else {
+        bodyReader = bufReader
+    }
+
+    hosts := make([]string, 0)
+    scanner := bufio.NewScanner(bodyReader)
+
+    // Set a larger buffer for scanning
+    buf := make([]byte, 0, 64*1024)
+    scanner.Buffer(buf, 1024*1024)
+
+    count := 0
+    for scanner.Scan() {
+        if limit > 0 && count >= limit {
+            break
+        }
+
+        line := scanner.Text()
+        // Vertices file format: line_number\treverse_hostname\tinteger
+        // Example: 0\tcom.example\t42
+        parts := strings.Split(line, "\t")
+        if len(parts) >= 2 {
+            reverseHostName := strings.TrimSpace(parts[1])
+            if reverseHostName != "" {
+                // Convert from reverse notation (com.example) to normal (example.com)
+                normalHost := reverseHost(reverseHostName)
+                hosts = append(hosts, normalHost)
+                count++
+            }
+        }
+    }
+
+    if err := scanner.Err(); err != nil {
+        return nil, fmt.Errorf("error reading file: %v", err)
+    }
+
+    return hosts, nil
+}
+
+func (c *Crawler) openTLDFile(tld string) error {
+    // Close previous file if open
+    if c.tldFile != nil {
+        c.sortAndDeduplicateTLDFile()
+        c.tldFile.Close()
+        c.tldFile = nil
+        c.tldFeeds = make(map[string]bool)
+    }
+
+    // Open new file
+    if tld != "" {
+        filename := "feed/" + tld + ".feed"
+        file, err := os.OpenFile(filename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
+        if err != nil {
+            return fmt.Errorf("failed to open TLD file %s: %v", filename, err)
+        }
+        c.tldFile = file
+        c.currentTLD = tld
+    }
+
+    return nil
+}
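+
+// sortAndDeduplicateTLDFile rewrites the current TLD's .feed file with its
+// entries sorted and duplicates removed; openTLDFile calls it before the
+// previous TLD file is closed.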
+func (c *Crawler) sortAndDeduplicateTLDFile() {
+    if c.currentTLD == "" {
+        return
+    }
+
+    filename := "feed/" + c.currentTLD + ".feed"
+
+    // Read all lines from the file
+    file, err := os.Open(filename)
+    if err != nil {
+        return
+    }
+
+    feedSet := make(map[string]bool)
+    scanner := bufio.NewScanner(file)
+    for scanner.Scan() {
+        line := strings.TrimSpace(scanner.Text())
+        if line != "" {
+            feedSet[line] = true
+        }
+    }
+    file.Close()
+
+    // Sort the unique feeds
+    feeds := make([]string, 0, len(feedSet))
+    for feed := range feedSet {
+        feeds = append(feeds, feed)
+    }
+    sort.Strings(feeds)
+
+    // Write back to file
+    file, err = os.Create(filename)
+    if err != nil {
+        return
+    }
+    defer file.Close()
+
+    writer := bufio.NewWriter(file)
+    for _, feed := range feeds {
+        writer.WriteString(feed + "\n")
+    }
+    writer.Flush()
+}
+
+func (c *Crawler) writeFeedToTLDFile(feedURL, host string) {
+    c.tldMu.Lock()
+    defer c.tldMu.Unlock()
+
+    tld := getTLD(host)
+
+    // Check if TLD changed
+    if tld != c.currentTLD {
+        c.openTLDFile(tld)
+    }
+
+    // Write feed to file if not already written
+    if c.tldFile != nil && !c.tldFeeds[feedURL] {
+        c.tldFile.WriteString(feedURL + "\n")
+        c.tldFeeds[feedURL] = true
+    }
+}
+
+func (c *Crawler) Crawl(startURL string) ([]Feed, error) {
+    pagesVisited := 0
+    c.crawlPage(startURL, 0, make(map[string]bool), &pagesVisited)
+    return c.feeds, nil
+}
+
+func (c *Crawler) CrawlHosts(hosts []string) ([]Feed, error) {
+    numWorkers := runtime.NumCPU() - 1
+    if numWorkers < 1 {
+        numWorkers = 1
+    }
+
+    hostChan := make(chan string, numWorkers*2)
+    var wg sync.WaitGroup
+
+    // Start workers
+    for i := 0; i < numWorkers; i++ {
+        wg.Add(1)
+        go func() {
+            defer wg.Done()
+            for host := range hostChan {
+                c.crawlHost(host)
+            }
+        }()
+    }
+
+    // Send hosts to workers
+    for _, host := range hosts {
+        hostChan <- host
+    }
+
+    close(hostChan)
+    wg.Wait()
+
+    // Close final TLD file
+    c.tldMu.Lock()
+    c.openTLDFile("")
+    c.tldMu.Unlock()
+
+    return c.feeds, nil
+}
+
+func (c *Crawler) crawlHost(host string) {
+    atomic.AddInt32(&c.hostsProcessed, 1)
+
+    hostFeeds := make([]Feed, 0)
+    localVisited := make(map[string]bool)
+    pagesVisited := 0
+
+    // Try both http and https
+    urls := []string{
+        "https://" + host,
+        "http://" + host,
+    }
+
+    for _, url := range urls {
+        c.crawlPage(url, 0, localVisited, &pagesVisited)
+        break // If first succeeds, don't try second
+    }
+
+    // Collect feeds found for this host
+    c.feedsMu.Lock()
+    for _, feed := range c.feeds {
+        // Check if feed belongs to this host
+        feedHost := ""
+        if u, err := url.Parse(feed.URL); err == nil {
+            feedHost = u.Host
+        }
+        if feedHost == host || strings.HasSuffix(feedHost, "."+host) {
+            hostFeeds = append(hostFeeds, feed)
+        }
+    }
+    c.feedsMu.Unlock()
+
+    // Print and write feeds found for this host
+    if len(hostFeeds) > 0 {
+        for _, feed := range hostFeeds {
+            fmt.Printf("%s\n", feed.URL)
+            c.writeFeedToTLDFile(feed.URL, host)
+        }
+    }
+}
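+
+// crawlPage fetches a single URL, records any feeds discovered on it, and
+// recursively follows links until MaxDepth or MaxPagesPerHost is reached.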
+func (c *Crawler) crawlPage(pageURL string, depth int, localVisited map[string]bool, pagesVisited *int) {
+    if *pagesVisited >= c.MaxPagesPerHost || depth > c.MaxDepth {
+        return
+    }
+
+    if localVisited[pageURL] {
+        return
+    }
+
+    // Check global visited
+    if _, visited := c.visited.LoadOrStore(pageURL, true); visited {
+        return
+    }
+
+    localVisited[pageURL] = true
+    *pagesVisited++
+
+    body, contentType, err := c.fetchPage(pageURL)
+    if err != nil {
+        return
+    }
+
+    // Check if this page itself is a feed
+    if c.isFeedContent(body, contentType) {
+        feedType := c.detectFeedType(body)
+        c.addFeed(pageURL, feedType)
+
+        // Extract links from the feed and crawl them
+        feedLinks := c.extractLinksFromFeed(body, feedType)
+
+        for _, link := range feedLinks {
+            c.crawlPage(link, depth+1, localVisited, pagesVisited)
+        }
+        return
+    }
+
+    // Parse HTML and look for feed links
+    doc, err := html.Parse(strings.NewReader(body))
+    if err != nil {
+        return
+    }
+
+    // Find feed links in <link> tags
+    feedLinks := c.extractFeedLinks(doc, pageURL)
+    for _, feed := range feedLinks {
+        c.addFeed(feed.URL, feed.Type)
+    }
+
+    // Find feed links in anchor tags
+    anchorFeeds := c.extractAnchorFeeds(doc, pageURL)
+    for _, feed := range anchorFeeds {
+        c.addFeed(feed.URL, feed.Type)
+    }
+
+    // Extract all links for further crawling
+    if depth < c.MaxDepth {
+        links := c.extractLinks(doc, pageURL)
+        for _, link := range links {
+            if c.shouldCrawl(link, pageURL) {
+                c.crawlPage(link, depth+1, localVisited, pagesVisited)
+            }
+        }
+    }
+}
+
+func (c *Crawler) fetchPage(pageURL string) (string, string, error) {
+    req, err := http.NewRequest("GET", pageURL, nil)
+    if err != nil {
+        return "", "", err
+    }
+    req.Header.Set("User-Agent", c.UserAgent)
+
+    resp, err := c.client.Do(req)
+    if err != nil {
+        return "", "", err
+    }
+    defer resp.Body.Close()
+
+    if resp.StatusCode != http.StatusOK {
+        return "", "", fmt.Errorf("status code: %d", resp.StatusCode)
+    }
+
+    bodyBytes, err := io.ReadAll(resp.Body)
+    if err != nil {
+        return "", "", err
+    }
+
+    contentType := resp.Header.Get("Content-Type")
+    return string(bodyBytes), contentType, nil
+}
+
+func (c *Crawler) isFeedContent(body, contentType string) bool {
+    if strings.Contains(contentType, "application/rss+xml") ||
+        strings.Contains(contentType, "application/atom+xml") ||
+        strings.Contains(contentType, "application/xml") ||
+        strings.Contains(contentType, "text/xml") {
+        return true
+    }
+
+    body = strings.TrimSpace(body)
+    if strings.HasPrefix(body, "