commit f4cae127cc5989d313087a83eca04f7161e71a7c
Author: primal
Date:   Thu Jan 22 15:15:30 2026 -0500

    Add feed crawler with documentation

    - main.go: RSS/Atom feed crawler using Common Crawl data
    - CLAUDE.md: Project documentation for Claude Code
    - .gitignore: Ignore binary and go.* files
    - Feed output now written to feed/ directory

    Co-Authored-By: Claude Opus 4.5

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7752a63
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+1440.news
+go.*
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..4c4dd88
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,51 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+1440.news is a Go-based web feed crawler that discovers and catalogs RSS and Atom feeds from websites. It processes hosts from Common Crawl data (vertices.txt.gz) and outputs discovered feeds organized by TLD into `.feed` files.
+
+## Build & Run Commands
+
+```bash
+# Build
+go build -o 1440.news main.go
+
+# Run (requires vertices.txt.gz in the working directory)
+./1440.news
+
+# Format code
+go fmt ./...
+
+# Static analysis
+go vet ./...
+```
+
+## Architecture
+
+**Single-file application** (`main.go`, ~656 lines) with these key components:
+
+- `Crawler` struct - Core engine managing HTTP client, concurrency, and state
+- `Feed` struct - Simple URL + Type (rss/atom) structure
+- RSS/Atom parsing structs for XML deserialization
+
+**Concurrency model:**
+- Worker pool pattern with `runtime.NumCPU() - 1` goroutines
+- `sync.Map` for thread-safe global URL deduplication
+- `sync.Mutex` for feed collection and TLD file operations
+
+**Key functions:**
+- `CrawlHosts()` - Main entry point, coordinates worker pool
+- `crawlHost()` - Processes a single host (tries HTTPS then HTTP)
+- `crawlPage()` - Recursive page crawler with depth/page limits
+- `extractFeedLinks()` - Finds `<link>` feed references
+- `extractAnchorFeeds()` - Finds anchor tags with rss/atom/feed in href
+
+**Configuration (hardcoded in `NewCrawler()`):**
+- MaxDepth: 10, MaxPagesPerHost: 10, Timeout: 10s
+- UserAgent: "FeedCrawler/1.0"
+- Max redirects: 10
+
+**Input:** Common Crawl vertices file (gzipped TSV with reverse domain notation)
+**Output:** TLD-specific `.feed` files written to the `feed/` directory (e.g., `com.feed`, `org.feed`), containing sorted, deduplicated feed URLs
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..c9a5782
--- /dev/null
+++ b/main.go
@@ -0,0 +1,655 @@
+package main
+
+import (
+    "bufio"
+    "compress/gzip"
+    "encoding/xml"
+    "fmt"
+    "io"
+    "math/rand"
+    "net/http"
+    "net/url"
+    "os"
+    "regexp"
+    "runtime"
+    "sort"
+    "strings"
+    "sync"
+    "sync/atomic"
+    "time"
+
+    "golang.org/x/net/html"
+)
+
+type Feed struct {
+    URL  string
+    Type string // "rss" or "atom"
+}
+
+// RSS structs
+type RSS struct {
+    Channel Channel `xml:"channel"`
+}
+
+type Channel struct {
+    Items []RSSItem `xml:"item"`
+}
+
+type RSSItem struct {
+    Link string `xml:"link"`
+}
+
+// Atom structs
+type AtomFeed struct {
+    Entries []AtomEntry `xml:"entry"`
+}
+
+type AtomEntry struct {
+    Links []AtomLink `xml:"link"`
+}
+
+type AtomLink struct {
+    Href string `xml:"href,attr"`
+    Rel  string `xml:"rel,attr"`
+}
+
+type Crawler struct {
+    MaxDepth        int
+    MaxPagesPerHost int
+    Timeout         time.Duration
+    UserAgent       string
+    visited         sync.Map
+    feeds           []Feed
+    feedsMu         sync.Mutex
+    client          *http.Client
+    hostsProcessed  int32
+
+    // TLD file management
+    currentTLD string
+    tldFile    *os.File
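+    // tldFeeds records feed URLs already appended to the currently open TLD
+    // file so writeFeedToTLDFile does not write the same feed twice.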
+    tldFeeds   map[string]bool
+    tldMu      sync.Mutex
+}
+
+func NewCrawler() *Crawler {
+    return &Crawler{
+        MaxDepth:        10,
+        MaxPagesPerHost: 10,
+        Timeout:         10 * time.Second,
+        UserAgent:       "FeedCrawler/1.0",
+        feeds:           make([]Feed, 0),
+        tldFeeds:        make(map[string]bool),
+        client: &http.Client{
+            Timeout: 10 * time.Second,
+            CheckRedirect: func(req *http.Request, via []*http.Request) error {
+                if len(via) >= 10 {
+                    return fmt.Errorf("stopped after 10 redirects")
+                }
+                return nil
+            },
+        },
+    }
+}
+
+// reverseHost converts a reverse domain notation back to normal
+// e.g., "com.example.www" -> "www.example.com"
+func reverseHost(reverseHost string) string {
+    parts := strings.Split(reverseHost, ".")
+    // Reverse the parts
+    for i, j := 0, len(parts)-1; i < j; i, j = i+1, j-1 {
+        parts[i], parts[j] = parts[j], parts[i]
+    }
+    return strings.Join(parts, ".")
+}
+
+// getTLD extracts the TLD from a hostname
+func getTLD(host string) string {
+    parts := strings.Split(host, ".")
+    if len(parts) > 0 {
+        return parts[len(parts)-1]
+    }
+    return ""
+}
+
+func (c *Crawler) GetCommonCrawlHostsFromFile(filename string, limit int) ([]string, error) {
+    file, err := os.Open(filename)
+    if err != nil {
+        return nil, fmt.Errorf("failed to open file: %v", err)
+    }
+    defer file.Close()
+
+    hosts, err := c.parseVerticesFile(file, limit)
+    if err != nil {
+        return nil, fmt.Errorf("failed to parse vertices: %v", err)
+    }
+
+    // Randomize the order
+    rand.Shuffle(len(hosts), func(i, j int) {
+        hosts[i], hosts[j] = hosts[j], hosts[i]
+    })
+
+    return hosts, nil
+}
+
+func (c *Crawler) parseVerticesFile(reader io.Reader, limit int) ([]string, error) {
+    // Try to detect if it's gzipped
+    var bodyReader io.Reader
+
+    // Create a buffered reader so we can peek
+    bufReader := bufio.NewReader(reader)
+    peekBytes, err := bufReader.Peek(2)
+    if err != nil && err != io.EOF {
+        return nil, fmt.Errorf("failed to peek at file: %v", err)
+    }
+
+    // Check for gzip magic number (0x1f 0x8b)
+    if len(peekBytes) >= 2 && peekBytes[0] == 0x1f && peekBytes[1] == 0x8b {
+        gzReader, err := gzip.NewReader(bufReader)
+        if err != nil {
+            return nil, fmt.Errorf("failed to create gzip reader: %v", err)
+        }
+        defer gzReader.Close()
+        bodyReader = gzReader
+    } else {
+        bodyReader = bufReader
+    }
+
+    hosts := make([]string, 0)
+    scanner := bufio.NewScanner(bodyReader)
+
+    // Set a larger buffer for scanning
+    buf := make([]byte, 0, 64*1024)
+    scanner.Buffer(buf, 1024*1024)
+
+    count := 0
+    for scanner.Scan() {
+        if limit > 0 && count >= limit {
+            break
+        }
+
+        line := scanner.Text()
+        // Vertices file format: line_number\treverse_hostname\tinteger
+        // Example: 0\tcom.example\t42
+        parts := strings.Split(line, "\t")
+        if len(parts) >= 2 {
+            reverseHostName := strings.TrimSpace(parts[1])
+            if reverseHostName != "" {
+                // Convert from reverse notation (com.example) to normal (example.com)
+                normalHost := reverseHost(reverseHostName)
+                hosts = append(hosts, normalHost)
+                count++
+            }
+        }
+    }
+
+    if err := scanner.Err(); err != nil {
+        return nil, fmt.Errorf("error reading file: %v", err)
+    }
+
+    return hosts, nil
+}
+
+func (c *Crawler) openTLDFile(tld string) error {
+    // Close previous file if open
+    if c.tldFile != nil {
+        c.sortAndDeduplicateTLDFile()
+        c.tldFile.Close()
+        c.tldFile = nil
+        c.tldFeeds = make(map[string]bool)
+    }
+
+    // Open new file
+    if tld != "" {
+        filename := "feed/" + tld + ".feed"
+        file, err := os.OpenFile(filename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
+        if err != nil {
+            return fmt.Errorf("failed to open TLD file %s: %v", filename, err)
+        }
+        c.tldFile = file
+        c.currentTLD = tld
+    }
+
+    return nil
+}
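+
+// sortAndDeduplicateTLDFile rewrites the current TLD's .feed file with its
+// entries sorted and duplicates removed; openTLDFile calls it before the
+// previous TLD file is closed.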
+func (c *Crawler) sortAndDeduplicateTLDFile() {
+    if c.currentTLD == "" {
+        return
+    }
+
+    filename := "feed/" + c.currentTLD + ".feed"
+
+    // Read all lines from the file
+    file, err := os.Open(filename)
+    if err != nil {
+        return
+    }
+
+    feedSet := make(map[string]bool)
+    scanner := bufio.NewScanner(file)
+    for scanner.Scan() {
+        line := strings.TrimSpace(scanner.Text())
+        if line != "" {
+            feedSet[line] = true
+        }
+    }
+    file.Close()
+
+    // Sort the unique feeds
+    feeds := make([]string, 0, len(feedSet))
+    for feed := range feedSet {
+        feeds = append(feeds, feed)
+    }
+    sort.Strings(feeds)
+
+    // Write back to file
+    file, err = os.Create(filename)
+    if err != nil {
+        return
+    }
+    defer file.Close()
+
+    writer := bufio.NewWriter(file)
+    for _, feed := range feeds {
+        writer.WriteString(feed + "\n")
+    }
+    writer.Flush()
+}
+
+func (c *Crawler) writeFeedToTLDFile(feedURL, host string) {
+    c.tldMu.Lock()
+    defer c.tldMu.Unlock()
+
+    tld := getTLD(host)
+
+    // Check if TLD changed
+    if tld != c.currentTLD {
+        c.openTLDFile(tld)
+    }
+
+    // Write feed to file if not already written
+    if c.tldFile != nil && !c.tldFeeds[feedURL] {
+        c.tldFile.WriteString(feedURL + "\n")
+        c.tldFeeds[feedURL] = true
+    }
+}
+
+func (c *Crawler) Crawl(startURL string) ([]Feed, error) {
+    pagesVisited := 0
+    c.crawlPage(startURL, 0, make(map[string]bool), &pagesVisited)
+    return c.feeds, nil
+}
+
+func (c *Crawler) CrawlHosts(hosts []string) ([]Feed, error) {
+    numWorkers := runtime.NumCPU() - 1
+    if numWorkers < 1 {
+        numWorkers = 1
+    }
+
+    hostChan := make(chan string, numWorkers*2)
+    var wg sync.WaitGroup
+
+    // Start workers
+    for i := 0; i < numWorkers; i++ {
+        wg.Add(1)
+        go func() {
+            defer wg.Done()
+            for host := range hostChan {
+                c.crawlHost(host)
+            }
+        }()
+    }
+
+    // Send hosts to workers
+    for _, host := range hosts {
+        hostChan <- host
+    }
+
+    close(hostChan)
+    wg.Wait()
+
+    // Close final TLD file
+    c.tldMu.Lock()
+    c.openTLDFile("")
+    c.tldMu.Unlock()
+
+    return c.feeds, nil
+}
+
+func (c *Crawler) crawlHost(host string) {
+    atomic.AddInt32(&c.hostsProcessed, 1)
+
+    hostFeeds := make([]Feed, 0)
+    localVisited := make(map[string]bool)
+    pagesVisited := 0
+
+    // Try both http and https
+    urls := []string{
+        "https://" + host,
+        "http://" + host,
+    }
+
+    for _, url := range urls {
+        c.crawlPage(url, 0, localVisited, &pagesVisited)
+        break // If first succeeds, don't try second
+    }
+
+    // Collect feeds found for this host
+    c.feedsMu.Lock()
+    for _, feed := range c.feeds {
+        // Check if feed belongs to this host
+        feedHost := ""
+        if u, err := url.Parse(feed.URL); err == nil {
+            feedHost = u.Host
+        }
+        if feedHost == host || strings.HasSuffix(feedHost, "."+host) {
+            hostFeeds = append(hostFeeds, feed)
+        }
+    }
+    c.feedsMu.Unlock()
+
+    // Print and write feeds found for this host
+    if len(hostFeeds) > 0 {
+        for _, feed := range hostFeeds {
+            fmt.Printf("%s\n", feed.URL)
+            c.writeFeedToTLDFile(feed.URL, host)
+        }
+    }
+}
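+
+// crawlPage fetches a single URL, records any feeds discovered on it, and
+// recursively follows links until MaxDepth or MaxPagesPerHost is reached.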
+func (c *Crawler) crawlPage(pageURL string, depth int, localVisited map[string]bool, pagesVisited *int) {
+    if *pagesVisited >= c.MaxPagesPerHost || depth > c.MaxDepth {
+        return
+    }
+
+    if localVisited[pageURL] {
+        return
+    }
+
+    // Check global visited
+    if _, visited := c.visited.LoadOrStore(pageURL, true); visited {
+        return
+    }
+
+    localVisited[pageURL] = true
+    *pagesVisited++
+
+    body, contentType, err := c.fetchPage(pageURL)
+    if err != nil {
+        return
+    }
+
+    // Check if this page itself is a feed
+    if c.isFeedContent(body, contentType) {
+        feedType := c.detectFeedType(body)
+        c.addFeed(pageURL, feedType)
+
+        // Extract links from the feed and crawl them
+        feedLinks := c.extractLinksFromFeed(body, feedType)
+
+        for _, link := range feedLinks {
+            c.crawlPage(link, depth+1, localVisited, pagesVisited)
+        }
+        return
+    }
+
+    // Parse HTML and look for feed links
+    doc, err := html.Parse(strings.NewReader(body))
+    if err != nil {
+        return
+    }
+
+    // Find feed links in <link> tags
+    feedLinks := c.extractFeedLinks(doc, pageURL)
+    for _, feed := range feedLinks {
+        c.addFeed(feed.URL, feed.Type)
+    }
+
+    // Find feed links in anchor tags
+    anchorFeeds := c.extractAnchorFeeds(doc, pageURL)
+    for _, feed := range anchorFeeds {
+        c.addFeed(feed.URL, feed.Type)
+    }
+
+    // Extract all links for further crawling
+    if depth < c.MaxDepth {
+        links := c.extractLinks(doc, pageURL)
+        for _, link := range links {
+            if c.shouldCrawl(link, pageURL) {
+                c.crawlPage(link, depth+1, localVisited, pagesVisited)
+            }
+        }
+    }
+}
+
+func (c *Crawler) fetchPage(pageURL string) (string, string, error) {
+    req, err := http.NewRequest("GET", pageURL, nil)
+    if err != nil {
+        return "", "", err
+    }
+    req.Header.Set("User-Agent", c.UserAgent)
+
+    resp, err := c.client.Do(req)
+    if err != nil {
+        return "", "", err
+    }
+    defer resp.Body.Close()
+
+    if resp.StatusCode != http.StatusOK {
+        return "", "", fmt.Errorf("status code: %d", resp.StatusCode)
+    }
+
+    bodyBytes, err := io.ReadAll(resp.Body)
+    if err != nil {
+        return "", "", err
+    }
+
+    contentType := resp.Header.Get("Content-Type")
+    return string(bodyBytes), contentType, nil
+}
+
+func (c *Crawler) isFeedContent(body, contentType string) bool {
+    if strings.Contains(contentType, "application/rss+xml") ||
+        strings.Contains(contentType, "application/atom+xml") ||
+        strings.Contains(contentType, "application/xml") ||
+        strings.Contains(contentType, "text/xml") {
+        return true
+    }
+
+    body = strings.TrimSpace(body)
+    if strings.HasPrefix(body, "