crawler/cmd/extract-html/main.go
primal 091fa8490b Filter text/html extraction by feed-like URL patterns
Reduces from ~2B URLs to ~2-3M by filtering for URLs containing:
rss, feed, atom, xml, syndication, frontpage, newest, etc.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 14:30:40 -05:00

// extract-html queries local parquet files for text/html feeds
// and appends them to the TSV file for bulk import
package main

import (
	"bufio"
	"database/sql"
	"fmt"
	"os"
	"path/filepath"
	"time"

	_ "github.com/duckdb/duckdb-go/v2"
)

func main() {
	parquetDir := "cdx-data/parquet/CC-MAIN-2026-04"
	outputFile := "cdx-data/CC-MAIN-2026-04-html.tsv"

	// Find all parquet files
	matches, err := filepath.Glob(filepath.Join(parquetDir, "*.parquet"))
	if err != nil {
		fmt.Printf("Error finding parquet files: %v\n", err)
		os.Exit(1)
	}
	fmt.Printf("Found %d parquet files\n", len(matches))

	// Open output file
	f, err := os.Create(outputFile)
	if err != nil {
		fmt.Printf("Error creating output file: %v\n", err)
		os.Exit(1)
	}
	defer f.Close()
	writer := bufio.NewWriter(f)
	defer writer.Flush()

	startTime := time.Now()
	var totalFeeds int

	// Process each file
	for i, parquetFile := range matches {
		count, err := queryParquet(parquetFile, writer)
		if err != nil {
			fmt.Printf("[%d/%d] %s - error: %v\n", i+1, len(matches), filepath.Base(parquetFile), err)
			continue
		}
		totalFeeds += count

		elapsed := time.Since(startTime)
		rate := float64(i+1) / elapsed.Seconds() * 60
		fmt.Printf("[%d/%d] %s - %d feeds (total: %d, %.1f files/min)\n",
			i+1, len(matches), filepath.Base(parquetFile), count, totalFeeds, rate)
	}

	fmt.Printf("\nComplete: %d text/html feeds extracted to %s in %v\n",
		totalFeeds, outputFile, time.Since(startTime).Round(time.Second))
}
func queryParquet(parquetFile string, writer *bufio.Writer) (int, error) {
	db, err := sql.Open("duckdb", "")
	if err != nil {
		return 0, fmt.Errorf("duckdb open failed: %w", err)
	}
	defer db.Close()

	// Query for text/html records whose URL suggests the page might be a feed.
	// This filters each parquet file from ~7M rows down to hopefully a few thousand.
	query := fmt.Sprintf(`
		SELECT DISTINCT url, url_host_name, content_mime_detected
		FROM read_parquet('%s')
		WHERE fetch_status = 200
		  AND content_mime_detected = 'text/html'
		  AND (
		    -- URL path contains feed-related keywords
		    LOWER(url) LIKE '%%/rss%%'
		    OR LOWER(url) LIKE '%%/feed%%'
		    OR LOWER(url) LIKE '%%/atom%%'
		    OR LOWER(url) LIKE '%%.rss%%'
		    OR LOWER(url) LIKE '%%.xml%%'
		    OR LOWER(url) LIKE '%%.atom%%'
		    OR LOWER(url) LIKE '%%/feeds/%%'
		    OR LOWER(url) LIKE '%%/syndication%%'
		    OR LOWER(url) LIKE '%%format=rss%%'
		    OR LOWER(url) LIKE '%%format=atom%%'
		    OR LOWER(url) LIKE '%%type=rss%%'
		    OR LOWER(url) LIKE '%%/frontpage%%'
		    OR LOWER(url) LIKE '%%/newest%%'
		  )
	`, parquetFile)

	rows, err := db.Query(query)
	if err != nil {
		return 0, fmt.Errorf("query failed: %w", err)
	}
	defer rows.Close()

	count := 0
	for rows.Next() {
		var feedURL, host, mime string
		if err := rows.Scan(&feedURL, &host, &mime); err != nil {
			continue
		}
		fmt.Fprintf(writer, "%s\t%s\t%s\n", feedURL, host, mime)
		count++
	}
	return count, rows.Err()
}
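
As a quick aside (not part of the committed file), the same URL heuristic can be expressed as a plain Go predicate. Below is a minimal sketch that assumes only the pattern list from the query above; the program name, the looksLikeFeedURL helper, and the example.com URLs are illustrative, and it can be handy for spot-checking lines in the generated TSV.

// feedurlcheck is a standalone sketch that mirrors the SQL LIKE filter as a
// Go predicate; LIKE '%x%' on the lowercased URL is a plain substring match.
package main

import (
	"fmt"
	"strings"
)

// looksLikeFeedURL reports whether the lowercased URL contains any of the
// feed-like substrings the query matches.
func looksLikeFeedURL(rawURL string) bool {
	u := strings.ToLower(rawURL)
	for _, p := range []string{
		"/rss", "/feed", "/atom", ".rss", ".xml", ".atom",
		"/feeds/", "/syndication", "format=rss", "format=atom",
		"type=rss", "/frontpage", "/newest",
	} {
		if strings.Contains(u, p) {
			return true
		}
	}
	return false
}

func main() {
	for _, u := range []string{
		"https://example.com/blog/feed",             // matches "/feed"
		"https://example.com/index.php?format=rss",  // matches "format=rss"
		"https://example.com/about",                 // no feed-like token
	} {
		fmt.Printf("%v\t%s\n", looksLikeFeedURL(u), u)
	}
}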