Reduces the candidate set from ~2B URLs to ~2-3M by filtering for URLs containing feed-related keywords: rss, feed, atom, xml, syndication, frontpage, newest, etc.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

// extract-html queries local parquet files for text/html feeds
// and writes them to a TSV file for bulk import
package main

import (
	"bufio"
	"database/sql"
	"fmt"
	"os"
	"path/filepath"
	"time"

	// Blank import registers the "duckdb" driver with database/sql.
	_ "github.com/duckdb/duckdb-go/v2"
)

func main() {
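	// Input and output paths are hard-coded to the CC-MAIN-2026-04 crawl snapshot.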
	parquetDir := "cdx-data/parquet/CC-MAIN-2026-04"
	outputFile := "cdx-data/CC-MAIN-2026-04-html.tsv"

	// Find all parquet files
	matches, err := filepath.Glob(filepath.Join(parquetDir, "*.parquet"))
	if err != nil {
		fmt.Printf("Error finding parquet files: %v\n", err)
		os.Exit(1)
	}

	fmt.Printf("Found %d parquet files\n", len(matches))

	// Open output file
	f, err := os.Create(outputFile)
	if err != nil {
		fmt.Printf("Error creating output file: %v\n", err)
		os.Exit(1)
	}
	defer f.Close()

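	// Buffer TSV writes; the deferred Flush pushes any remaining rows out at exit.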
	writer := bufio.NewWriter(f)
	defer writer.Flush()

	startTime := time.Now()
	var totalFeeds int

	// Process each file
	for i, parquetFile := range matches {
		count, err := queryParquet(parquetFile, writer)
		if err != nil {
			fmt.Printf("[%d/%d] %s - error: %v\n", i+1, len(matches), filepath.Base(parquetFile), err)
			continue
		}
		totalFeeds += count

		// Report progress and throughput in files per minute
		elapsed := time.Since(startTime)
		rate := float64(i+1) / elapsed.Seconds() * 60
		fmt.Printf("[%d/%d] %s - %d feeds (total: %d, %.1f files/min)\n",
			i+1, len(matches), filepath.Base(parquetFile), count, totalFeeds, rate)
	}

fmt.Printf("\nComplete: %d text/html feeds extracted to %s in %v\n",
|
|
totalFeeds, outputFile, time.Since(startTime).Round(time.Second))
|
|
}
|
|
|
|
func queryParquet(parquetFile string, writer *bufio.Writer) (int, error) {
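	// An empty DSN opens an in-memory DuckDB database (one per parquet file).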
	db, err := sql.Open("duckdb", "")
	if err != nil {
		return 0, fmt.Errorf("duckdb open failed: %w", err)
	}
	defer db.Close()

	// Query for text/html responses whose URL patterns suggest a feed.
	// This filters each file from ~7M rows down to, hopefully, a few thousand.
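	// The doubled %% below escape to literal % once fmt.Sprintf runs; the
	// parquet path is interpolated directly, which is acceptable here since
	// it comes from our own glob rather than user input.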
	query := fmt.Sprintf(`
		SELECT DISTINCT url, url_host_name, content_mime_detected
		FROM read_parquet('%s')
		WHERE fetch_status = 200
		AND content_mime_detected = 'text/html'
		AND (
			-- URL path contains feed-related keywords
			LOWER(url) LIKE '%%/rss%%'
			OR LOWER(url) LIKE '%%/feed%%'
			OR LOWER(url) LIKE '%%/atom%%'
			OR LOWER(url) LIKE '%%.rss%%'
			OR LOWER(url) LIKE '%%.xml%%'
			OR LOWER(url) LIKE '%%.atom%%'
			OR LOWER(url) LIKE '%%/feeds/%%'
			OR LOWER(url) LIKE '%%/syndication%%'
			OR LOWER(url) LIKE '%%format=rss%%'
			OR LOWER(url) LIKE '%%format=atom%%'
			OR LOWER(url) LIKE '%%type=rss%%'
			OR LOWER(url) LIKE '%%/frontpage%%'
			OR LOWER(url) LIKE '%%/newest%%'
		)
	`, parquetFile)

	rows, err := db.Query(query)
	if err != nil {
		return 0, fmt.Errorf("query failed: %w", err)
	}
	defer rows.Close()

	count := 0
	for rows.Next() {
		var feedURL, host, mime string
		if err := rows.Scan(&feedURL, &host, &mime); err != nil {
			continue // skip rows that fail to scan rather than aborting the file
		}
		fmt.Fprintf(writer, "%s\t%s\t%s\n", feedURL, host, mime)
		count++
	}

	return count, rows.Err()
}
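
The file comment mentions bulk import. As a rough sketch of that next step (the program name, database path, and table name below are assumptions, not part of this repo), the TSV could be loaded into a persistent DuckDB table via read_csv:

// loadfeeds is a hypothetical companion program that bulk-imports the
// extracted TSV into a persistent DuckDB table.
package main

import (
	"database/sql"
	"log"

	_ "github.com/duckdb/duckdb-go/v2"
)

func main() {
	// A non-empty DSN creates or opens a database file on disk.
	db, err := sql.Open("duckdb", "cdx-data/feeds.db") // path is an assumption
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// read_csv parses the three tab-separated columns written by extract-html.
	_, err = db.Exec(`
		CREATE TABLE IF NOT EXISTS candidate_feeds AS
		SELECT * FROM read_csv('cdx-data/CC-MAIN-2026-04-html.tsv',
			delim = '\t', header = false,
			columns = {'url': 'VARCHAR', 'host': 'VARCHAR', 'mime': 'VARCHAR'})
	`)
	if err != nil {
		log.Fatal(err)
	}
}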