crawler/cmd/extract-html/main.go
primal 091fa8490b Filter text/html extraction by feed-like URL patterns
Reduces from ~2B URLs to ~2-3M by filtering for URLs containing:
rss, feed, atom, xml, syndication, frontpage, newest, etc.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 14:30:40 -05:00

// extract-html queries local parquet files for text/html feeds
// and appends them to the TSV file for bulk import
package main

import (
	"bufio"
	"database/sql"
	"fmt"
	"os"
	"path/filepath"
	"time"

	_ "github.com/duckdb/duckdb-go/v2"
)

func main() {
	parquetDir := "cdx-data/parquet/CC-MAIN-2026-04"
	outputFile := "cdx-data/CC-MAIN-2026-04-html.tsv"

	// Find all parquet files
	matches, err := filepath.Glob(filepath.Join(parquetDir, "*.parquet"))
	if err != nil {
		fmt.Printf("Error finding parquet files: %v\n", err)
		os.Exit(1)
	}
	fmt.Printf("Found %d parquet files\n", len(matches))

	// Open output file
	f, err := os.Create(outputFile)
	if err != nil {
		fmt.Printf("Error creating output file: %v\n", err)
		os.Exit(1)
	}
	defer f.Close()
	writer := bufio.NewWriter(f)
	defer writer.Flush()

	startTime := time.Now()
	var totalFeeds int

	// Process each file
	for i, parquetFile := range matches {
		count, err := queryParquet(parquetFile, writer)
		if err != nil {
			fmt.Printf("[%d/%d] %s - error: %v\n", i+1, len(matches), filepath.Base(parquetFile), err)
			continue
		}
		totalFeeds += count

		elapsed := time.Since(startTime)
		rate := float64(i+1) / elapsed.Seconds() * 60
		fmt.Printf("[%d/%d] %s - %d feeds (total: %d, %.1f files/min)\n",
			i+1, len(matches), filepath.Base(parquetFile), count, totalFeeds, rate)
	}

	fmt.Printf("\nComplete: %d text/html feeds extracted to %s in %v\n",
		totalFeeds, outputFile, time.Since(startTime).Round(time.Second))
}
func queryParquet(parquetFile string, writer *bufio.Writer) (int, error) {
	db, err := sql.Open("duckdb", "")
	if err != nil {
		return 0, fmt.Errorf("duckdb open failed: %w", err)
	}
	defer db.Close()

	// Query for text/html records whose URL suggests the page might be a feed.
	// This filters each parquet file from ~7M rows down to hopefully a few thousand.
	query := fmt.Sprintf(`
		SELECT DISTINCT url, url_host_name, content_mime_detected
		FROM read_parquet('%s')
		WHERE fetch_status = 200
		  AND content_mime_detected = 'text/html'
		  AND (
		    -- URL path contains feed-related keywords
		    LOWER(url) LIKE '%%/rss%%'
		    OR LOWER(url) LIKE '%%/feed%%'
		    OR LOWER(url) LIKE '%%/atom%%'
		    OR LOWER(url) LIKE '%%.rss%%'
		    OR LOWER(url) LIKE '%%.xml%%'
		    OR LOWER(url) LIKE '%%.atom%%'
		    OR LOWER(url) LIKE '%%/feeds/%%'
		    OR LOWER(url) LIKE '%%/syndication%%'
		    OR LOWER(url) LIKE '%%format=rss%%'
		    OR LOWER(url) LIKE '%%format=atom%%'
		    OR LOWER(url) LIKE '%%type=rss%%'
		    OR LOWER(url) LIKE '%%/frontpage%%'
		    OR LOWER(url) LIKE '%%/newest%%'
		  )
	`, parquetFile)

	rows, err := db.Query(query)
	if err != nil {
		return 0, fmt.Errorf("query failed: %w", err)
	}
	defer rows.Close()

	count := 0
	for rows.Next() {
		var feedURL, host, mime string
		if err := rows.Scan(&feedURL, &host, &mime); err != nil {
			continue
		}
		fmt.Fprintf(writer, "%s\t%s\t%s\n", feedURL, host, mime)
		count++
	}
	return count, rows.Err()
}
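
As a quick aside (not part of the committed file), the same URL heuristic can be expressed as a plain Go predicate. Below is a minimal sketch that assumes only the pattern list from the query above; the program name, the looksLikeFeedURL helper, and the example.com URLs are illustrative, and it can be handy for spot-checking lines in the generated TSV.

// feedurlcheck is a standalone sketch that mirrors the SQL LIKE filter as a
// Go predicate; LIKE '%x%' on the lowercased URL is a plain substring match.
package main

import (
	"fmt"
	"strings"
)

// looksLikeFeedURL reports whether the lowercased URL contains any of the
// feed-like substrings the query matches.
func looksLikeFeedURL(rawURL string) bool {
	u := strings.ToLower(rawURL)
	for _, p := range []string{
		"/rss", "/feed", "/atom", ".rss", ".xml", ".atom",
		"/feeds/", "/syndication", "format=rss", "format=atom",
		"type=rss", "/frontpage", "/newest",
	} {
		if strings.Contains(u, p) {
			return true
		}
	}
	return false
}

func main() {
	for _, u := range []string{
		"https://example.com/blog/feed",             // matches "/feed"
		"https://example.com/index.php?format=rss",  // matches "format=rss"
		"https://example.com/about",                 // no feed-like token
	} {
		fmt.Printf("%v\t%s\n", looksLikeFeedURL(u), u)
	}
}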