Files
crawler/tld.go
2026-02-01 23:52:29 -05:00

177 lines
4.5 KiB
Go

package main
import (
"bufio"
"fmt"
"net/http"
"strings"
"time"
)
// IANATLDListURL is the location of the plain-text TLD list published by
// IANA. fetchOfficialTLDs downloads and parses this document.
const IANATLDListURL = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
// fetchOfficialTLDs downloads the current TLD list from IANATLDListURL and
// returns it as a set keyed by lowercase TLD.
//
// Blank lines and lines beginning with "#" are skipped. An error is returned
// when the request fails or times out, when the server answers with anything
// other than 200 OK, or when the body cannot be read to the end.
func fetchOfficialTLDs() (map[string]bool, error) {
	// http.Get goes through http.DefaultClient, which has no timeout; a
	// stalled connection would block the caller forever. The client timeout
	// also bounds reading the response body.
	client := &http.Client{Timeout: 30 * time.Second}

	resp, err := client.Get(IANATLDListURL)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch TLD list: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
	}

	tlds := make(map[string]bool)
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		// Skip comments and empty lines.
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		// Entries are normalized to lowercase before being stored.
		tlds[strings.ToLower(line)] = true
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error reading TLD list: %w", err)
	}
	return tlds, nil
}
// getEnumTLDs returns the labels currently defined on the tld_enum type, as
// listed in pg_enum, in the form of a set.
//
// Any failure — running the query, scanning a row, or iterating the result —
// is returned to the caller and the set is nil, so a partially read enum is
// never mistaken for the complete one.
func (c *Crawler) getEnumTLDs() (map[string]bool, error) {
	rows, err := c.db.Query(`
SELECT enumlabel FROM pg_enum
WHERE enumtypid = 'tld_enum'::regtype
`)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	tlds := make(map[string]bool)
	for rows.Next() {
		var tld string
		// A scan failure used to be skipped silently, which made the
		// affected label look absent from the enum.
		if err := rows.Scan(&tld); err != nil {
			return nil, fmt.Errorf("scanning tld_enum label: %w", err)
		}
		tlds[tld] = true
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return tlds, nil
}
// syncTLDs reconciles the tld_enum type with the official IANA TLD list:
//  1. Feeds and domains whose TLD is in the enum but not in the IANA list
//     are marked 'dead'.
//  2. TLDs that are in the IANA list but not in the enum are added to it.
//
// An error is returned only when one of the two lists cannot be obtained or
// the IANA list is empty. Failures of individual UPDATE or ALTER statements
// are printed and the sync continues with the next TLD.
func (c *Crawler) syncTLDs() error {
	fmt.Println("TLD sync: fetching official TLD list from IANA...")
	officialTLDs, err := fetchOfficialTLDs()
	if err != nil {
		return fmt.Errorf("TLD sync failed: %w", err)
	}
	// An empty list would classify every enum TLD as unofficial and mark
	// every feed and domain dead, so refuse to act on it.
	if len(officialTLDs) == 0 {
		return fmt.Errorf("TLD sync failed: official TLD list is empty")
	}
	fmt.Printf("TLD sync: %d official TLDs from IANA\n", len(officialTLDs))

	enumTLDs, err := c.getEnumTLDs()
	if err != nil {
		return fmt.Errorf("TLD sync: failed to get enum TLDs: %w", err)
	}
	fmt.Printf("TLD sync: %d TLDs in enum\n", len(enumTLDs))

	// Find unofficial TLDs (in enum but not in IANA).
	var unofficialTLDs []string
	for tld := range enumTLDs {
		if !officialTLDs[tld] {
			unofficialTLDs = append(unofficialTLDs, tld)
		}
	}

	if len(unofficialTLDs) > 0 {
		fmt.Printf("TLD sync: %d unofficial TLDs found\n", len(unofficialTLDs))

		// Mark feeds with unofficial TLDs as dead.
		for _, tld := range unofficialTLDs {
			result, err := c.db.Exec(`
UPDATE feeds SET status = 'dead'
WHERE domain_tld = $1::tld_enum AND status != 'dead'
`, tld)
			if err != nil {
				fmt.Printf("TLD sync: error marking feeds for TLD %s: %v\n", tld, err)
			} else if result > 0 {
				fmt.Printf("TLD sync: marked %d feeds as dead for TLD .%s\n", result, tld)
			}
		}

		// Mark domains with unofficial TLDs as dead.
		for _, tld := range unofficialTLDs {
			result, err := c.db.Exec(`
UPDATE domains SET status = 'dead'
WHERE tld = $1::tld_enum AND status != 'dead'
`, tld)
			if err != nil {
				fmt.Printf("TLD sync: error marking domains for TLD %s: %v\n", tld, err)
			} else if result > 0 {
				fmt.Printf("TLD sync: marked %d domains as dead for TLD .%s\n", result, tld)
			}
		}
	}

	// Find new TLDs (in IANA but not in enum).
	var newTLDs []string
	for tld := range officialTLDs {
		if !enumTLDs[tld] {
			newTLDs = append(newTLDs, tld)
		}
	}

	if len(newTLDs) > 0 {
		fmt.Printf("TLD sync: %d new TLDs to add\n", len(newTLDs))

		// Add new TLDs to enum.
		for _, tld := range newTLDs {
			// The label comes from a remote document and is spliced into
			// the statement text below, so anything that is not a plain
			// DNS-style label is rejected first.
			if !isValidTLDLabel(tld) {
				fmt.Printf("TLD sync: skipping invalid TLD %q from IANA list\n", tld)
				continue
			}
			_, err := c.db.Exec(fmt.Sprintf(`ALTER TYPE tld_enum ADD VALUE IF NOT EXISTS '%s'`, tld))
			if err != nil {
				fmt.Printf("TLD sync: error adding TLD %s: %v\n", tld, err)
			} else {
				fmt.Printf("TLD sync: added TLD .%s to enum\n", tld)
			}
		}
	}

	fmt.Println("TLD sync: complete")
	return nil
}

// isValidTLDLabel reports whether s consists solely of the bytes a-z, 0-9
// and '-' and is between 1 and 63 bytes long, which makes it safe to embed
// inside a single-quoted SQL literal.
func isValidTLDLabel(s string) bool {
	if len(s) == 0 || len(s) > 63 {
		return false
	}
	for i := 0; i < len(s); i++ {
		ch := s[i]
		switch {
		case ch >= 'a' && ch <= 'z', ch >= '0' && ch <= '9', ch == '-':
			// allowed
		default:
			return false
		}
	}
	return true
}
// startTLDSyncLoop performs one TLD sync synchronously and then starts a
// goroutine that repeats the sync every day at 12:00 UTC. Sync errors are
// printed and never stop the loop.
func (c *Crawler) startTLDSyncLoop() {
	runSync := func() {
		if err := c.syncTLDs(); err != nil {
			fmt.Printf("TLD sync error: %v\n", err)
		}
	}

	// The first sync runs on the caller's goroutine, before the loop starts.
	runSync()

	go func() {
		for {
			// Work out how long to wait until the next 12:00 UTC.
			current := time.Now().UTC()
			year, month, day := current.Date()
			target := time.Date(year, month, day, 12, 0, 0, 0, time.UTC)
			if current.After(target) {
				// Today's noon has passed; aim for tomorrow's.
				target = target.Add(24 * time.Hour)
			}
			wait := target.Sub(current)

			fmt.Printf("TLD sync: next sync at %s (in %s)\n", target.Format(time.RFC3339), wait.Round(time.Minute))
			time.Sleep(wait)
			runSync()
		}
	}()
}