Add TLD sync loop for IANA TLD updates
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,176 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
const (
|
||||
// IANATLDListURL is the URL that fetchOfficialTLDs downloads the TLD list from.
IANATLDListURL = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
|
||||
)
|
||||
|
||||
// fetchOfficialTLDs retrieves the current TLD list from IANA
|
||||
func fetchOfficialTLDs() (map[string]bool, error) {
|
||||
resp, err := http.Get(IANATLDListURL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch TLD list: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
tlds := make(map[string]bool)
|
||||
scanner := bufio.NewScanner(resp.Body)
|
||||
for scanner.Scan() {
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
// Skip comments and empty lines
|
||||
if line == "" || strings.HasPrefix(line, "#") {
|
||||
continue
|
||||
}
|
||||
// IANA list is uppercase, convert to lowercase
|
||||
tlds[strings.ToLower(line)] = true
|
||||
}
|
||||
|
||||
if err := scanner.Err(); err != nil {
|
||||
return nil, fmt.Errorf("error reading TLD list: %w", err)
|
||||
}
|
||||
|
||||
return tlds, nil
|
||||
}
|
||||
|
||||
// getEnumTLDs retrieves all TLDs currently in the tld_enum type
|
||||
func (c *Crawler) getEnumTLDs() (map[string]bool, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT enumlabel FROM pg_enum
|
||||
WHERE enumtypid = 'tld_enum'::regtype
|
||||
`)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
tlds := make(map[string]bool)
|
||||
for rows.Next() {
|
||||
var tld string
|
||||
if err := rows.Scan(&tld); err != nil {
|
||||
continue
|
||||
}
|
||||
tlds[tld] = true
|
||||
}
|
||||
return tlds, rows.Err()
|
||||
}
|
||||
|
||||
// syncTLDs fetches the official IANA TLD list and:
|
||||
// 1. Marks domains/feeds with unofficial TLDs as 'dead'
|
||||
// 2. Adds any new official TLDs to the enum
|
||||
func (c *Crawler) syncTLDs() error {
|
||||
fmt.Println("TLD sync: fetching official TLD list from IANA...")
|
||||
|
||||
officialTLDs, err := fetchOfficialTLDs()
|
||||
if err != nil {
|
||||
return fmt.Errorf("TLD sync failed: %w", err)
|
||||
}
|
||||
fmt.Printf("TLD sync: %d official TLDs from IANA\n", len(officialTLDs))
|
||||
|
||||
enumTLDs, err := c.getEnumTLDs()
|
||||
if err != nil {
|
||||
return fmt.Errorf("TLD sync: failed to get enum TLDs: %w", err)
|
||||
}
|
||||
fmt.Printf("TLD sync: %d TLDs in enum\n", len(enumTLDs))
|
||||
|
||||
// Find unofficial TLDs (in enum but not in IANA)
|
||||
var unofficialTLDs []string
|
||||
for tld := range enumTLDs {
|
||||
if !officialTLDs[tld] {
|
||||
unofficialTLDs = append(unofficialTLDs, tld)
|
||||
}
|
||||
}
|
||||
|
||||
if len(unofficialTLDs) > 0 {
|
||||
fmt.Printf("TLD sync: %d unofficial TLDs found\n", len(unofficialTLDs))
|
||||
|
||||
// Mark feeds with unofficial TLDs as dead
|
||||
for _, tld := range unofficialTLDs {
|
||||
result, err := c.db.Exec(`
|
||||
UPDATE feeds SET status = 'dead'
|
||||
WHERE tld = $1 AND status != 'dead'
|
||||
`, tld)
|
||||
if err != nil {
|
||||
fmt.Printf("TLD sync: error marking feeds for TLD %s: %v\n", tld, err)
|
||||
} else if result > 0 {
|
||||
fmt.Printf("TLD sync: marked %d feeds as dead for TLD .%s\n", result, tld)
|
||||
}
|
||||
}
|
||||
|
||||
// Mark domains with unofficial TLDs as dead
|
||||
for _, tld := range unofficialTLDs {
|
||||
result, err := c.db.Exec(`
|
||||
UPDATE domains SET status = 'dead'
|
||||
WHERE tld = $1::tld_enum AND status != 'dead'
|
||||
`, tld)
|
||||
if err != nil {
|
||||
fmt.Printf("TLD sync: error marking domains for TLD %s: %v\n", tld, err)
|
||||
} else if result > 0 {
|
||||
fmt.Printf("TLD sync: marked %d domains as dead for TLD .%s\n", result, tld)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Find new TLDs (in IANA but not in enum)
|
||||
var newTLDs []string
|
||||
for tld := range officialTLDs {
|
||||
if !enumTLDs[tld] {
|
||||
newTLDs = append(newTLDs, tld)
|
||||
}
|
||||
}
|
||||
|
||||
if len(newTLDs) > 0 {
|
||||
fmt.Printf("TLD sync: %d new TLDs to add\n", len(newTLDs))
|
||||
|
||||
// Add new TLDs to enum
|
||||
for _, tld := range newTLDs {
|
||||
_, err := c.db.Exec(fmt.Sprintf(`ALTER TYPE tld_enum ADD VALUE IF NOT EXISTS '%s'`, tld))
|
||||
if err != nil {
|
||||
fmt.Printf("TLD sync: error adding TLD %s: %v\n", tld, err)
|
||||
} else {
|
||||
fmt.Printf("TLD sync: added TLD .%s to enum\n", tld)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Println("TLD sync: complete")
|
||||
return nil
|
||||
}
|
||||
|
||||
// startTLDSyncLoop runs TLD sync on startup and daily at noon GMT
|
||||
func (c *Crawler) startTLDSyncLoop() {
|
||||
// Run immediately on startup
|
||||
if err := c.syncTLDs(); err != nil {
|
||||
fmt.Printf("TLD sync error: %v\n", err)
|
||||
}
|
||||
|
||||
go func() {
|
||||
for {
|
||||
// Calculate time until next noon GMT
|
||||
now := time.Now().UTC()
|
||||
nextNoon := time.Date(now.Year(), now.Month(), now.Day(), 12, 0, 0, 0, time.UTC)
|
||||
if now.After(nextNoon) {
|
||||
// Already past noon today, schedule for tomorrow
|
||||
nextNoon = nextNoon.Add(24 * time.Hour)
|
||||
}
|
||||
sleepDuration := nextNoon.Sub(now)
|
||||
|
||||
fmt.Printf("TLD sync: next sync at %s (in %s)\n", nextNoon.Format(time.RFC3339), sleepDuration.Round(time.Minute))
|
||||
time.Sleep(sleepDuration)
|
||||
|
||||
if err := c.syncTLDs(); err != nil {
|
||||
fmt.Printf("TLD sync error: %v\n", err)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
Reference in New Issue
Block a user