From 211812363aa19493f930c971a485d4327067be73 Mon Sep 17 00:00:00 2001 From: primal Date: Sun, 1 Feb 2026 19:07:43 -0500 Subject: [PATCH] Add TLD sync loop for IANA TLD updates Co-Authored-By: Claude Opus 4.5 --- tld.go | 176 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 tld.go diff --git a/tld.go b/tld.go new file mode 100644 index 0000000..0f6ee2b --- /dev/null +++ b/tld.go @@ -0,0 +1,176 @@ +package main + +import ( + "bufio" + "fmt" + "net/http" + "strings" + "time" +) + +const ( + IANATLDListURL = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt" +) + +// fetchOfficialTLDs retrieves the current TLD list from IANA +func fetchOfficialTLDs() (map[string]bool, error) { + resp, err := http.Get(IANATLDListURL) + if err != nil { + return nil, fmt.Errorf("failed to fetch TLD list: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode) + } + + tlds := make(map[string]bool) + scanner := bufio.NewScanner(resp.Body) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + // Skip comments and empty lines + if line == "" || strings.HasPrefix(line, "#") { + continue + } + // IANA list is uppercase, convert to lowercase + tlds[strings.ToLower(line)] = true + } + + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("error reading TLD list: %w", err) + } + + return tlds, nil +} + +// getEnumTLDs retrieves all TLDs currently in the tld_enum type +func (c *Crawler) getEnumTLDs() (map[string]bool, error) { + rows, err := c.db.Query(` + SELECT enumlabel FROM pg_enum + WHERE enumtypid = 'tld_enum'::regtype + `) + if err != nil { + return nil, err + } + defer rows.Close() + + tlds := make(map[string]bool) + for rows.Next() { + var tld string + if err := rows.Scan(&tld); err != nil { + continue + } + tlds[tld] = true + } + return tlds, rows.Err() +} + +// syncTLDs fetches the official IANA TLD list and: +// 1. Marks domains/feeds with unofficial TLDs as 'dead' +// 2. Adds any new official TLDs to the enum +func (c *Crawler) syncTLDs() error { + fmt.Println("TLD sync: fetching official TLD list from IANA...") + + officialTLDs, err := fetchOfficialTLDs() + if err != nil { + return fmt.Errorf("TLD sync failed: %w", err) + } + fmt.Printf("TLD sync: %d official TLDs from IANA\n", len(officialTLDs)) + + enumTLDs, err := c.getEnumTLDs() + if err != nil { + return fmt.Errorf("TLD sync: failed to get enum TLDs: %w", err) + } + fmt.Printf("TLD sync: %d TLDs in enum\n", len(enumTLDs)) + + // Find unofficial TLDs (in enum but not in IANA) + var unofficialTLDs []string + for tld := range enumTLDs { + if !officialTLDs[tld] { + unofficialTLDs = append(unofficialTLDs, tld) + } + } + + if len(unofficialTLDs) > 0 { + fmt.Printf("TLD sync: %d unofficial TLDs found\n", len(unofficialTLDs)) + + // Mark feeds with unofficial TLDs as dead + for _, tld := range unofficialTLDs { + result, err := c.db.Exec(` + UPDATE feeds SET status = 'dead' + WHERE tld = $1 AND status != 'dead' + `, tld) + if err != nil { + fmt.Printf("TLD sync: error marking feeds for TLD %s: %v\n", tld, err) + } else if result > 0 { + fmt.Printf("TLD sync: marked %d feeds as dead for TLD .%s\n", result, tld) + } + } + + // Mark domains with unofficial TLDs as dead + for _, tld := range unofficialTLDs { + result, err := c.db.Exec(` + UPDATE domains SET status = 'dead' + WHERE tld = $1::tld_enum AND status != 'dead' + `, tld) + if err != nil { + fmt.Printf("TLD sync: error marking domains for TLD %s: %v\n", tld, err) + } else if result > 0 { + fmt.Printf("TLD sync: marked %d domains as dead for TLD .%s\n", result, tld) + } + } + } + + // Find new TLDs (in IANA but not in enum) + var newTLDs []string + for tld := range officialTLDs { + if !enumTLDs[tld] { + newTLDs = append(newTLDs, tld) + } + } + + if len(newTLDs) > 0 { + fmt.Printf("TLD sync: %d new TLDs to add\n", len(newTLDs)) + + // Add new TLDs to enum + for _, tld := range newTLDs { + _, err := c.db.Exec(fmt.Sprintf(`ALTER TYPE tld_enum ADD VALUE IF NOT EXISTS '%s'`, tld)) + if err != nil { + fmt.Printf("TLD sync: error adding TLD %s: %v\n", tld, err) + } else { + fmt.Printf("TLD sync: added TLD .%s to enum\n", tld) + } + } + } + + fmt.Println("TLD sync: complete") + return nil +} + +// startTLDSyncLoop runs TLD sync on startup and daily at noon GMT +func (c *Crawler) startTLDSyncLoop() { + // Run immediately on startup + if err := c.syncTLDs(); err != nil { + fmt.Printf("TLD sync error: %v\n", err) + } + + go func() { + for { + // Calculate time until next noon GMT + now := time.Now().UTC() + nextNoon := time.Date(now.Year(), now.Month(), now.Day(), 12, 0, 0, 0, time.UTC) + if now.After(nextNoon) { + // Already past noon today, schedule for tomorrow + nextNoon = nextNoon.Add(24 * time.Hour) + } + sleepDuration := nextNoon.Sub(now) + + fmt.Printf("TLD sync: next sync at %s (in %s)\n", nextNoon.Format(time.RFC3339), sleepDuration.Round(time.Minute)) + time.Sleep(sleepDuration) + + if err := c.syncTLDs(); err != nil { + fmt.Printf("TLD sync error: %v\n", err) + } + } + }() +}