package main

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"
)

const (
	// IANATLDListURL is the location of IANA's plain-text list of all
	// delegated top-level domains (one uppercase TLD per line, with
	// '#'-prefixed comment lines).
	IANATLDListURL = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
)

// isValidTLDLabel reports whether s is a plausible, already-lowercased TLD
// label: 1-63 bytes of ASCII letters, digits or hyphens, neither starting nor
// ending with a hyphen. Besides catching garbage input, this guarantees the
// label contains no characters (quotes, whitespace, ...) that would be unsafe
// to embed in a SQL string literal.
func isValidTLDLabel(s string) bool {
	if len(s) == 0 || len(s) > 63 {
		return false
	}
	if s[0] == '-' || s[len(s)-1] == '-' {
		return false
	}
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case c >= 'a' && c <= 'z', c >= '0' && c <= '9', c == '-':
			// allowed character
		default:
			return false
		}
	}
	return true
}

// parseTLDList reads a TLD list in the IANA text format from r and returns
// the set of TLDs, lowercased. Blank lines and lines starting with '#' are
// ignored.
//
// It returns an error if reading fails, if any entry is not a valid TLD label,
// or if the list contains no entries at all. Being strict here matters: the
// caller marks everything missing from this set as dead, so a truncated body
// or an HTML error page must never be mistaken for the official list.
func parseTLDList(r io.Reader) (map[string]bool, error) {
	tlds := make(map[string]bool)
	scanner := bufio.NewScanner(r)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		// Skip comments and empty lines.
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		// The IANA list is uppercase; the rest of this file works in lowercase.
		tld := strings.ToLower(line)
		if !isValidTLDLabel(tld) {
			return nil, fmt.Errorf("unexpected entry %q in TLD list", line)
		}
		tlds[tld] = true
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error reading TLD list: %w", err)
	}
	if len(tlds) == 0 {
		return nil, errors.New("TLD list is empty")
	}
	return tlds, nil
}

// fetchOfficialTLDs retrieves the current TLD list from IANA and returns it
// as a set of lowercase TLD strings.
//
// The request is bounded by a timeout so that a stalled connection cannot
// block the caller indefinitely. A non-200 response, a read error, or a
// malformed/empty list is reported as an error.
func fetchOfficialTLDs() (map[string]bool, error) {
	const fetchTimeout = 30 * time.Second

	// http.Get uses the default client, which has no timeout.
	client := &http.Client{Timeout: fetchTimeout}
	resp, err := client.Get(IANATLDListURL)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch TLD list: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
	}
	return parseTLDList(resp.Body)
}

// getEnumTLDs retrieves all labels currently defined on the tld_enum type,
// as a set. Any query, scan or iteration error aborts the read and is
// returned; a partial set is never returned as if it were complete.
func (c *Crawler) getEnumTLDs() (map[string]bool, error) {
	rows, err := c.db.Query(`
		SELECT enumlabel
		FROM pg_enum
		WHERE enumtypid = 'tld_enum'::regtype
	`)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	tlds := make(map[string]bool)
	for rows.Next() {
		var tld string
		if err := rows.Scan(&tld); err != nil {
			return nil, fmt.Errorf("scanning tld_enum label: %w", err)
		}
		tlds[tld] = true
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return tlds, nil
}

// syncTLDs fetches the official IANA TLD list and:
//  1. Marks domains/feeds with unofficial TLDs as 'dead'
//  2. Adds any new official TLDs to the enum
//
// It returns an error only when the IANA list or the current enum labels
// cannot be obtained; in that case nothing is modified. Failures of the
// individual UPDATE / ALTER TYPE statements are logged and the sync continues
// with the remaining TLDs.
func (c *Crawler) syncTLDs() error {
	fmt.Println("TLD sync: fetching official TLD list from IANA...")

	officialTLDs, err := fetchOfficialTLDs()
	if err != nil {
		return fmt.Errorf("TLD sync failed: %w", err)
	}
	fmt.Printf("TLD sync: %d official TLDs from IANA\n", len(officialTLDs))

	enumTLDs, err := c.getEnumTLDs()
	if err != nil {
		return fmt.Errorf("TLD sync: failed to get enum TLDs: %w", err)
	}
	fmt.Printf("TLD sync: %d TLDs in enum\n", len(enumTLDs))

	// Find unofficial TLDs (in enum but not in IANA).
	var unofficialTLDs []string
	for tld := range enumTLDs {
		if !officialTLDs[tld] {
			unofficialTLDs = append(unofficialTLDs, tld)
		}
	}

	if len(unofficialTLDs) > 0 {
		fmt.Printf("TLD sync: %d unofficial TLDs found\n", len(unofficialTLDs))

		// Mark feeds with unofficial TLDs as dead.
		for _, tld := range unofficialTLDs {
			result, err := c.db.Exec(`
				UPDATE feeds
				SET status = 'dead'
				WHERE domain_tld = $1::tld_enum AND status != 'dead'
			`, tld)
			if err != nil {
				fmt.Printf("TLD sync: error marking feeds for TLD %s: %v\n", tld, err)
			} else if result > 0 {
				fmt.Printf("TLD sync: marked %d feeds as dead for TLD .%s\n", result, tld)
			}
		}

		// Mark domains with unofficial TLDs as dead.
		for _, tld := range unofficialTLDs {
			result, err := c.db.Exec(`
				UPDATE domains
				SET status = 'dead'
				WHERE tld = $1::tld_enum AND status != 'dead'
			`, tld)
			if err != nil {
				fmt.Printf("TLD sync: error marking domains for TLD %s: %v\n", tld, err)
			} else if result > 0 {
				fmt.Printf("TLD sync: marked %d domains as dead for TLD .%s\n", result, tld)
			}
		}
	}

	// Find new TLDs (in IANA but not in enum).
	var newTLDs []string
	for tld := range officialTLDs {
		if !enumTLDs[tld] {
			newTLDs = append(newTLDs, tld)
		}
	}

	if len(newTLDs) > 0 {
		fmt.Printf("TLD sync: %d new TLDs to add\n", len(newTLDs))

		// Add new TLDs to the enum. ALTER TYPE cannot take bind parameters,
		// so the label has to be interpolated into the statement; it is
		// re-validated here so nothing but [a-z0-9-] can ever reach the SQL
		// text, regardless of where the label came from.
		for _, tld := range newTLDs {
			if !isValidTLDLabel(tld) {
				fmt.Printf("TLD sync: skipping invalid TLD %q\n", tld)
				continue
			}
			_, err := c.db.Exec(fmt.Sprintf(`ALTER TYPE tld_enum ADD VALUE IF NOT EXISTS '%s'`, tld))
			if err != nil {
				fmt.Printf("TLD sync: error adding TLD %s: %v\n", tld, err)
			} else {
				fmt.Printf("TLD sync: added TLD .%s to enum\n", tld)
			}
		}
	}

	fmt.Println("TLD sync: complete")
	return nil
}

// startTLDSyncLoop runs TLD sync on startup and daily at noon GMT.
//
// The initial sync runs synchronously in the caller's goroutine; the daily
// syncs run in a background goroutine. Sync errors are logged, not returned.
//
// NOTE(review): the background goroutine has no stop/cancel mechanism and
// runs until the process exits. If Crawler ever needs a clean shutdown, this
// loop should take a context.
func (c *Crawler) startTLDSyncLoop() {
	// Run immediately on startup.
	if err := c.syncTLDs(); err != nil {
		fmt.Printf("TLD sync error: %v\n", err)
	}

	go func() {
		for {
			// Calculate time until next noon GMT.
			now := time.Now().UTC()
			nextNoon := time.Date(now.Year(), now.Month(), now.Day(), 12, 0, 0, 0, time.UTC)
			if now.After(nextNoon) {
				// Already past noon today, schedule for tomorrow.
				nextNoon = nextNoon.Add(24 * time.Hour)
			}

			sleepDuration := nextNoon.Sub(now)
			fmt.Printf("TLD sync: next sync at %s (in %s)\n",
				nextNoon.Format(time.RFC3339), sleepDuration.Round(time.Minute))

			time.Sleep(sleepDuration)

			if err := c.syncTLDs(); err != nil {
				fmt.Printf("TLD sync error: %v\n", err)
			}
		}
	}()
}