177 lines
4.5 KiB
Go
177 lines
4.5 KiB
Go
package main
|
|
|
|
import (
|
|
"bufio"
|
|
"fmt"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// IANATLDListURL is the location of IANA's plain-text TLD list, which
// fetchOfficialTLDs downloads and reads line by line.
const IANATLDListURL = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
|
|
|
|
// fetchOfficialTLDs retrieves the current TLD list from IANA
|
|
func fetchOfficialTLDs() (map[string]bool, error) {
|
|
resp, err := http.Get(IANATLDListURL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to fetch TLD list: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
|
|
}
|
|
|
|
tlds := make(map[string]bool)
|
|
scanner := bufio.NewScanner(resp.Body)
|
|
for scanner.Scan() {
|
|
line := strings.TrimSpace(scanner.Text())
|
|
// Skip comments and empty lines
|
|
if line == "" || strings.HasPrefix(line, "#") {
|
|
continue
|
|
}
|
|
// IANA list is uppercase, convert to lowercase
|
|
tlds[strings.ToLower(line)] = true
|
|
}
|
|
|
|
if err := scanner.Err(); err != nil {
|
|
return nil, fmt.Errorf("error reading TLD list: %w", err)
|
|
}
|
|
|
|
return tlds, nil
|
|
}
|
|
|
|
// getEnumTLDs retrieves all TLDs currently in the tld_enum type
|
|
func (c *Crawler) getEnumTLDs() (map[string]bool, error) {
|
|
rows, err := c.db.Query(`
|
|
SELECT enumlabel FROM pg_enum
|
|
WHERE enumtypid = 'tld_enum'::regtype
|
|
`)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
tlds := make(map[string]bool)
|
|
for rows.Next() {
|
|
var tld string
|
|
if err := rows.Scan(&tld); err != nil {
|
|
continue
|
|
}
|
|
tlds[tld] = true
|
|
}
|
|
return tlds, rows.Err()
|
|
}
|
|
|
|
// syncTLDs fetches the official IANA TLD list and:
|
|
// 1. Marks domains/feeds with unofficial TLDs as 'dead'
|
|
// 2. Adds any new official TLDs to the enum
|
|
func (c *Crawler) syncTLDs() error {
|
|
fmt.Println("TLD sync: fetching official TLD list from IANA...")
|
|
|
|
officialTLDs, err := fetchOfficialTLDs()
|
|
if err != nil {
|
|
return fmt.Errorf("TLD sync failed: %w", err)
|
|
}
|
|
fmt.Printf("TLD sync: %d official TLDs from IANA\n", len(officialTLDs))
|
|
|
|
enumTLDs, err := c.getEnumTLDs()
|
|
if err != nil {
|
|
return fmt.Errorf("TLD sync: failed to get enum TLDs: %w", err)
|
|
}
|
|
fmt.Printf("TLD sync: %d TLDs in enum\n", len(enumTLDs))
|
|
|
|
// Find unofficial TLDs (in enum but not in IANA)
|
|
var unofficialTLDs []string
|
|
for tld := range enumTLDs {
|
|
if !officialTLDs[tld] {
|
|
unofficialTLDs = append(unofficialTLDs, tld)
|
|
}
|
|
}
|
|
|
|
if len(unofficialTLDs) > 0 {
|
|
fmt.Printf("TLD sync: %d unofficial TLDs found\n", len(unofficialTLDs))
|
|
|
|
// Mark feeds with unofficial TLDs as dead
|
|
for _, tld := range unofficialTLDs {
|
|
result, err := c.db.Exec(`
|
|
UPDATE feeds SET status = 'dead'
|
|
WHERE domain_tld = $1::tld_enum AND status != 'dead'
|
|
`, tld)
|
|
if err != nil {
|
|
fmt.Printf("TLD sync: error marking feeds for TLD %s: %v\n", tld, err)
|
|
} else if result > 0 {
|
|
fmt.Printf("TLD sync: marked %d feeds as dead for TLD .%s\n", result, tld)
|
|
}
|
|
}
|
|
|
|
// Mark domains with unofficial TLDs as dead
|
|
for _, tld := range unofficialTLDs {
|
|
result, err := c.db.Exec(`
|
|
UPDATE domains SET status = 'dead'
|
|
WHERE tld = $1::tld_enum AND status != 'dead'
|
|
`, tld)
|
|
if err != nil {
|
|
fmt.Printf("TLD sync: error marking domains for TLD %s: %v\n", tld, err)
|
|
} else if result > 0 {
|
|
fmt.Printf("TLD sync: marked %d domains as dead for TLD .%s\n", result, tld)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Find new TLDs (in IANA but not in enum)
|
|
var newTLDs []string
|
|
for tld := range officialTLDs {
|
|
if !enumTLDs[tld] {
|
|
newTLDs = append(newTLDs, tld)
|
|
}
|
|
}
|
|
|
|
if len(newTLDs) > 0 {
|
|
fmt.Printf("TLD sync: %d new TLDs to add\n", len(newTLDs))
|
|
|
|
// Add new TLDs to enum
|
|
for _, tld := range newTLDs {
|
|
_, err := c.db.Exec(fmt.Sprintf(`ALTER TYPE tld_enum ADD VALUE IF NOT EXISTS '%s'`, tld))
|
|
if err != nil {
|
|
fmt.Printf("TLD sync: error adding TLD %s: %v\n", tld, err)
|
|
} else {
|
|
fmt.Printf("TLD sync: added TLD .%s to enum\n", tld)
|
|
}
|
|
}
|
|
}
|
|
|
|
fmt.Println("TLD sync: complete")
|
|
return nil
|
|
}
|
|
|
|
// startTLDSyncLoop runs TLD sync on startup and daily at noon GMT
|
|
func (c *Crawler) startTLDSyncLoop() {
|
|
// Run immediately on startup
|
|
if err := c.syncTLDs(); err != nil {
|
|
fmt.Printf("TLD sync error: %v\n", err)
|
|
}
|
|
|
|
go func() {
|
|
for {
|
|
// Calculate time until next noon GMT
|
|
now := time.Now().UTC()
|
|
nextNoon := time.Date(now.Year(), now.Month(), now.Day(), 12, 0, 0, 0, time.UTC)
|
|
if now.After(nextNoon) {
|
|
// Already past noon today, schedule for tomorrow
|
|
nextNoon = nextNoon.Add(24 * time.Hour)
|
|
}
|
|
sleepDuration := nextNoon.Sub(now)
|
|
|
|
fmt.Printf("TLD sync: next sync at %s (in %s)\n", nextNoon.Format(time.RFC3339), sleepDuration.Round(time.Minute))
|
|
time.Sleep(sleepDuration)
|
|
|
|
if err := c.syncTLDs(); err != nil {
|
|
fmt.Printf("TLD sync error: %v\n", err)
|
|
}
|
|
}
|
|
}()
|
|
}
|