Enable all TLDs for import and auto-deny spam domains

- Remove .com-only filter from vertices import
- Add shouldAutoDenyDomain() to detect spam patterns (^[0-9]-)
- Apply auto-deny to all domain insert paths (save, saveTx, bulk import)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
primal
2026-01-29 12:49:52 -05:00
parent 254b751799
commit f780c493c2
+43 -13
View File
@@ -25,8 +25,23 @@ type Domain struct {
TLD string `json:"tld,omitempty"` TLD string `json:"tld,omitempty"`
} }
// shouldAutoDenyDomain reports whether host matches an auto-deny spam pattern.
//
// The only pattern currently recognised is a single leading ASCII digit
// immediately followed by a dash (e.g. "0-example.com"). Hosts shorter than
// two bytes never match.
func shouldAutoDenyDomain(host string) bool {
	// Need at least two bytes to inspect the "<digit>-" prefix.
	if len(host) < 2 {
		return false
	}

	first, second := host[0], host[1]
	startsWithDigit := '0' <= first && first <= '9'
	return startsWithDigit && second == '-'
}
// saveDomain stores a domain in PostgreSQL // saveDomain stores a domain in PostgreSQL
func (c *Crawler) saveDomain(domain *Domain) error { func (c *Crawler) saveDomain(domain *Domain) error {
// Auto-deny domains matching spam patterns
status := domain.Status
if shouldAutoDenyDomain(domain.Host) {
status = "denied"
}
_, err := c.db.Exec(` _, err := c.db.Exec(`
INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld) INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld)
VALUES ($1, $2, $3, $4, $5, $6, $7) VALUES ($1, $2, $3, $4, $5, $6, $7)
@@ -36,18 +51,24 @@ func (c *Crawler) saveDomain(domain *Domain) error {
feeds_found = EXCLUDED.feeds_found, feeds_found = EXCLUDED.feeds_found,
last_error = EXCLUDED.last_error, last_error = EXCLUDED.last_error,
tld = EXCLUDED.tld tld = EXCLUDED.tld
`, domain.Host, domain.Status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt), `, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
domain.FeedsFound, NullableString(domain.LastError), domain.TLD) domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
return err return err
} }
// saveDomainTx stores a domain using a transaction // saveDomainTx stores a domain using a transaction
func (c *Crawler) saveDomainTx(tx pgx.Tx, domain *Domain) error { func (c *Crawler) saveDomainTx(tx pgx.Tx, domain *Domain) error {
// Auto-deny domains matching spam patterns
status := domain.Status
if shouldAutoDenyDomain(domain.Host) {
status = "denied"
}
_, err := tx.Exec(context.Background(), ` _, err := tx.Exec(context.Background(), `
INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld) INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld)
VALUES ($1, $2, $3, $4, $5, $6, $7) VALUES ($1, $2, $3, $4, $5, $6, $7)
ON CONFLICT(host) DO NOTHING ON CONFLICT(host) DO NOTHING
`, domain.Host, domain.Status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt), `, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
domain.FeedsFound, NullableString(domain.LastError), domain.TLD) domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
return err return err
} }
@@ -242,10 +263,7 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
if reverseHostName != "" { if reverseHostName != "" {
host := normalizeHost(reverseHost(reverseHostName)) host := normalizeHost(reverseHost(reverseHostName))
tld := getTLD(host) tld := getTLD(host)
// Only import .com domains for now domains = append(domains, domainEntry{host: host, tld: tld})
if tld == "com" {
domains = append(domains, domainEntry{host: host, tld: tld})
}
} }
} }
} }
@@ -262,10 +280,14 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
break break
} }
// Build rows for copy // Build rows for copy, applying auto-deny for spam patterns
rows := make([][]interface{}, len(domains)) rows := make([][]interface{}, len(domains))
for i, d := range domains { for i, d := range domains {
rows[i] = []interface{}{d.host, "unchecked", now, d.tld} status := "unchecked"
if shouldAutoDenyDomain(d.host) {
status = "denied"
}
rows[i] = []interface{}{d.host, status, now, d.tld}
} }
// Use CopyFrom for bulk insert // Use CopyFrom for bulk insert
@@ -280,11 +302,15 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
if err != nil { if err != nil {
// Fall back to individual inserts with ON CONFLICT // Fall back to individual inserts with ON CONFLICT
for _, d := range domains { for _, d := range domains {
status := "unchecked"
if shouldAutoDenyDomain(d.host) {
status = "denied"
}
c.db.Exec(` c.db.Exec(`
INSERT INTO domains (host, status, discovered_at, tld) INSERT INTO domains (host, status, discovered_at, tld)
VALUES ($1, 'unchecked', $2, $3) VALUES ($1, $2, $3, $4)
ON CONFLICT(host) DO NOTHING ON CONFLICT(host) DO NOTHING
`, d.host, now, d.tld) `, d.host, status, now, d.tld)
} }
imported = int64(len(domains)) imported = int64(len(domains))
} }
@@ -361,13 +387,17 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
break break
} }
// Insert with ON CONFLICT // Insert with ON CONFLICT, applying auto-deny for spam patterns
for _, d := range domains { for _, d := range domains {
status := "unchecked"
if shouldAutoDenyDomain(d.host) {
status = "denied"
}
result, err := c.db.Exec(` result, err := c.db.Exec(`
INSERT INTO domains (host, status, discovered_at, tld) INSERT INTO domains (host, status, discovered_at, tld)
VALUES ($1, 'unchecked', $2, $3) VALUES ($1, $2, $3, $4)
ON CONFLICT(host) DO NOTHING ON CONFLICT(host) DO NOTHING
`, d.host, now, d.tld) `, d.host, status, now, d.tld)
if err != nil { if err != nil {
skipped++ skipped++
} else if result > 0 { } else if result > 0 {