From f780c493c218cd17e266a80ab41ebaa5073b8387 Mon Sep 17 00:00:00 2001
From: primal
Date: Thu, 29 Jan 2026 12:49:52 -0500
Subject: [PATCH] Enable all TLDs for import and auto-deny spam domains

- Remove .com-only filter from vertices import
- Add shouldAutoDenyDomain() to detect spam patterns (^[0-9]-)
- Apply auto-deny to all domain insert paths (save, saveTx, bulk import)

Co-Authored-By: Claude Opus 4.5
---
Review notes (commentary only; not part of the commit message):
* NOTE(review): line breaks and indentation below were reconstructed from a
  whitespace-collapsed copy of this patch; hunk line counts match the headers,
  but verify leading whitespace against the original commit before applying.
* The "unchecked"/"denied" selection is repeated at three import sites and the
  domain.Status override at two save sites; a shared helper would keep them in
  sync.
* In the CopyFrom fallback loop the results of c.db.Exec are discarded and
  imported is then set to len(domains), so rows skipped by ON CONFLICT or
  failed inserts are still counted. Presumably pre-existing; confirm intent.
* saveDomainTx and both import paths use ON CONFLICT(host) DO NOTHING, so rows
  already present keep their current status; TODO confirm whether existing
  digit-dash hosts need a separate backfill to "denied".

 domain.go | 56 +++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 43 insertions(+), 13 deletions(-)

diff --git a/domain.go b/domain.go
index 95ae455..8c6b6ba 100644
--- a/domain.go
+++ b/domain.go
@@ -25,8 +25,23 @@ type Domain struct {
 	TLD string `json:"tld,omitempty"`
 }
 
+// shouldAutoDenyDomain reports whether host begins with an ASCII digit immediately followed by '-'.
+func shouldAutoDenyDomain(host string) bool {
+	// Matches the spam pattern ^[0-9]- (e.g., "0-example.com"); hosts shorter than two bytes never match.
+	if len(host) >= 2 && host[0] >= '0' && host[0] <= '9' && host[1] == '-' {
+		return true
+	}
+	return false
+}
+
 // saveDomain stores a domain in PostgreSQL
 func (c *Crawler) saveDomain(domain *Domain) error {
+	// Store "denied" instead of domain.Status when the host matches a spam pattern; only the local copy is changed.
+	status := domain.Status
+	if shouldAutoDenyDomain(domain.Host) {
+		status = "denied"
+	}
+
 	_, err := c.db.Exec(`
 		INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld)
 		VALUES ($1, $2, $3, $4, $5, $6, $7)
@@ -36,18 +51,24 @@ func (c *Crawler) saveDomain(domain *Domain) error {
 			feeds_found = EXCLUDED.feeds_found,
 			last_error = EXCLUDED.last_error,
 			tld = EXCLUDED.tld
-	`, domain.Host, domain.Status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
+	`, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
 		domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
 	return err
 }
 
 // saveDomainTx stores a domain using a transaction
 func (c *Crawler) saveDomainTx(tx pgx.Tx, domain *Domain) error {
+	// Store "denied" instead of domain.Status when the host matches a spam pattern; only the local copy is changed.
+	status := domain.Status
+	if shouldAutoDenyDomain(domain.Host) {
+		status = "denied"
+	}
+
 	_, err := tx.Exec(context.Background(), `
 		INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld)
 		VALUES ($1, $2, $3, $4, $5, $6, $7)
 		ON CONFLICT(host) DO NOTHING
-	`, domain.Host, domain.Status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
+	`, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
 		domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
 	return err
 }
@@ -242,10 +263,7 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
 				if reverseHostName != "" {
 					host := normalizeHost(reverseHost(reverseHostName))
 					tld := getTLD(host)
-					// Only import .com domains for now
-					if tld == "com" {
-						domains = append(domains, domainEntry{host: host, tld: tld})
-					}
+					domains = append(domains, domainEntry{host: host, tld: tld})
 				}
 			}
 		}
@@ -262,10 +280,14 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
 			break
 		}
 
-		// Build rows for copy
+		// Build one row per domain for the bulk copy: "denied" if the host matches a spam pattern, otherwise "unchecked".
 		rows := make([][]interface{}, len(domains))
 		for i, d := range domains {
-			rows[i] = []interface{}{d.host, "unchecked", now, d.tld}
+			status := "unchecked"
+			if shouldAutoDenyDomain(d.host) {
+				status = "denied"
+			}
+			rows[i] = []interface{}{d.host, status, now, d.tld}
 		}
 
 		// Use CopyFrom for bulk insert
@@ -280,11 +302,15 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
 		if err != nil {
 			// Fall back to individual inserts with ON CONFLICT
 			for _, d := range domains {
+				status := "unchecked"
+				if shouldAutoDenyDomain(d.host) {
+					status = "denied"
+				}
 				c.db.Exec(`
 					INSERT INTO domains (host, status, discovered_at, tld)
-					VALUES ($1, 'unchecked', $2, $3)
+					VALUES ($1, $2, $3, $4)
 					ON CONFLICT(host) DO NOTHING
-				`, d.host, now, d.tld)
+				`, d.host, status, now, d.tld)
 			}
 			imported = int64(len(domains))
 		}
@@ -361,13 +387,17 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
 			break
 		}
 
-		// Insert with ON CONFLICT
+		// Insert each domain with ON CONFLICT(host) DO NOTHING, as "denied" if the host matches a spam pattern, otherwise "unchecked".
 		for _, d := range domains {
+			status := "unchecked"
+			if shouldAutoDenyDomain(d.host) {
+				status = "denied"
+			}
 			result, err := c.db.Exec(`
 				INSERT INTO domains (host, status, discovered_at, tld)
-				VALUES ($1, 'unchecked', $2, $3)
+				VALUES ($1, $2, $3, $4)
 				ON CONFLICT(host) DO NOTHING
-			`, d.host, now, d.tld)
+			`, d.host, status, now, d.tld)
 			if err != nil {
 				skipped++
 			} else if result > 0 {