Enable all TLDs for import and auto-deny spam domains

- Remove .com-only filter from vertices import
- Add shouldAutoDenyDomain() to detect spam patterns (^[0-9]-)
- Apply auto-deny to all domain insert paths (save, saveTx, bulk import)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
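For reference, a minimal standalone sketch of the behaviour the new helper is meant to have. The function body is taken from the diff below; the package layout, main function, and sample hosts are illustrative only:

package main

import "fmt"

// shouldAutoDenyDomain mirrors the helper introduced in this commit:
// it flags hosts matching the ^[0-9]- spam pattern (a leading digit
// followed by a dash), e.g. "0-example.com".
func shouldAutoDenyDomain(host string) bool {
	if len(host) >= 2 && host[0] >= '0' && host[0] <= '9' && host[1] == '-' {
		return true
	}
	return false
}

func main() {
	for _, host := range []string{"0-example.com", "9-spam.net", "example.com", "a-b.org"} {
		fmt.Printf("%-15s auto-deny=%v\n", host, shouldAutoDenyDomain(host))
	}
}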
@@ -25,8 +25,23 @@ type Domain struct {
 	TLD string `json:"tld,omitempty"`
 }
 
+// shouldAutoDenyDomain checks if a domain should be auto-denied based on patterns
+func shouldAutoDenyDomain(host string) bool {
+	// Deny domains starting with digit followed by dash (e.g., "0-example.com")
+	if len(host) >= 2 && host[0] >= '0' && host[0] <= '9' && host[1] == '-' {
+		return true
+	}
+	return false
+}
+
 // saveDomain stores a domain in PostgreSQL
 func (c *Crawler) saveDomain(domain *Domain) error {
+	// Auto-deny domains matching spam patterns
+	status := domain.Status
+	if shouldAutoDenyDomain(domain.Host) {
+		status = "denied"
+	}
+
 	_, err := c.db.Exec(`
 		INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld)
 		VALUES ($1, $2, $3, $4, $5, $6, $7)
@@ -36,18 +51,24 @@ func (c *Crawler) saveDomain(domain *Domain) error {
 			feeds_found = EXCLUDED.feeds_found,
 			last_error = EXCLUDED.last_error,
 			tld = EXCLUDED.tld
-	`, domain.Host, domain.Status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
+	`, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
 		domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
 	return err
 }
 
 // saveDomainTx stores a domain using a transaction
 func (c *Crawler) saveDomainTx(tx pgx.Tx, domain *Domain) error {
+	// Auto-deny domains matching spam patterns
+	status := domain.Status
+	if shouldAutoDenyDomain(domain.Host) {
+		status = "denied"
+	}
+
 	_, err := tx.Exec(context.Background(), `
 		INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld)
 		VALUES ($1, $2, $3, $4, $5, $6, $7)
 		ON CONFLICT(host) DO NOTHING
-	`, domain.Host, domain.Status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
+	`, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
 		domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
 	return err
 }
@@ -242,10 +263,7 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
 			if reverseHostName != "" {
 				host := normalizeHost(reverseHost(reverseHostName))
 				tld := getTLD(host)
-				// Only import .com domains for now
-				if tld == "com" {
-					domains = append(domains, domainEntry{host: host, tld: tld})
-				}
+				domains = append(domains, domainEntry{host: host, tld: tld})
 			}
 		}
 	}
@@ -262,10 +280,14 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
 			break
 		}
 
-		// Build rows for copy
+		// Build rows for copy, applying auto-deny for spam patterns
 		rows := make([][]interface{}, len(domains))
 		for i, d := range domains {
-			rows[i] = []interface{}{d.host, "unchecked", now, d.tld}
+			status := "unchecked"
+			if shouldAutoDenyDomain(d.host) {
+				status = "denied"
+			}
+			rows[i] = []interface{}{d.host, status, now, d.tld}
 		}
 
 		// Use CopyFrom for bulk insert
@@ -280,11 +302,15 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
 		if err != nil {
 			// Fall back to individual inserts with ON CONFLICT
 			for _, d := range domains {
+				status := "unchecked"
+				if shouldAutoDenyDomain(d.host) {
+					status = "denied"
+				}
 				c.db.Exec(`
 					INSERT INTO domains (host, status, discovered_at, tld)
-					VALUES ($1, 'unchecked', $2, $3)
+					VALUES ($1, $2, $3, $4)
 					ON CONFLICT(host) DO NOTHING
-				`, d.host, now, d.tld)
+				`, d.host, status, now, d.tld)
 			}
 			imported = int64(len(domains))
 		}
@@ -361,13 +387,17 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
 			break
 		}
 
-		// Insert with ON CONFLICT
+		// Insert with ON CONFLICT, applying auto-deny for spam patterns
 		for _, d := range domains {
+			status := "unchecked"
+			if shouldAutoDenyDomain(d.host) {
+				status = "denied"
+			}
 			result, err := c.db.Exec(`
 				INSERT INTO domains (host, status, discovered_at, tld)
-				VALUES ($1, 'unchecked', $2, $3)
+				VALUES ($1, $2, $3, $4)
 				ON CONFLICT(host) DO NOTHING
-			`, d.host, now, d.tld)
+			`, d.host, status, now, d.tld)
 			if err != nil {
 				skipped++
 			} else if result > 0 {
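All three insert paths above apply the same check before a row is written. As an aside, the ^[0-9]- pattern quoted in the commit message could also be expressed as a compiled regexp; the following is only an equivalent sketch for comparison, not code from this commit:

package main

import (
	"fmt"
	"regexp"
)

// autoDenyPattern is an illustrative regexp form of the same check the
// commit implements with byte comparisons: a leading digit followed by a dash.
var autoDenyPattern = regexp.MustCompile(`^[0-9]-`)

func shouldAutoDenyDomainRegexp(host string) bool {
	return autoDenyPattern.MatchString(host)
}

func main() {
	fmt.Println(shouldAutoDenyDomainRegexp("0-example.com")) // true
	fmt.Println(shouldAutoDenyDomainRegexp("example.com"))   // false
}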