Enable all TLDs for import and auto-deny spam domains
- Remove .com-only filter from vertices import
- Add shouldAutoDenyDomain() to detect spam patterns (^[0-9]-)
- Apply auto-deny to all domain insert paths (save, saveTx, bulk import)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -25,8 +25,23 @@ type Domain struct {
|
|||||||
TLD string `json:"tld,omitempty"`
|
TLD string `json:"tld,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// shouldAutoDenyDomain reports whether host matches an auto-deny spam pattern.
//
// The only pattern recognised is a single leading ASCII digit immediately
// followed by a dash (e.g. "0-example.com"). Only the first two bytes of host
// are inspected; hosts shorter than two bytes never match.
func shouldAutoDenyDomain(host string) bool {
	// Guard first so the byte indexing below cannot go out of range.
	if len(host) < 2 {
		return false
	}
	first, second := host[0], host[1]
	leadingDigit := '0' <= first && first <= '9'
	return leadingDigit && second == '-'
}
|
||||||
|
|
||||||
// saveDomain stores a domain in PostgreSQL
|
// saveDomain stores a domain in PostgreSQL
|
||||||
func (c *Crawler) saveDomain(domain *Domain) error {
|
func (c *Crawler) saveDomain(domain *Domain) error {
|
||||||
|
// Auto-deny domains matching spam patterns
|
||||||
|
status := domain.Status
|
||||||
|
if shouldAutoDenyDomain(domain.Host) {
|
||||||
|
status = "denied"
|
||||||
|
}
|
||||||
|
|
||||||
_, err := c.db.Exec(`
|
_, err := c.db.Exec(`
|
||||||
INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld)
|
INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld)
|
||||||
VALUES ($1, $2, $3, $4, $5, $6, $7)
|
VALUES ($1, $2, $3, $4, $5, $6, $7)
|
||||||
@@ -36,18 +51,24 @@ func (c *Crawler) saveDomain(domain *Domain) error {
|
|||||||
feeds_found = EXCLUDED.feeds_found,
|
feeds_found = EXCLUDED.feeds_found,
|
||||||
last_error = EXCLUDED.last_error,
|
last_error = EXCLUDED.last_error,
|
||||||
tld = EXCLUDED.tld
|
tld = EXCLUDED.tld
|
||||||
`, domain.Host, domain.Status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
|
`, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
|
||||||
domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
|
domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// saveDomainTx stores a domain using a transaction
|
// saveDomainTx stores a domain using a transaction
|
||||||
func (c *Crawler) saveDomainTx(tx pgx.Tx, domain *Domain) error {
|
func (c *Crawler) saveDomainTx(tx pgx.Tx, domain *Domain) error {
|
||||||
|
// Auto-deny domains matching spam patterns
|
||||||
|
status := domain.Status
|
||||||
|
if shouldAutoDenyDomain(domain.Host) {
|
||||||
|
status = "denied"
|
||||||
|
}
|
||||||
|
|
||||||
_, err := tx.Exec(context.Background(), `
|
_, err := tx.Exec(context.Background(), `
|
||||||
INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld)
|
INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld)
|
||||||
VALUES ($1, $2, $3, $4, $5, $6, $7)
|
VALUES ($1, $2, $3, $4, $5, $6, $7)
|
||||||
ON CONFLICT(host) DO NOTHING
|
ON CONFLICT(host) DO NOTHING
|
||||||
`, domain.Host, domain.Status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
|
`, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
|
||||||
domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
|
domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@@ -242,10 +263,7 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
|
|||||||
if reverseHostName != "" {
|
if reverseHostName != "" {
|
||||||
host := normalizeHost(reverseHost(reverseHostName))
|
host := normalizeHost(reverseHost(reverseHostName))
|
||||||
tld := getTLD(host)
|
tld := getTLD(host)
|
||||||
// Only import .com domains for now
|
domains = append(domains, domainEntry{host: host, tld: tld})
|
||||||
if tld == "com" {
|
|
||||||
domains = append(domains, domainEntry{host: host, tld: tld})
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -262,10 +280,14 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
// Build rows for copy
|
// Build rows for copy, applying auto-deny for spam patterns
|
||||||
rows := make([][]interface{}, len(domains))
|
rows := make([][]interface{}, len(domains))
|
||||||
for i, d := range domains {
|
for i, d := range domains {
|
||||||
rows[i] = []interface{}{d.host, "unchecked", now, d.tld}
|
status := "unchecked"
|
||||||
|
if shouldAutoDenyDomain(d.host) {
|
||||||
|
status = "denied"
|
||||||
|
}
|
||||||
|
rows[i] = []interface{}{d.host, status, now, d.tld}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use CopyFrom for bulk insert
|
// Use CopyFrom for bulk insert
|
||||||
@@ -280,11 +302,15 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
// Fall back to individual inserts with ON CONFLICT
|
// Fall back to individual inserts with ON CONFLICT
|
||||||
for _, d := range domains {
|
for _, d := range domains {
|
||||||
|
status := "unchecked"
|
||||||
|
if shouldAutoDenyDomain(d.host) {
|
||||||
|
status = "denied"
|
||||||
|
}
|
||||||
c.db.Exec(`
|
c.db.Exec(`
|
||||||
INSERT INTO domains (host, status, discovered_at, tld)
|
INSERT INTO domains (host, status, discovered_at, tld)
|
||||||
VALUES ($1, 'unchecked', $2, $3)
|
VALUES ($1, $2, $3, $4)
|
||||||
ON CONFLICT(host) DO NOTHING
|
ON CONFLICT(host) DO NOTHING
|
||||||
`, d.host, now, d.tld)
|
`, d.host, status, now, d.tld)
|
||||||
}
|
}
|
||||||
imported = int64(len(domains))
|
imported = int64(len(domains))
|
||||||
}
|
}
|
||||||
@@ -361,13 +387,17 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
// Insert with ON CONFLICT
|
// Insert with ON CONFLICT, applying auto-deny for spam patterns
|
||||||
for _, d := range domains {
|
for _, d := range domains {
|
||||||
|
status := "unchecked"
|
||||||
|
if shouldAutoDenyDomain(d.host) {
|
||||||
|
status = "denied"
|
||||||
|
}
|
||||||
result, err := c.db.Exec(`
|
result, err := c.db.Exec(`
|
||||||
INSERT INTO domains (host, status, discovered_at, tld)
|
INSERT INTO domains (host, status, discovered_at, tld)
|
||||||
VALUES ($1, 'unchecked', $2, $3)
|
VALUES ($1, $2, $3, $4)
|
||||||
ON CONFLICT(host) DO NOTHING
|
ON CONFLICT(host) DO NOTHING
|
||||||
`, d.host, now, d.tld)
|
`, d.host, status, now, d.tld)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
skipped++
|
skipped++
|
||||||
} else if result > 0 {
|
} else if result > 0 {
|
||||||
|
|||||||
Reference in New Issue
Block a user