Split domain processing into separate check and crawl loops
- StartDomainCheckLoop: DNS verification for unchecked domains (1000 workers)
- StartFeedCrawlLoop: Feed discovery on DNS-verified domains (100 workers)

This fixes starvation where 104M unchecked domains blocked 1.2M DNS-verified domains from ever being crawled for feeds.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -141,19 +141,38 @@ func (c *Crawler) getDomain(host string) (*Domain, error) {
|
||||
return domain, nil
|
||||
}
|
||||
|
||||
// GetDomainsToProcess returns domains needing processing (domain_check or feed_crawl)
|
||||
// crawled_at = zero time means needs domain_check, +1 sec means needs feed_crawl
|
||||
// Domains with errors are retried when crawled_at < now (scheduled by ErrorRetryDelay)
|
||||
func (c *Crawler) GetDomainsToProcess(limit int) ([]*Domain, error) {
|
||||
// GetDomainsToCheck returns unchecked domains needing DNS lookup (domain_check)
|
||||
// crawled_at = zero time means needs domain_check
|
||||
func (c *Crawler) GetDomainsToCheck(limit int) ([]*Domain, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT host, status, crawled_at, feeds_found, last_error, tld
|
||||
FROM domains
|
||||
WHERE status = 'pass'
|
||||
AND crawled_at = '0001-01-01 00:00:00'
|
||||
AND last_error IS NULL
|
||||
LIMIT $1
|
||||
`, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
return c.scanDomains(rows)
|
||||
}
|
||||
|
||||
// GetDomainsToCrawl returns DNS-verified domains needing feed discovery (feed_crawl)
|
||||
// crawled_at = +1 sec means passed DNS check, ready for crawl
|
||||
// Also includes domains with errors that are due for retry
|
||||
func (c *Crawler) GetDomainsToCrawl(limit int) ([]*Domain, error) {
|
||||
now := time.Now()
|
||||
rows, err := c.db.Query(`
|
||||
SELECT host, status, crawled_at, feeds_found, last_error, tld
|
||||
FROM domains
|
||||
WHERE status = 'pass' AND (
|
||||
(crawled_at < '0001-01-02' AND last_error IS NULL) -- new domains
|
||||
OR (crawled_at < $1 AND last_error IS NOT NULL) -- retry errors after delay
|
||||
(crawled_at = '0001-01-01 00:00:01' AND last_error IS NULL) -- passed DNS, ready to crawl
|
||||
OR (crawled_at < $1 AND crawled_at > '0001-01-01 00:00:01' AND last_error IS NOT NULL) -- retry errors
|
||||
)
|
||||
ORDER BY crawled_at ASC
|
||||
ORDER BY last_error IS NULL DESC, crawled_at ASC
|
||||
LIMIT $2
|
||||
`, now, limit)
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user