This commit is contained in:
primal
2026-01-30 22:35:08 -05:00
parent f49fc2f0ad
commit be595cb403
14 changed files with 341 additions and 544 deletions
+5 -38
View File
@@ -15,7 +15,7 @@ import (
)
// Domain represents a host to be crawled for feeds
// Status: hold (pending review), pass (approved), skip (not processing), fail (error)
// Status: hold (pending review), pass (approved), skip (not processing)
type Domain struct {
Host string `json:"host"`
Status string `json:"status"`
@@ -123,28 +123,12 @@ func (c *Crawler) getDomain(host string) (*Domain, error) {
return domain, nil
}
// GetDomainsToCheck fetches at most limit rows from the domains table whose
// status is 'pass' and whose last_checked_at is NULL, ordered by discovered_at
// ascending (earliest discovery first).
//
// A query error is returned as-is with a nil slice. Otherwise the rows are
// handed to scanDomains, whose result is returned directly.
func (c *Crawler) GetDomainsToCheck(limit int) ([]*Domain, error) {
	// The statement text is kept exactly as written; only the row cap is bound.
	const uncheckedQuery = `
SELECT host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld
FROM domains WHERE status = 'pass' AND last_checked_at IS NULL
ORDER BY discovered_at ASC
LIMIT $1
`
	result, queryErr := c.db.Query(uncheckedQuery, limit)
	if queryErr != nil {
		return nil, queryErr
	}
	// Release the result set once scanDomains has finished reading it.
	defer result.Close()

	return c.scanDomains(result)
}
// GetDomainsToCrawl returns domains ready for crawling (status='pass', checked but not crawled)
// GetDomainsToCrawl returns domains ready for crawling (status='pass', not yet crawled)
func (c *Crawler) GetDomainsToCrawl(limit int) ([]*Domain, error) {
rows, err := c.db.Query(`
SELECT host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld
FROM domains WHERE status = 'pass' AND last_checked_at IS NOT NULL AND last_crawled_at IS NULL
ORDER BY discovered_at ASC
FROM domains WHERE status = 'pass' AND last_crawled_at IS NULL
ORDER BY discovered_at DESC
LIMIT $1
`, limit)
if err != nil {
@@ -180,29 +164,12 @@ func (c *Crawler) scanDomains(rows pgx.Rows) ([]*Domain, error) {
return domains, rows.Err()
}
// markDomainChecked records the outcome of the check stage for host.
//
// The host is passed through normalizeHost before being used as the row key,
// and last_checked_at is set to the current time in both cases:
//   - lastError == "": last_error is cleared to NULL; status is left untouched.
//   - lastError != "": status becomes 'fail' and last_error stores the message.
//
// The error from the UPDATE statement, if any, is returned unchanged.
func (c *Crawler) markDomainChecked(host string, lastError string) error {
	checkedAt := time.Now()
	target := normalizeHost(host)

	var execErr error
	if lastError == "" {
		// Successful check: stamp the time and wipe any previous error.
		_, execErr = c.db.Exec(`
UPDATE domains SET last_checked_at = $1, last_error = NULL
WHERE host = $2
`, checkedAt, target)
	} else {
		// Failed check: flag the domain and keep the reason alongside the timestamp.
		_, execErr = c.db.Exec(`
UPDATE domains SET status = 'fail', last_checked_at = $1, last_error = $2
WHERE host = $3
`, checkedAt, lastError, target)
	}
	return execErr
}
// markDomainCrawled updates a domain after the crawl stage
func (c *Crawler) markDomainCrawled(host string, feedsFound int, lastError string) error {
now := time.Now()
if lastError != "" {
_, err := c.db.Exec(`
UPDATE domains SET status = 'fail', last_crawled_at = $1, feeds_found = $2, last_error = $3
UPDATE domains SET last_crawled_at = $1, feeds_found = $2, last_error = $3
WHERE host = $4
`, now, feedsFound, lastError, normalizeHost(host))
return err