v100
This commit is contained in:
@@ -15,7 +15,7 @@ import (
|
||||
)
|
||||
|
||||
// Domain represents a host to be crawled for feeds
|
||||
// Status: hold (pending review), pass (approved), skip (not processing), fail (error)
|
||||
// Status: hold (pending review), pass (approved), skip (not processing)
|
||||
type Domain struct {
|
||||
Host string `json:"host"`
|
||||
Status string `json:"status"`
|
||||
@@ -123,28 +123,12 @@ func (c *Crawler) getDomain(host string) (*Domain, error) {
|
||||
return domain, nil
|
||||
}
|
||||
|
||||
// GetDomainsToCheck returns domains ready for checking (status='pass', never checked)
|
||||
func (c *Crawler) GetDomainsToCheck(limit int) ([]*Domain, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld
|
||||
FROM domains WHERE status = 'pass' AND last_checked_at IS NULL
|
||||
ORDER BY discovered_at ASC
|
||||
LIMIT $1
|
||||
`, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
return c.scanDomains(rows)
|
||||
}
|
||||
|
||||
// GetDomainsToCrawl returns domains ready for crawling (status='pass', checked but not crawled)
|
||||
// GetDomainsToCrawl returns domains ready for crawling (status='pass', not yet crawled)
|
||||
func (c *Crawler) GetDomainsToCrawl(limit int) ([]*Domain, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld
|
||||
FROM domains WHERE status = 'pass' AND last_checked_at IS NOT NULL AND last_crawled_at IS NULL
|
||||
ORDER BY discovered_at ASC
|
||||
FROM domains WHERE status = 'pass' AND last_crawled_at IS NULL
|
||||
ORDER BY discovered_at DESC
|
||||
LIMIT $1
|
||||
`, limit)
|
||||
if err != nil {
|
||||
@@ -180,29 +164,12 @@ func (c *Crawler) scanDomains(rows pgx.Rows) ([]*Domain, error) {
|
||||
return domains, rows.Err()
|
||||
}
|
||||
|
||||
// markDomainChecked updates a domain after the check (HEAD request) stage
|
||||
func (c *Crawler) markDomainChecked(host string, lastError string) error {
|
||||
now := time.Now()
|
||||
if lastError != "" {
|
||||
_, err := c.db.Exec(`
|
||||
UPDATE domains SET status = 'fail', last_checked_at = $1, last_error = $2
|
||||
WHERE host = $3
|
||||
`, now, lastError, normalizeHost(host))
|
||||
return err
|
||||
}
|
||||
_, err := c.db.Exec(`
|
||||
UPDATE domains SET last_checked_at = $1, last_error = NULL
|
||||
WHERE host = $2
|
||||
`, now, normalizeHost(host))
|
||||
return err
|
||||
}
|
||||
|
||||
// markDomainCrawled updates a domain after the crawl stage
|
||||
func (c *Crawler) markDomainCrawled(host string, feedsFound int, lastError string) error {
|
||||
now := time.Now()
|
||||
if lastError != "" {
|
||||
_, err := c.db.Exec(`
|
||||
UPDATE domains SET status = 'fail', last_crawled_at = $1, feeds_found = $2, last_error = $3
|
||||
UPDATE domains SET last_crawled_at = $1, feeds_found = $2, last_error = $3
|
||||
WHERE host = $4
|
||||
`, now, feedsFound, lastError, normalizeHost(host))
|
||||
return err
|
||||
|
||||
Reference in New Issue
Block a user