crawler/api_domains.go
primal 43916c8042 Exclude skip status domains from default API listing
When no status filter is provided, the domains API now excludes
domains with 'skip' status (including bare TLDs) by default.
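A request that omits the status parameter therefore no longer returns
'skip' domains; passing status=skip explicitly still lists them.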

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-29 22:34:24 -05:00

package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"strings"

	"github.com/jackc/pgx/v5"
)
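// handleAPIAllDomains serves pages of the cached list of all domains.
// Query parameters: limit (default and max 100) and offset.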
func (c *Crawler) handleAPIAllDomains(w http.ResponseWriter, r *http.Request) {
	offset := 0
	limit := 100
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 100 {
			limit = 100
		}
	}
	// Clamp negative values: a negative offset or limit would otherwise
	// panic when slicing the cached list below.
	if offset < 0 {
		offset = 0
	}
	if limit < 0 {
		limit = 0
	}
	// Serve from cache (updated once per minute in background)
	c.statsMu.RLock()
	cached := c.cachedAllDomains
	c.statsMu.RUnlock()
	var domains []DomainStat
	if cached != nil && offset < len(cached) {
		end := offset + limit
		if end > len(cached) {
			end = len(cached)
		}
		domains = cached[offset:end]
	}
	if domains == nil {
		domains = []DomainStat{}
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}
// handleAPIDomains lists domains with optional status filter, including their feeds
func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
	status := r.URL.Query().Get("status")
	hasFeeds := r.URL.Query().Get("has_feeds") == "true"
	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}
	// First get domains
	var rows pgx.Rows
	var err error
	if hasFeeds {
		// Only domains with feeds
		if status != "" {
			rows, err = c.db.Query(`
				SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
				FROM domains d
				INNER JOIN (
					SELECT source_host, COUNT(*) as feed_count
					FROM feeds
					GROUP BY source_host
				) f ON d.host = f.source_host
				WHERE d.status = $1
				ORDER BY d.tld ASC, d.host ASC
				LIMIT $2 OFFSET $3
			`, status, limit, offset)
		} else {
			// Default: exclude 'skip' status domains
			rows, err = c.db.Query(`
				SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
				FROM domains d
				INNER JOIN (
					SELECT source_host, COUNT(*) as feed_count
					FROM feeds
					GROUP BY source_host
				) f ON d.host = f.source_host
				WHERE d.status != 'skip'
				ORDER BY d.tld ASC, d.host ASC
				LIMIT $1 OFFSET $2
			`, limit, offset)
		}
	} else if status != "" {
		rows, err = c.db.Query(`
			SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
			FROM domains d
			LEFT JOIN (
				SELECT source_host, COUNT(*) as feed_count
				FROM feeds
				GROUP BY source_host
			) f ON d.host = f.source_host
			WHERE d.status = $1
			ORDER BY d.tld ASC, d.host ASC
			LIMIT $2 OFFSET $3
		`, status, limit, offset)
	} else {
		// Default: exclude 'skip' status domains
		rows, err = c.db.Query(`
			SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
			FROM domains d
			LEFT JOIN (
				SELECT source_host, COUNT(*) as feed_count
				FROM feeds
				GROUP BY source_host
			) f ON d.host = f.source_host
			WHERE d.status != 'skip'
			ORDER BY d.tld ASC, d.host ASC
			LIMIT $1 OFFSET $2
		`, limit, offset)
	}
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()
	type FeedInfo struct {
		URL           string `json:"url"`
		Title         string `json:"title,omitempty"`
		Type          string `json:"type,omitempty"`
		Status        string `json:"status,omitempty"`
		PublishStatus string `json:"publish_status,omitempty"`
	}
	type DomainInfo struct {
		Host      string     `json:"host"`
		TLD       string     `json:"tld"`
		Status    string     `json:"status"`
		LastError string     `json:"last_error,omitempty"`
		FeedCount int        `json:"feed_count"`
		Feeds     []FeedInfo `json:"feeds,omitempty"`
	}
	var domains []DomainInfo
	var hosts []string
	for rows.Next() {
		var d DomainInfo
		var tld, lastError *string
		if err := rows.Scan(&d.Host, &tld, &d.Status, &lastError, &d.FeedCount); err != nil {
			continue
		}
		d.TLD = StringValue(tld)
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
		hosts = append(hosts, d.Host)
	}
	// Now get feeds for these domains
	if len(hosts) > 0 {
		feedRows, err := c.db.Query(`
			SELECT source_host, url, title, type, status, publish_status
			FROM feeds
			WHERE source_host = ANY($1)
			ORDER BY source_host, url
		`, hosts)
		if err == nil {
			defer feedRows.Close()
			feedsByHost := make(map[string][]FeedInfo)
			for feedRows.Next() {
				var host string
				var f FeedInfo
				var title, feedType, status, publishStatus *string
				if err := feedRows.Scan(&host, &f.URL, &title, &feedType, &status, &publishStatus); err != nil {
					continue
				}
				f.Title = StringValue(title)
				f.Type = StringValue(feedType)
				f.Status = StringValue(status)
				f.PublishStatus = StringValue(publishStatus)
				feedsByHost[host] = append(feedsByHost[host], f)
			}
			// Attach feeds to domains
			for i := range domains {
				if feeds, ok := feedsByHost[domains[i].Host]; ok {
					domains[i].Feeds = feeds
				}
			}
		}
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}
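// handleAPIDomainsByStatus lists domains with a given status and their
// feed counts. The status query parameter is required; limit (max 500)
// and offset control paging.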
func (c *Crawler) handleAPIDomainsByStatus(w http.ResponseWriter, r *http.Request) {
	status := r.URL.Query().Get("status")
	if status == "" {
		http.Error(w, "status parameter required", http.StatusBadRequest)
		return
	}
	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}
	rows, err := c.db.Query(`
		SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
		FROM domains d
		LEFT JOIN (
			SELECT source_host, COUNT(*) as feed_count
			FROM feeds
			GROUP BY source_host
		) f ON d.host = f.source_host
		WHERE d.status = $1
		ORDER BY d.tld ASC, d.host ASC
		LIMIT $2 OFFSET $3
	`, status, limit, offset)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()
	type DomainInfo struct {
		Host      string `json:"host"`
		TLD       string `json:"tld"`
		Status    string `json:"status"`
		LastError string `json:"last_error,omitempty"`
		FeedCount int    `json:"feed_count"`
	}
	var domains []DomainInfo
	for rows.Next() {
		var d DomainInfo
		var tld, lastError *string
		if err := rows.Scan(&d.Host, &tld, &d.Status, &lastError, &d.FeedCount); err != nil {
			continue
		}
		d.TLD = StringValue(tld)
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}
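// handleAPIDomainFeeds lists the feeds discovered on one domain. The
// host query parameter is required; limit (max 500) and offset control
// paging.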
func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}
	rows, err := c.db.Query(`
		SELECT url, title, type, status, error_count, last_error, item_count, publish_status, language
		FROM feeds
		WHERE source_host = $1
		ORDER BY url ASC
		LIMIT $2 OFFSET $3
	`, host, limit, offset)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()
	type FeedInfo struct {
		URL           string `json:"url"`
		Title         string `json:"title"`
		Type          string `json:"type"`
		Status        string `json:"status,omitempty"`
		ErrorCount    int    `json:"error_count,omitempty"`
		LastError     string `json:"last_error,omitempty"`
		ItemCount     int    `json:"item_count,omitempty"`
		PublishStatus string `json:"publish_status,omitempty"`
		Language      string `json:"language,omitempty"`
	}
	var feeds []FeedInfo
	for rows.Next() {
		var f FeedInfo
		var title, status, lastError, publishStatus, language *string
		var errorCount, itemCount *int
		if err := rows.Scan(&f.URL, &title, &f.Type, &status, &errorCount, &lastError, &itemCount, &publishStatus, &language); err != nil {
			continue
		}
		f.Title = StringValue(title)
		f.Status = StringValue(status)
		f.LastError = StringValue(lastError)
		f.PublishStatus = StringValue(publishStatus)
		f.Language = StringValue(language)
		if errorCount != nil {
			f.ErrorCount = *errorCount
		}
		if itemCount != nil {
			f.ItemCount = *itemCount
		}
		feeds = append(feeds, f)
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(feeds)
}
// handleAPISetDomainStatus sets the status for a domain
// status must be 'hold', 'pass', 'skip', or 'fail'
func (c *Crawler) handleAPISetDomainStatus(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	status := r.URL.Query().Get("status")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	if status != "hold" && status != "pass" && status != "skip" && status != "fail" {
		http.Error(w, "status must be 'hold', 'pass', 'skip', or 'fail'", http.StatusBadRequest)
		return
	}
	host = normalizeHost(host)
	// When setting to pass, clear any last_error
	var err error
	if status == "pass" {
		_, err = c.db.Exec(`
			UPDATE domains SET status = $1, last_error = NULL
			WHERE host = $2
		`, status, host)
	} else {
		_, err = c.db.Exec(`
			UPDATE domains SET status = $1
			WHERE host = $2
		`, status, host)
	}
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]string{
		"host":   host,
		"status": status,
	})
}
func (c *Crawler) handleAPIRevisitDomain(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	_, err := c.db.Exec(`
		UPDATE domains SET status = 'pass', last_checked_at = NULL, last_crawled_at = NULL, last_error = NULL
		WHERE host = $1
	`, host)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]string{"status": "queued", "host": host})
}
// handleAPIPriorityCrawl immediately crawls a domain (adds it if not exists)
func (c *Crawler) handleAPIPriorityCrawl(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	host = normalizeHost(host)
	// Add domain if it doesn't exist, or reset to pass for crawling
	_, err := c.db.Exec(`
		INSERT INTO domains (host, status, discovered_at, tld)
		VALUES ($1, 'pass', NOW(), $2)
		ON CONFLICT(host) DO UPDATE SET status = 'pass', last_checked_at = NULL, last_crawled_at = NULL, last_error = NULL
	`, host, getTLD(host))
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	// Crawl synchronously
	fmt.Printf("Priority crawl: %s\n", host)
	feedsFound, crawlErr := c.crawlHost(host)
	errStr := ""
	if crawlErr != nil {
		errStr = crawlErr.Error()
	}
	// Mark as crawled
	c.markDomainCrawled(host, feedsFound, errStr)
	// Get the feeds we found
	feeds, _ := c.GetFeedsByHost(host)
	type FeedSummary struct {
		URL      string `json:"url"`
		Title    string `json:"title"`
		Type     string `json:"type"`
		Category string `json:"category"`
		Status   string `json:"status"`
	}
	var feedSummaries []FeedSummary
	for _, f := range feeds {
		feedSummaries = append(feedSummaries, FeedSummary{
			URL:      f.URL,
			Title:    f.Title,
			Type:     f.Type,
			Category: f.Category,
			Status:   f.Status,
		})
	}
	result := map[string]interface{}{
		"host":        host,
		"feeds_found": feedsFound,
		"feeds":       feedSummaries,
	}
	if crawlErr != nil {
		result["error"] = crawlErr.Error()
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(result)
}
// handleAPIFilter handles flexible filtering with stackable parameters
func (c *Crawler) handleAPIFilter(w http.ResponseWriter, r *http.Request) {
	tld := r.URL.Query().Get("tld")
	domain := r.URL.Query().Get("domain")
	feedStatus := r.URL.Query().Get("feedStatus")
	domainStatus := r.URL.Query().Get("domainStatus")
	languages := r.URL.Query().Get("languages") // comma-separated list
	show := r.URL.Query().Get("show")           // "feeds" or "domains"
	sort := r.URL.Query().Get("sort")           // "alpha" or "feeds"
	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}
	// Parse languages into slice
	var langList []string
	if languages != "" {
		for _, lang := range strings.Split(languages, ",") {
			lang = strings.TrimSpace(lang)
			if lang != "" {
				langList = append(langList, lang)
			}
		}
	}
	// Determine what to show based on filters
	if show == "" {
		if feedStatus != "" || domain != "" || len(langList) > 0 {
			show = "feeds"
		} else {
			show = "domains"
		}
	}
	if show == "feeds" {
		c.filterFeeds(w, tld, domain, feedStatus, langList, limit, offset)
	} else {
		c.filterDomains(w, tld, domainStatus, sort, limit, offset)
	}
}
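// filterDomains writes the domains matching the optional tld and status
// filters, sorted by feed count when sort is "feeds", otherwise
// alphabetically by TLD and host.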
func (c *Crawler) filterDomains(w http.ResponseWriter, tld, status, sort string, limit, offset int) {
	var args []interface{}
	argNum := 1
	query := `
		SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
		FROM domains d
		LEFT JOIN (
			SELECT source_host, COUNT(*) as feed_count
			FROM feeds
			GROUP BY source_host
		) f ON d.host = f.source_host
		WHERE 1=1`
	if tld != "" {
		query += fmt.Sprintf(" AND d.tld = $%d", argNum)
		args = append(args, tld)
		argNum++
	}
	if status != "" {
		query += fmt.Sprintf(" AND d.status = $%d", argNum)
		args = append(args, status)
		argNum++
	}
	// Sort by feed count descending or alphabetically
	if sort == "feeds" {
		query += fmt.Sprintf(" ORDER BY feed_count DESC, d.host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1)
	} else {
		query += fmt.Sprintf(" ORDER BY d.tld ASC, d.host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1)
	}
	args = append(args, limit, offset)
	rows, err := c.db.Query(query, args...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()
	type DomainInfo struct {
		Host      string `json:"host"`
		TLD       string `json:"tld"`
		Status    string `json:"status"`
		LastError string `json:"last_error,omitempty"`
		FeedCount int    `json:"feed_count"`
	}
	var domains []DomainInfo
	for rows.Next() {
		var d DomainInfo
		var tldVal, lastError *string
		if err := rows.Scan(&d.Host, &tldVal, &d.Status, &lastError, &d.FeedCount); err != nil {
			continue
		}
		d.TLD = StringValue(tldVal)
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"type": "domains",
		"data": domains,
	})
}
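// handleAPITLDDomains lists the domains under one TLD with their feed
// counts. The tld query parameter is required; limit (max 500) and
// offset control paging.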
func (c *Crawler) handleAPITLDDomains(w http.ResponseWriter, r *http.Request) {
	tld := r.URL.Query().Get("tld")
	if tld == "" {
		http.Error(w, "tld parameter required", http.StatusBadRequest)
		return
	}
	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}
	rows, err := c.db.Query(`
		SELECT d.host, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
		FROM domains d
		LEFT JOIN (
			SELECT source_host, COUNT(*) as feed_count
			FROM feeds
			GROUP BY source_host
		) f ON d.host = f.source_host
		WHERE d.tld = $1
		ORDER BY d.tld ASC, d.host ASC
		LIMIT $2 OFFSET $3
	`, tld, limit, offset)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()
	type DomainInfo struct {
		Host      string `json:"host"`
		Status    string `json:"status"`
		LastError string `json:"last_error,omitempty"`
		FeedCount int    `json:"feed_count"`
	}
	var domains []DomainInfo
	for rows.Next() {
		var d DomainInfo
		var lastError *string
		if err := rows.Scan(&d.Host, &d.Status, &lastError, &d.FeedCount); err != nil {
			continue
		}
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}
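// handleAPITLDs lists known TLDs with their domain counts. With
// has_feeds=true, only TLDs that have at least one domain with feeds
// are included.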
func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
	hasFeeds := r.URL.Query().Get("has_feeds") == "true"
	var rows pgx.Rows
	var err error
	if hasFeeds {
		// Only TLDs that have domains with feeds
		rows, err = c.db.Query(`
			SELECT DISTINCT d.tld, COUNT(DISTINCT d.host) as domain_count
			FROM domains d
			INNER JOIN feeds f ON d.host = f.source_host
			WHERE d.tld IS NOT NULL AND d.tld != ''
			GROUP BY d.tld
			ORDER BY d.tld ASC
		`)
	} else {
		// All TLDs
		rows, err = c.db.Query(`
			SELECT tld, COUNT(*) as domain_count
			FROM domains
			WHERE tld IS NOT NULL AND tld != ''
			GROUP BY tld
			ORDER BY tld ASC
		`)
	}
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()
	type TLDInfo struct {
		TLD         string `json:"tld"`
		DomainCount int    `json:"domain_count"`
	}
	var tlds []TLDInfo
	for rows.Next() {
		var t TLDInfo
		if err := rows.Scan(&t.TLD, &t.DomainCount); err != nil {
			continue
		}
		tlds = append(tlds, t)
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(tlds)
}
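// handleAPITLDStats reports the domain and feed counts for one TLD.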
func (c *Crawler) handleAPITLDStats(w http.ResponseWriter, r *http.Request) {
	tld := r.URL.Query().Get("tld")
	if tld == "" {
		http.Error(w, "tld parameter required", http.StatusBadRequest)
		return
	}
	var domainCount, feedCount int
	err := c.db.QueryRow(`SELECT COUNT(*) FROM domains WHERE tld = $1`, tld).Scan(&domainCount)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	err = c.db.QueryRow(`SELECT COUNT(*) FROM feeds WHERE tld = $1`, tld).Scan(&feedCount)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"tld":          tld,
		"domain_count": domainCount,
		"feed_count":   feedCount,
	})
}
// handleAPIDenyDomain skips a domain and all its feeds
func (c *Crawler) handleAPIDenyDomain(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	// Update domain status to skip
	_, err := c.db.Exec(`UPDATE domains SET status = 'skip' WHERE host = $1`, host)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	// Skip all feeds from this domain
	feedsTag, err := c.db.Exec(`UPDATE feeds SET publish_status = 'skip', status = 'dead' WHERE source_host = $1`, host)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	// Report the affected-row count; encoding the command tag struct
	// returned by Exec directly would serialize as an empty object.
	json.NewEncoder(w).Encode(map[string]interface{}{
		"success":       true,
		"host":          host,
		"feeds_skipped": feedsTag.RowsAffected(),
	})
}
// handleAPIUndenyDomain removes skip status from a domain
func (c *Crawler) handleAPIUndenyDomain(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	// Update domain status back to pass
	_, err := c.db.Exec(`UPDATE domains SET status = 'pass' WHERE host = $1 AND status = 'skip'`, host)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	// Restore feeds to hold status and active
	restoredTag, err := c.db.Exec(`UPDATE feeds SET publish_status = 'hold', status = 'active' WHERE source_host = $1 AND status = 'dead'`, host)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	// As above, report the affected-row count rather than the raw command tag.
	json.NewEncoder(w).Encode(map[string]interface{}{
		"success":        true,
		"host":           host,
		"feeds_restored": restoredTag.RowsAffected(),
	})
}
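// Illustrative wiring (a sketch only; the real route registration lives
// elsewhere in the crawler, so these paths are assumptions):
//
//	mux.HandleFunc("/api/domains", c.handleAPIDomains)
//	mux.HandleFunc("/api/domains/feeds", c.handleAPIDomainFeeds)
//	mux.HandleFunc("/api/tlds", c.handleAPITLDs)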