Files
crawler/api_domains.go
primal 7ec4207173 Migrate to normalized FK schema (domain_host, domain_tld)
Replace source_host column with proper FK to domains table using
composite key (domain_host, domain_tld). This enables JOIN queries
instead of string concatenation for domain lookups.

Changes:
- Update Feed struct: SourceHost/TLD → DomainHost/DomainTLD
- Update all SQL queries to use domain_host/domain_tld columns
- Add column aliases (as source_host) for API backwards compatibility
- Update trigram index from source_host to domain_host
- Add getDomainHost() helper for extracting host from domain

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 22:36:25 -05:00

2069 lines
64 KiB
Go

package main
import (
"bufio"
"encoding/json"
"fmt"
"net/http"
"os"
"strings"
"github.com/jackc/pgx/v5"
)
// buildTLDSearchQuery builds the query that lists every TLD containing a
// match for the search, with the number of distinct matching domains per TLD.
//
// sq.Type selects what the lower-cased substring pattern (derived from
// sq.Pattern) is matched against:
//   - "domain": domain hosts. When parseSearchTerm reports a TLD for the
//     pattern (e.g. d:npr.org) the lookup is an exact host + TLD match.
//   - "url", "title", "description": the corresponding feeds column.
//   - "item": item titles, joined to their feed by URL.
//   - anything else ("all"): domain hosts plus feed URL/title/description
//     (NOT items - use the item search for those). If sq.DomainHost and
//     sq.DomainTLD are both set, that exact domain is included as well.
//
// Returns (query, args) for the database query. Every query produces the
// columns (tld, domain_count) ordered by tld.
func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
	pattern := "%" + strings.ToLower(sq.Pattern) + "%"

	// feedTLDQuery renders the per-TLD count for a LIKE match on one feeds
	// column. column is always a literal chosen below, never user input.
	// The feeds table is addressed through domain_tld, matching every other
	// feeds query in this file.
	feedTLDQuery := func(column string) string {
		return `
			SELECT domain_tld::text AS tld, COUNT(DISTINCT domain_host || '.' || domain_tld) AS domain_count
			FROM feeds
			WHERE domain_tld IS NOT NULL AND LOWER(` + column + `) LIKE $1
			GROUP BY domain_tld
			ORDER BY tld ASC
		`
	}

	switch sq.Type {
	case "domain":
		// Check if pattern includes TLD (e.g., d:npr.org -> exact match)
		hostPart, tldFilter := parseSearchTerm(sq.Pattern)
		if tldFilter != "" {
			// Exact match - return just the matching TLD
			return `
				SELECT tld::text AS tld, COUNT(*) AS domain_count
				FROM domains
				WHERE tld = $1 AND LOWER(host) = $2
				GROUP BY tld
				ORDER BY tld ASC
			`, []interface{}{tldFilter, strings.ToLower(hostPart)}
		}
		// Pattern match - search all TLDs
		return `
			SELECT tld::text AS tld, COUNT(*) AS domain_count
			FROM domains
			WHERE LOWER(host) LIKE $1
			GROUP BY tld
			ORDER BY tld ASC
		`, []interface{}{pattern}

	case "url":
		return feedTLDQuery("url"), []interface{}{pattern}

	case "title":
		return feedTLDQuery("title"), []interface{}{pattern}

	case "description":
		return feedTLDQuery("description"), []interface{}{pattern}

	case "item":
		// Search item titles
		return `
			SELECT f.domain_tld::text AS tld, COUNT(DISTINCT f.domain_host || '.' || f.domain_tld) AS domain_count
			FROM feeds f
			INNER JOIN items i ON i.feed_url = f.url
			WHERE f.domain_tld IS NOT NULL AND LOWER(i.title) LIKE $1
			GROUP BY f.domain_tld
			ORDER BY tld ASC
		`, []interface{}{pattern}

	default:
		// "all" - search domains and feeds (NOT items).
		args := []interface{}{pattern}

		// Optional extra UNION arm: exact domain match when the pattern
		// looks like a full domain.
		exactDomainArm := ""
		if sq.DomainHost != "" && sq.DomainTLD != "" {
			exactDomainArm = `
				UNION
				-- Exact domain match
				SELECT tld::text AS tld, host || '.' || tld AS source_host
				FROM domains WHERE LOWER(host) = $2 AND tld::text = $3`
			args = append(args, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD))
		}

		// The derived table exposes only (tld, source_host), so the outer
		// query must count source_host; domain_host/domain_tld are not
		// visible outside the subquery.
		return `
			SELECT tld, COUNT(DISTINCT source_host) AS domain_count FROM (
				-- Domains matching host pattern
				SELECT tld::text AS tld, host || '.' || tld AS source_host
				FROM domains WHERE LOWER(host) LIKE $1` + exactDomainArm + `
				UNION
				-- Feeds matching URL
				SELECT domain_tld::text AS tld, domain_host || '.' || domain_tld AS source_host
				FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(url) LIKE $1
				UNION
				-- Feeds matching title
				SELECT domain_tld::text AS tld, domain_host || '.' || domain_tld AS source_host
				FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(title) LIKE $1
				UNION
				-- Feeds matching description
				SELECT domain_tld::text AS tld, domain_host || '.' || domain_tld AS source_host
				FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(description) LIKE $1
			) combined
			GROUP BY tld
			ORDER BY tld ASC
		`, args
	}
}
// buildDomainSearchQuery renders the search-specific part of a WHERE clause
// for a domains (alias d) / feeds (alias f) query.
//
// Placeholders are numbered starting at argNum. tldFilter, when non-empty,
// adds an equality condition on d.tld. The generated SQL always begins with
// " AND" so it can be appended to an existing condition list.
//
// Returns (whereClause, args, argNum) to append to the base query, where the
// returned argNum is the next unused placeholder number.
func buildDomainSearchQuery(sq SearchQuery, tldFilter string, argNum int) (string, []interface{}, int) {
	likeArg := "%" + strings.ToLower(sq.Pattern) + "%"
	hasTLD := tldFilter != ""

	// singleColumn handles the search types that match one expression with
	// LIKE and then optionally restrict the TLD with the following
	// placeholder. format must contain exactly one $%d verb.
	singleColumn := func(format string) (string, []interface{}, int) {
		clause := fmt.Sprintf(format, argNum)
		params := []interface{}{likeArg}
		next := argNum + 1
		if hasTLD {
			clause += fmt.Sprintf(" AND d.tld = $%d", next)
			params = append(params, tldFilter)
			next++
		}
		return clause, params, next
	}

	switch sq.Type {
	case "domain":
		switch {
		case hasTLD && sq.ExactMatch:
			// d:npr.org -> exact match
			return fmt.Sprintf(" AND d.tld = $%d AND LOWER(d.host) = $%d", argNum, argNum+1),
				[]interface{}{tldFilter, strings.ToLower(sq.Pattern)},
				argNum + 2
		case hasTLD:
			return fmt.Sprintf(" AND d.tld = $%d AND LOWER(d.host) LIKE $%d", argNum, argNum+1),
				[]interface{}{tldFilter, likeArg},
				argNum + 2
		default:
			return fmt.Sprintf(" AND LOWER(d.host) LIKE $%d", argNum),
				[]interface{}{likeArg},
				argNum + 1
		}

	case "url":
		return singleColumn(" AND LOWER(f.url) LIKE $%d")

	case "title":
		return singleColumn(" AND LOWER(f.title) LIKE $%d")

	case "description":
		return singleColumn(" AND LOWER(f.description) LIKE $%d")

	case "item":
		// Items live in their own table, so match through a correlated subquery.
		return singleColumn(" AND EXISTS (SELECT 1 FROM items i WHERE i.feed_url = f.url AND LOWER(i.title) LIKE $%d)")

	default:
		// "all": host, feed URL, title and description share one LIKE
		// placeholder; a domain-shaped pattern additionally matches that
		// exact host + TLD.
		looksLikeDomain := sq.DomainHost != "" && sq.DomainTLD != ""
		exactHost := strings.ToLower(sq.DomainHost)
		exactTLD := strings.ToLower(sq.DomainTLD)

		switch {
		case hasTLD && looksLikeDomain:
			like := argNum + 1
			return fmt.Sprintf(` AND d.tld = $%d AND (
LOWER(d.host) LIKE $%d OR
LOWER(f.url) LIKE $%d OR
LOWER(f.title) LIKE $%d OR
LOWER(f.description) LIKE $%d OR
(LOWER(d.host) = $%d AND d.tld::text = $%d)
)`, argNum, like, like, like, like, argNum+2, argNum+3),
				[]interface{}{tldFilter, likeArg, exactHost, exactTLD},
				argNum + 4

		case hasTLD:
			like := argNum + 1
			return fmt.Sprintf(` AND d.tld = $%d AND (
LOWER(d.host) LIKE $%d OR
LOWER(f.url) LIKE $%d OR
LOWER(f.title) LIKE $%d OR
LOWER(f.description) LIKE $%d
)`, argNum, like, like, like, like),
				[]interface{}{tldFilter, likeArg},
				argNum + 2

		case looksLikeDomain:
			return fmt.Sprintf(` AND (
LOWER(d.host) LIKE $%d OR
LOWER(f.url) LIKE $%d OR
LOWER(f.title) LIKE $%d OR
LOWER(f.description) LIKE $%d OR
(LOWER(d.host) = $%d AND d.tld::text = $%d)
)`, argNum, argNum, argNum, argNum, argNum+1, argNum+2),
				[]interface{}{likeArg, exactHost, exactTLD},
				argNum + 3

		default:
			return fmt.Sprintf(` AND (
LOWER(d.host) LIKE $%d OR
LOWER(f.url) LIKE $%d OR
LOWER(f.title) LIKE $%d OR
LOWER(f.description) LIKE $%d
)`, argNum, argNum, argNum, argNum),
				[]interface{}{likeArg},
				argNum + 1
		}
	}
}
// handleAPIAllDomains writes one page of the cached all-domains list as a
// JSON array.
//
// Query parameters:
//   - offset: index of the first entry to return (default 0).
//   - limit: maximum number of entries (default 100, capped at 100).
//
// Values that fail to parse leave the default in place. Negative values are
// clamped to zero: the page is taken with a slice expression, and a negative
// bound there would panic at run time. An offset past the end of the cache,
// or an empty cache, yields an empty array rather than null.
func (c *Crawler) handleAPIAllDomains(w http.ResponseWriter, r *http.Request) {
	offset := 0
	limit := 100
	params := r.URL.Query()
	if o := params.Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}
	if l := params.Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 100 {
			limit = 100
		}
	}
	if offset < 0 {
		offset = 0
	}
	if limit < 0 {
		limit = 0
	}

	// Serve from cache; only the slice header is copied under the read lock.
	c.statsMu.RLock()
	cached := c.cachedAllDomains
	c.statsMu.RUnlock()

	domains := []DomainStat{}
	if offset < len(cached) {
		end := offset + limit
		if end > len(cached) {
			end = len(cached)
		}
		domains = cached[offset:end]
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}
// handleAPIDomains lists domains with optional status filter, including their feeds
//
// Query parameters read by this handler:
//   - status: exact domain status to match.
//   - has_feeds: "true" restricts the result to domains that have at least
//     one feed with item_count > 0.
//   - search: search text, passed through parseSearchPrefix.
//   - tld: exact TLD to match.
//   - feedMode, feedStatuses, feedTypes: feed-level filters; setting any of
//     them switches to the dynamically built domains/feeds join query.
//   - limit (default 100, capped at 500) and offset (default 0).
//
// The response is a JSON array of domain objects (host, tld, status,
// last_error, feed_count) with the matching feeds attached under "feeds".
// Exactly one of the query branches below runs; they are tested in order, so
// earlier branches take precedence over later ones.
//
// NOTE(review): the result slice is never initialised, so an empty result is
// encoded as JSON null rather than [].
func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
status := r.URL.Query().Get("status")
hasFeeds := r.URL.Query().Get("has_feeds") == "true"
search := r.URL.Query().Get("search")
tldFilter := r.URL.Query().Get("tld")
feedMode := r.URL.Query().Get("feedMode") // include or exclude
feedStatuses := r.URL.Query().Get("feedStatuses") // comma-separated
feedTypes := r.URL.Query().Get("feedTypes") // comma-separated
limit := 100
offset := 0
// The Sscanf results are not checked; a value that does not parse leaves the
// default in place. Only the upper bound of limit is enforced here.
if l := r.URL.Query().Get("limit"); l != "" {
fmt.Sscanf(l, "%d", &limit)
if limit > 500 {
limit = 500
}
}
if o := r.URL.Query().Get("offset"); o != "" {
fmt.Sscanf(o, "%d", &offset)
}
// Parse comma-separated values
var statusList, typeList []string
if feedStatuses != "" {
statusList = strings.Split(feedStatuses, ",")
}
if feedTypes != "" {
typeList = strings.Split(feedTypes, ",")
}
// Parse search prefix for type-specific searching
var searchQuery SearchQuery
if search != "" {
searchQuery = parseSearchPrefix(search)
// Only extract TLD for domain searches (d:npr.org -> exact match for npr.org)
// All other searches use the literal pattern
if searchQuery.Type == "domain" {
hostPart, detectedTLD := parseSearchTerm(searchQuery.Pattern)
if detectedTLD != "" {
searchQuery.Pattern = hostPart
searchQuery.ExactMatch = true // d:npr.org matches exactly npr.org
// An explicit tld query parameter wins over the TLD found in the search term.
if tldFilter == "" {
tldFilter = detectedTLD
}
}
}
}
// First get domains
var rows pgx.Rows
var err error
// If feed filter is specified, query domains that have matching feeds
if len(statusList) > 0 || len(typeList) > 0 || feedMode != "" {
// Build dynamic query to get domains with matching feeds
// argNum tracks the next free positional placeholder; every appended
// condition must push its argument(s) in the same order.
query := `
SELECT DISTINCT d.host, d.tld, d.status, d.last_error, d.feeds_found
FROM domains d
INNER JOIN feeds f ON f.domain_host = d.host AND f.domain_tld = d.tld
WHERE 1=1`
args := []interface{}{}
argNum := 1
if tldFilter != "" {
query += fmt.Sprintf(" AND d.tld = $%d", argNum)
args = append(args, tldFilter)
argNum++
}
if status != "" {
query += fmt.Sprintf(" AND d.status = $%d", argNum)
args = append(args, status)
argNum++
}
// Handle status filters (publish_status for pass/skip/hold/dead)
if len(statusList) > 0 {
if feedMode == "exclude" {
// Exclude mode keeps feeds with a NULL publish_status.
query += fmt.Sprintf(" AND (f.publish_status IS NULL OR f.publish_status NOT IN (SELECT unnest($%d::text[])))", argNum)
} else {
query += fmt.Sprintf(" AND f.publish_status IN (SELECT unnest($%d::text[]))", argNum)
}
args = append(args, statusList)
argNum++
}
// Handle type filters (including special "empty" type)
// "empty" is not compared against f.type; it is translated into a
// condition on f.item_count instead.
if len(typeList) > 0 {
hasEmpty := false
var regularTypes []string
for _, t := range typeList {
if t == "empty" {
hasEmpty = true
} else {
regularTypes = append(regularTypes, t)
}
}
if feedMode == "exclude" {
// Exclude mode
if len(regularTypes) > 0 && hasEmpty {
query += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[]))) AND f.item_count > 0", argNum)
args = append(args, regularTypes)
argNum++
} else if len(regularTypes) > 0 {
query += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[])))", argNum)
args = append(args, regularTypes)
argNum++
} else if hasEmpty {
query += " AND f.item_count > 0"
}
} else {
// Include mode
if len(regularTypes) > 0 && hasEmpty {
query += fmt.Sprintf(" AND (f.type IN (SELECT unnest($%d::text[])) OR f.item_count IS NULL OR f.item_count = 0)", argNum)
args = append(args, regularTypes)
argNum++
} else if len(regularTypes) > 0 {
query += fmt.Sprintf(" AND f.type IN (SELECT unnest($%d::text[]))", argNum)
args = append(args, regularTypes)
argNum++
} else if hasEmpty {
query += " AND (f.item_count IS NULL OR f.item_count = 0)"
}
}
}
if search != "" && searchQuery.Pattern != "" {
searchPattern := "%" + strings.ToLower(searchQuery.Pattern) + "%"
switch searchQuery.Type {
case "domain":
if searchQuery.ExactMatch {
// d:npr.org -> exact match for host "npr" (tld already filtered above)
query += fmt.Sprintf(" AND LOWER(d.host) = $%d", argNum)
args = append(args, strings.ToLower(searchQuery.Pattern))
} else {
// d:npr -> pattern match
query += fmt.Sprintf(" AND LOWER(d.host) LIKE $%d", argNum)
args = append(args, searchPattern)
}
argNum++
case "url":
query += fmt.Sprintf(" AND LOWER(f.url) LIKE $%d", argNum)
args = append(args, searchPattern)
argNum++
case "title":
query += fmt.Sprintf(" AND LOWER(f.title) LIKE $%d", argNum)
args = append(args, searchPattern)
argNum++
case "description":
query += fmt.Sprintf(" AND LOWER(f.description) LIKE $%d", argNum)
args = append(args, searchPattern)
argNum++
case "item":
query += fmt.Sprintf(" AND EXISTS (SELECT 1 FROM items i WHERE i.feed_url = f.url AND LOWER(i.title) LIKE $%d)", argNum)
args = append(args, searchPattern)
argNum++
default:
// "all" - search domains and feeds (NOT items - use i: prefix for item search)
// Also include exact domain match if pattern looks like a domain
if searchQuery.DomainHost != "" && searchQuery.DomainTLD != "" {
query += fmt.Sprintf(` AND (
LOWER(d.host) LIKE $%d OR
LOWER(f.url) LIKE $%d OR
LOWER(f.title) LIKE $%d OR
LOWER(f.description) LIKE $%d OR
(LOWER(d.host) = $%d AND d.tld::text = $%d)
)`, argNum, argNum, argNum, argNum, argNum+1, argNum+2)
args = append(args, searchPattern, strings.ToLower(searchQuery.DomainHost), strings.ToLower(searchQuery.DomainTLD))
argNum += 3
} else {
query += fmt.Sprintf(` AND (
LOWER(d.host) LIKE $%d OR
LOWER(f.url) LIKE $%d OR
LOWER(f.title) LIKE $%d OR
LOWER(f.description) LIKE $%d
)`, argNum, argNum, argNum, argNum)
args = append(args, searchPattern)
argNum++
}
}
}
query += fmt.Sprintf(" ORDER BY d.host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1)
args = append(args, limit, offset)
rows, err = c.db.Query(query, args...)
} else if hasFeeds {
// Only domains with feeds
// In this branch feed_count comes from the aggregated subquery (feeds with
// item_count > 0), not from domains.feeds_found.
// NOTE(review): searchPattern is built from the raw search parameter, not
// from searchQuery.Pattern, and the two tldFilter sub-branches below do not
// apply the search at all - confirm this is intended.
searchPattern := "%" + strings.ToLower(search) + "%"
if tldFilter != "" && status != "" {
// Filter by specific TLD and status
rows, err = c.db.Query(`
SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
FROM domains d
INNER JOIN (
SELECT domain_host, domain_tld, COUNT(*) as feed_count
FROM feeds
WHERE item_count > 0
GROUP BY domain_host, domain_tld
) f ON d.host = f.domain_host AND d.tld = f.domain_tld
WHERE d.tld = $1 AND d.status = $2
ORDER BY d.host ASC
LIMIT $3 OFFSET $4
`, tldFilter, status, limit, offset)
} else if tldFilter != "" {
// Filter by specific TLD only (exclude 'skip' by default)
rows, err = c.db.Query(`
SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
FROM domains d
INNER JOIN (
SELECT domain_host, domain_tld, COUNT(*) as feed_count
FROM feeds
WHERE item_count > 0
GROUP BY domain_host, domain_tld
) f ON d.host = f.domain_host AND d.tld = f.domain_tld
WHERE d.status != 'skip' AND d.tld = $1
ORDER BY d.host ASC
LIMIT $2 OFFSET $3
`, tldFilter, limit, offset)
} else if search != "" {
// Search in domain host only (uses trigram index)
rows, err = c.db.Query(`
SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
FROM domains d
INNER JOIN (
SELECT domain_host, domain_tld, COUNT(*) as feed_count
FROM feeds
WHERE item_count > 0
GROUP BY domain_host, domain_tld
) f ON d.host = f.domain_host AND d.tld = f.domain_tld
WHERE d.status != 'skip' AND LOWER(d.host) LIKE $1
ORDER BY d.tld ASC, d.host ASC
LIMIT $2 OFFSET $3
`, searchPattern, limit, offset)
} else if status != "" {
rows, err = c.db.Query(`
SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
FROM domains d
INNER JOIN (
SELECT domain_host, domain_tld, COUNT(*) as feed_count
FROM feeds
WHERE item_count > 0
GROUP BY domain_host, domain_tld
) f ON d.host = f.domain_host AND d.tld = f.domain_tld
WHERE d.status = $1
ORDER BY d.tld ASC, d.host ASC
LIMIT $2 OFFSET $3
`, status, limit, offset)
} else {
// Default: exclude 'skip' status domains
rows, err = c.db.Query(`
SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
FROM domains d
INNER JOIN (
SELECT domain_host, domain_tld, COUNT(*) as feed_count
FROM feeds
WHERE item_count > 0
GROUP BY domain_host, domain_tld
) f ON d.host = f.domain_host AND d.tld = f.domain_tld
WHERE d.status != 'skip'
ORDER BY d.tld ASC, d.host ASC
LIMIT $1 OFFSET $2
`, limit, offset)
}
// The remaining branches query the domains table alone; the search is only
// ever matched against the host column here, whatever searchQuery.Type is.
// NOTE(review): when search is set but no TLD filter is in effect, none of
// the branches below applies the search - confirm this is intended.
} else if tldFilter != "" && search != "" && status != "" {
// Filter by TLD, status, and search
if searchQuery.ExactMatch {
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE tld = $1 AND status = $2 AND LOWER(host) = $3
ORDER BY host ASC
LIMIT $4 OFFSET $5
`, tldFilter, status, strings.ToLower(searchQuery.Pattern), limit, offset)
} else if searchQuery.DomainHost != "" && strings.ToLower(searchQuery.DomainTLD) == strings.ToLower(tldFilter) {
// Domain-like search with matching TLD - search for exact host
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE tld = $1 AND status = $2 AND LOWER(host) = $3
ORDER BY host ASC
LIMIT $4 OFFSET $5
`, tldFilter, status, strings.ToLower(searchQuery.DomainHost), limit, offset)
} else {
searchPattern := "%" + strings.ToLower(searchQuery.Pattern) + "%"
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE tld = $1 AND status = $2 AND LOWER(host) LIKE $3
ORDER BY host ASC
LIMIT $4 OFFSET $5
`, tldFilter, status, searchPattern, limit, offset)
}
} else if tldFilter != "" && search != "" {
// Filter by TLD and search
// If search looks like a domain with matching TLD, use DomainHost for an exact match
if searchQuery.ExactMatch {
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE tld = $1 AND LOWER(host) = $2
ORDER BY host ASC
LIMIT $3 OFFSET $4
`, tldFilter, strings.ToLower(searchQuery.Pattern), limit, offset)
} else if searchQuery.DomainHost != "" && strings.ToLower(searchQuery.DomainTLD) == strings.ToLower(tldFilter) {
// Domain-like search with matching TLD - search for exact host
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE tld = $1 AND LOWER(host) = $2
ORDER BY host ASC
LIMIT $3 OFFSET $4
`, tldFilter, strings.ToLower(searchQuery.DomainHost), limit, offset)
} else {
searchPattern := "%" + strings.ToLower(searchQuery.Pattern) + "%"
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE tld = $1 AND LOWER(host) LIKE $2
ORDER BY host ASC
LIMIT $3 OFFSET $4
`, tldFilter, searchPattern, limit, offset)
}
} else if tldFilter != "" && status != "" {
// Filter by TLD and status
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE tld = $1 AND status = $2
ORDER BY host ASC
LIMIT $3 OFFSET $4
`, tldFilter, status, limit, offset)
} else if tldFilter != "" {
// Filter by TLD only (show all statuses)
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE tld = $1
ORDER BY host ASC
LIMIT $2 OFFSET $3
`, tldFilter, limit, offset)
} else if status != "" {
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE status = $1
ORDER BY tld ASC, host ASC
LIMIT $2 OFFSET $3
`, status, limit, offset)
} else {
// Default: exclude 'skip' status domains
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE status != 'skip'
ORDER BY tld ASC, host ASC
LIMIT $1 OFFSET $2
`, limit, offset)
}
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
defer rows.Close()
// FeedInfo is the per-feed JSON shape nested under each domain.
type FeedInfo struct {
URL string `json:"url"`
Title string `json:"title,omitempty"`
Type string `json:"type,omitempty"`
Status string `json:"status,omitempty"`
PublishStatus string `json:"publish_status,omitempty"`
Language string `json:"language,omitempty"`
ItemCount int `json:"item_count,omitempty"`
}
// DomainInfo is the per-domain JSON shape of the response array.
type DomainInfo struct {
Host string `json:"host"`
TLD string `json:"tld"`
Status string `json:"status"`
LastError string `json:"last_error,omitempty"`
FeedCount int `json:"feed_count"`
Feeds []FeedInfo `json:"feeds,omitempty"`
}
var domains []DomainInfo
var hosts []string
for rows.Next() {
var d DomainInfo
// tld and last_error are scanned through pointers so NULL values are accepted.
var tld, lastError *string
if err := rows.Scan(&d.Host, &tld, &d.Status, &lastError, &d.FeedCount); err != nil {
// Rows that fail to scan are dropped from the response without reporting.
continue
}
d.TLD = StringValue(tld)
d.LastError = StringValue(lastError)
domains = append(domains, d)
// Build full domain for feed lookup (source_host = host.tld)
fullDomain := d.Host
if d.TLD != "" {
fullDomain = d.Host + "." + d.TLD
}
hosts = append(hosts, fullDomain)
}
// NOTE(review): rows.Err() is not checked after the loop, so an iteration
// error would yield a shortened result with a 200 status.
// Now get feeds for these domains (with actual item count from items table)
// Apply the same feed filters used for domain selection
// Note that the reported item_count is the live COUNT(*) subquery, while the
// "empty" type filter below still tests the stored f.item_count column.
if len(hosts) > 0 {
feedQuery := `
SELECT f.domain_host || '.' || f.domain_tld as source_host, f.url, f.title, f.type, f.status, f.publish_status, f.language,
(SELECT COUNT(*) FROM items WHERE feed_url = f.url) as item_count
FROM feeds f
WHERE f.domain_host || '.' || f.domain_tld = ANY($1)`
feedArgs := []interface{}{hosts}
feedArgNum := 2
// Apply feed status filters (publish_status for pass/skip/hold/dead)
if len(statusList) > 0 {
if feedMode == "exclude" {
feedQuery += fmt.Sprintf(" AND (f.publish_status IS NULL OR f.publish_status NOT IN (SELECT unnest($%d::text[])))", feedArgNum)
} else {
feedQuery += fmt.Sprintf(" AND f.publish_status IN (SELECT unnest($%d::text[]))", feedArgNum)
}
feedArgs = append(feedArgs, statusList)
feedArgNum++
}
// Apply feed type filters (including special "empty" type)
if len(typeList) > 0 {
hasEmpty := false
var regularTypes []string
for _, t := range typeList {
if t == "empty" {
hasEmpty = true
} else {
regularTypes = append(regularTypes, t)
}
}
if feedMode == "exclude" {
if len(regularTypes) > 0 && hasEmpty {
feedQuery += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[]))) AND f.item_count > 0", feedArgNum)
feedArgs = append(feedArgs, regularTypes)
feedArgNum++
} else if len(regularTypes) > 0 {
feedQuery += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[])))", feedArgNum)
feedArgs = append(feedArgs, regularTypes)
feedArgNum++
} else if hasEmpty {
feedQuery += " AND f.item_count > 0"
}
} else {
if len(regularTypes) > 0 && hasEmpty {
feedQuery += fmt.Sprintf(" AND (f.type IN (SELECT unnest($%d::text[])) OR f.item_count IS NULL OR f.item_count = 0)", feedArgNum)
feedArgs = append(feedArgs, regularTypes)
feedArgNum++
} else if len(regularTypes) > 0 {
feedQuery += fmt.Sprintf(" AND f.type IN (SELECT unnest($%d::text[]))", feedArgNum)
feedArgs = append(feedArgs, regularTypes)
feedArgNum++
} else if hasEmpty {
feedQuery += " AND (f.item_count IS NULL OR f.item_count = 0)"
}
}
}
feedQuery += " ORDER BY f.domain_host, f.domain_tld, f.url"
feedRows, err := c.db.Query(feedQuery, feedArgs...)
// A failed feed query is not reported: the domains are still returned,
// just without any feeds attached.
if err == nil {
defer feedRows.Close()
feedsByHost := make(map[string][]FeedInfo)
for feedRows.Next() {
var host string
var f FeedInfo
var title, feedType, status, publishStatus, language *string
var itemCount *int
if err := feedRows.Scan(&host, &f.URL, &title, &feedType, &status, &publishStatus, &language, &itemCount); err != nil {
continue
}
f.Title = StringValue(title)
f.Type = StringValue(feedType)
f.Status = StringValue(status)
f.PublishStatus = StringValue(publishStatus)
f.Language = StringValue(language)
if itemCount != nil {
f.ItemCount = *itemCount
}
feedsByHost[host] = append(feedsByHost[host], f)
}
// Attach feeds to domains (feedsByHost is keyed by full domain)
for i := range domains {
fullHost := domains[i].Host
if domains[i].TLD != "" {
fullHost = domains[i].Host + "." + domains[i].TLD
}
if feeds, ok := feedsByHost[fullHost]; ok {
domains[i].Feeds = feeds
}
}
}
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(domains)
}
// handleAPIDomainsByStatus lists the domains whose status equals the
// required "status" query parameter, ordered by TLD and then host.
//
// Query parameters:
//   - status: required; a missing value is answered with 400.
//   - limit: page size (default 100, capped at 500).
//   - offset: number of rows to skip (default 0).
//
// The response is a JSON array of {host, tld, status, last_error,
// feed_count}; an empty result is encoded as [] rather than null. A query or
// row-iteration failure is answered with 500 instead of a partial list.
func (c *Crawler) handleAPIDomainsByStatus(w http.ResponseWriter, r *http.Request) {
	params := r.URL.Query()
	status := params.Get("status")
	if status == "" {
		http.Error(w, "status parameter required", http.StatusBadRequest)
		return
	}
	limit := 100
	offset := 0
	if l := params.Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := params.Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}

	rows, err := c.db.Query(`
		SELECT host, tld, status, last_error, feeds_found
		FROM domains
		WHERE status = $1
		ORDER BY tld ASC, host ASC
		LIMIT $2 OFFSET $3
	`, status, limit, offset)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type DomainInfo struct {
		Host      string `json:"host"`
		TLD       string `json:"tld"`
		Status    string `json:"status"`
		LastError string `json:"last_error,omitempty"`
		FeedCount int    `json:"feed_count"`
	}

	domains := []DomainInfo{}
	for rows.Next() {
		var d DomainInfo
		// tld and last_error go through pointers so NULL values are accepted.
		var tld, lastError *string
		if err := rows.Scan(&d.Host, &tld, &d.Status, &lastError, &d.FeedCount); err != nil {
			// Best effort: a single unreadable row is skipped.
			continue
		}
		d.TLD = StringValue(tld)
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
	}
	// Next returning false can mean an error, not just end of data.
	if err := rows.Err(); err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}
// handleAPIDomainFeeds lists the feeds that belong to one domain, ordered by
// URL.
//
// Query parameters:
//   - host: required full domain; it is split with stripTLD/getTLD into the
//     (domain_host, domain_tld) pair used for the lookup.
//   - limit: page size (default 100, capped at 500).
//   - offset: number of rows to skip (default 0).
//
// The response is a JSON array of feed objects; an empty result is encoded
// as [] rather than null. A query or row-iteration failure is answered with
// 500 instead of a partial list.
func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) {
	params := r.URL.Query()
	host := params.Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	limit := 100
	offset := 0
	if l := params.Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := params.Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}

	// Parse host into domain_host and domain_tld
	domainHost := stripTLD(host)
	domainTLD := getTLD(host)

	rows, err := c.db.Query(`
		SELECT url, title, type, status, last_error, item_count, publish_status, language
		FROM feeds
		WHERE domain_host = $1 AND domain_tld = $2
		ORDER BY url ASC
		LIMIT $3 OFFSET $4
	`, domainHost, domainTLD, limit, offset)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type FeedInfo struct {
		URL           string `json:"url"`
		Title         string `json:"title"`
		Type          string `json:"type"`
		Status        string `json:"status,omitempty"`
		LastError     string `json:"last_error,omitempty"`
		ItemCount     int    `json:"item_count,omitempty"`
		PublishStatus string `json:"publish_status,omitempty"`
		Language      string `json:"language,omitempty"`
	}

	feeds := []FeedInfo{}
	for rows.Next() {
		var f FeedInfo
		// Every nullable column is scanned through a pointer. That includes
		// type: a feed with a NULL type would otherwise fail to scan and be
		// silently left out of the listing.
		var title, feedType, status, lastError, publishStatus, language *string
		var itemCount *int
		if err := rows.Scan(&f.URL, &title, &feedType, &status, &lastError, &itemCount, &publishStatus, &language); err != nil {
			// Best effort: a single unreadable row is skipped.
			continue
		}
		f.Title = StringValue(title)
		f.Type = StringValue(feedType)
		f.Status = StringValue(status)
		f.LastError = StringValue(lastError)
		f.PublishStatus = StringValue(publishStatus)
		f.Language = StringValue(language)
		if itemCount != nil {
			f.ItemCount = *itemCount
		}
		feeds = append(feeds, f)
	}
	// Next returning false can mean an error, not just end of data.
	if err := rows.Err(); err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(feeds)
}
// handleAPISetDomainStatus changes the status of the domain named by the
// "host" query parameter. The "status" parameter must be 'hold', 'pass', or
// 'skip' (use /api/dropDomain for 'drop'); anything else is answered with 400.
//
// 'skip' is delegated to skipDomain and its result is returned as JSON. The
// other two values update the domains row directly; 'pass' also clears
// last_error. On success the normalized host and the new status are echoed.
func (c *Crawler) handleAPISetDomainStatus(w http.ResponseWriter, r *http.Request) {
	params := r.URL.Query()
	host := params.Get("host")
	status := params.Get("status")

	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	switch status {
	case "hold", "pass", "skip":
		// accepted
	default:
		http.Error(w, "status must be 'hold', 'pass', or 'skip' (use /api/dropDomain for permanent deletion)", http.StatusBadRequest)
		return
	}

	host = normalizeHost(host)

	// Setting to 'skip' triggers takedown (hide content but preserve data)
	if status == "skip" {
		outcome := c.skipDomain(host)
		if outcome.Error != "" {
			http.Error(w, outcome.Error, http.StatusInternalServerError)
			return
		}
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(outcome)
		return
	}

	domainHost := stripTLD(host)
	domainTLD := getTLD(host)

	// Plain status change by default; 'pass' additionally resets last_error.
	update := `
UPDATE domains SET status = $1
WHERE host = $2 AND tld = $3
`
	if status == "pass" {
		update = `
UPDATE domains SET status = $1, last_error = NULL
WHERE host = $2 AND tld = $3
`
	}
	if _, err := c.db.Exec(update, status, domainHost, domainTLD); err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	reply := map[string]string{
		"host":   host,
		"status": status,
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(reply)
}
// handleAPIRevisitDomain resets the domain named by the "host" query
// parameter: status becomes 'pass', crawled_at is set back to the zero
// timestamp and last_error is cleared. It answers with
// {"status": "queued", "host": <host>}; a missing host yields 400 and a
// database error yields 500.
func (c *Crawler) handleAPIRevisitDomain(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	const resetDomain = `
UPDATE domains SET status = 'pass', crawled_at = '0001-01-01 00:00:00', last_error = NULL
WHERE host = $1 AND tld = $2
`
	if _, err := c.db.Exec(resetDomain, stripTLD(host), getTLD(host)); err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	reply := map[string]string{"status": "queued", "host": host}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(reply)
}
// handleAPIPriorityCrawl immediately crawls the domain named by the "host"
// query parameter, inserting it first if it does not exist yet.
//
// Steps: normalize the host, upsert the domains row into a 'pass' state,
// run feedCrawl synchronously, record the outcome with markDomainCrawled and
// finally answer with the host, the number of feeds found, a summary of the
// feeds returned by GetFeedsByHost and, if the crawl failed, its error text.
func (c *Crawler) handleAPIPriorityCrawl(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	host = normalizeHost(host)
	domainHost, domainTLD := stripTLD(host), getTLD(host)

	// Add domain if it doesn't exist, or reset to pass for crawling
	const upsertDomain = `
INSERT INTO domains (host, status, tld)
VALUES ($1, 'pass', $2)
ON CONFLICT(host, tld) DO UPDATE SET status = 'pass', crawled_at = '0001-01-01 00:00:00', last_error = NULL
`
	if _, err := c.db.Exec(upsertDomain, domainHost, domainTLD); err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// The crawl runs in the request goroutine; the response waits for it.
	fmt.Printf("Priority crawl: %s\n", host)
	feedsFound, crawlErr := c.feedCrawl(host)
	var crawlMessage string
	if crawlErr != nil {
		crawlMessage = crawlErr.Error()
	}

	// Record the crawl result (including any error text) on the domain.
	c.markDomainCrawled(domainHost, domainTLD, feedsFound, crawlMessage)

	// A lookup error is ignored here; the response then carries no feeds.
	feeds, _ := c.GetFeedsByHost(host)

	type FeedSummary struct {
		URL      string `json:"url"`
		Title    string `json:"title"`
		Type     string `json:"type"`
		Category string `json:"category"`
		Status   string `json:"status"`
	}
	var summaries []FeedSummary
	for _, feed := range feeds {
		var s FeedSummary
		s.URL = feed.URL
		s.Title = feed.Title
		s.Type = feed.Type
		s.Category = feed.Category
		s.Status = feed.Status
		summaries = append(summaries, s)
	}

	payload := map[string]interface{}{
		"host":        host,
		"feeds_found": feedsFound,
		"feeds":       summaries,
	}
	if crawlErr != nil {
		payload["error"] = crawlMessage
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(payload)
}
// handleAPIFilter handles flexible filtering with stackable parameters and
// dispatches to either filterFeeds or filterDomains.
//
// Query parameters: tld, domain, feedStatus, domainStatus, languages
// (comma-separated), show ("feeds" or "domains"), sort ("alpha" or "feeds"),
// limit (default 100, capped at 500) and offset (default 0).
//
// When show is not given, any feed-level filter (feedStatus, domain or a
// non-empty language list) selects the feeds view; otherwise domains are
// listed.
func (c *Crawler) handleAPIFilter(w http.ResponseWriter, r *http.Request) {
	q := r.URL.Query()

	limit, offset := 100, 0
	if raw := q.Get("limit"); raw != "" {
		fmt.Sscanf(raw, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if raw := q.Get("offset"); raw != "" {
		fmt.Sscanf(raw, "%d", &offset)
	}

	// Split the language list, dropping blank entries. An empty parameter
	// splits into a single blank entry, so the list stays nil in that case.
	var langList []string
	for _, piece := range strings.Split(q.Get("languages"), ",") {
		if trimmed := strings.TrimSpace(piece); trimmed != "" {
			langList = append(langList, trimmed)
		}
	}

	tld := q.Get("tld")
	domain := q.Get("domain")
	feedStatus := q.Get("feedStatus")

	// Determine what to show based on filters
	show := q.Get("show")
	if show == "" {
		show = "domains"
		if feedStatus != "" || domain != "" || len(langList) > 0 {
			show = "feeds"
		}
	}

	switch show {
	case "feeds":
		c.filterFeeds(w, tld, domain, feedStatus, langList, limit, offset)
	default:
		c.filterDomains(w, tld, q.Get("domainStatus"), q.Get("sort"), limit, offset)
	}
}
// filterDomains writes one page of domains as JSON, optionally restricted by
// TLD and by domain status (an empty string disables that filter).
//
// sort == "feeds" orders by feeds_found descending with host as tie-breaker;
// any other value orders by tld, then host. limit and offset are passed to
// the query as LIMIT/OFFSET.
//
// The response body is {"type": "domains", "data": [...]}. A query failure or
// an error raised while iterating the result set produces a 500 instead.
func (c *Crawler) filterDomains(w http.ResponseWriter, tld, status, sort string, limit, offset int) {
	var args []interface{}
	query := `
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE 1=1`

	// Each optional filter appends its argument first so that len(args) is the
	// placeholder number to use.
	if tld != "" {
		args = append(args, tld)
		query += fmt.Sprintf(" AND tld = $%d", len(args))
	}
	if status != "" {
		args = append(args, status)
		query += fmt.Sprintf(" AND status = $%d", len(args))
	}

	// Sort by feed count descending or alphabetically.
	orderBy := "tld ASC, host ASC"
	if sort == "feeds" {
		orderBy = "feeds_found DESC, host ASC"
	}
	query += fmt.Sprintf(" ORDER BY %s LIMIT $%d OFFSET $%d", orderBy, len(args)+1, len(args)+2)
	args = append(args, limit, offset)

	rows, err := c.db.Query(query, args...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type DomainInfo struct {
		Host      string `json:"host"`
		TLD       string `json:"tld"`
		Status    string `json:"status"`
		LastError string `json:"last_error,omitempty"`
		FeedCount int    `json:"feed_count"`
	}

	var domains []DomainInfo
	for rows.Next() {
		var d DomainInfo
		// tld and last_error are scanned through pointers so NULL is accepted.
		var tldVal, lastError *string
		if err := rows.Scan(&d.Host, &tldVal, &d.Status, &lastError, &d.FeedCount); err != nil {
			// Best effort: a row that cannot be decoded is left out.
			continue
		}
		d.TLD = StringValue(tldVal)
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
	}
	// Next returning false can mean "error", not just "no more rows"; do not
	// report a truncated page as success.
	if err := rows.Err(); err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"type": "domains",
		"data": domains,
	})
}
// handleAPITLDDomains writes a JSON array of the domains in one TLD, ordered
// by host.
//
// Query parameters:
//   - tld: required; a missing value yields 400
//   - limit: page size, default 100, capped at 500
//   - offset: number of rows to skip, default 0
//
// A limit or offset that does not parse, or that is negative, falls back to
// its default so that it never reaches the database, where a negative
// LIMIT/OFFSET is rejected with an error. Query and iteration errors yield 500.
func (c *Crawler) handleAPITLDDomains(w http.ResponseWriter, r *http.Request) {
	params := r.URL.Query()
	tld := params.Get("tld")
	if tld == "" {
		http.Error(w, "tld parameter required", http.StatusBadRequest)
		return
	}

	// intParam reads a non-negative integer parameter, returning fallback when
	// the parameter is absent, unparsable, or negative.
	intParam := func(name string, fallback int) int {
		raw := params.Get(name)
		if raw == "" {
			return fallback
		}
		n := fallback
		if _, err := fmt.Sscanf(raw, "%d", &n); err != nil || n < 0 {
			return fallback
		}
		return n
	}

	limit := intParam("limit", 100)
	if limit > 500 {
		limit = 500
	}
	offset := intParam("offset", 0)

	rows, err := c.db.Query(`
SELECT host, status, last_error, feeds_found
FROM domains
WHERE tld = $1
ORDER BY host ASC
LIMIT $2 OFFSET $3
`, tld, limit, offset)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type DomainInfo struct {
		Host      string `json:"host"`
		Status    string `json:"status"`
		LastError string `json:"last_error,omitempty"`
		FeedCount int    `json:"feed_count"`
	}

	var domains []DomainInfo
	for rows.Next() {
		var d DomainInfo
		// last_error is scanned through a pointer so NULL is accepted.
		var lastError *string
		if err := rows.Scan(&d.Host, &d.Status, &lastError, &d.FeedCount); err != nil {
			// Best effort: a row that cannot be decoded is left out.
			continue
		}
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
	}
	// Next returning false can mean "error", not just "no more rows"; do not
	// report a truncated page as success.
	if err := rows.Err(); err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}
// handleAPITLDs writes a JSON array of {tld, domain_count} objects.
//
// Query parameters:
//   - status: domain status (pass, skip, hold, dead)
//   - feedMode: "include" or "exclude"
//   - feedStatuses: comma-separated feed publish statuses
//   - feedTypes: comma-separated feed types; the special value "empty" refers
//     to feeds whose item_count is NULL or 0
//   - search: search string, interpreted by parseSearchPrefix
//
// The data source is chosen in this order:
//  1. any feed filter (feedStatuses, feedTypes, or feedMode=exclude): TLDs are
//     counted from the feeds table, with the search applied on top;
//  2. search only: the query is produced by buildTLDSearchQuery;
//  3. status only: TLDs are counted from domains having that status;
//  4. otherwise: every tld_enum label is listed with its domain count.
//
// Query and iteration errors yield 500.
func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
	params := r.URL.Query()
	status := params.Get("status")             // domain status: pass, skip, hold, dead
	feedMode := params.Get("feedMode")         // include or exclude
	feedStatuses := params.Get("feedStatuses") // comma-separated: pass,skip,hold,dead
	feedTypes := params.Get("feedTypes")       // comma-separated: rss,atom,json,unknown,empty
	search := params.Get("search")             // search query

	// Parse comma-separated values.
	var statusList, typeList []string
	if feedStatuses != "" {
		statusList = strings.Split(feedStatuses, ",")
	}
	if feedTypes != "" {
		typeList = strings.Split(feedTypes, ",")
	}

	var rows pgx.Rows
	var err error

	if len(statusList) > 0 || len(typeList) > 0 || feedMode == "exclude" {
		// A feed filter is present: derive the TLD list from the feeds table.
		// argNum is the next free positional placeholder.
		query := `SELECT domain_tld as tld, COUNT(DISTINCT domain_host || '.' || domain_tld) as domain_count FROM feeds WHERE domain_tld IS NOT NULL`
		args := []interface{}{}
		argNum := 1

		// Status filters apply to publish_status.
		if len(statusList) > 0 {
			if feedMode == "exclude" {
				query += fmt.Sprintf(" AND (publish_status IS NULL OR publish_status NOT IN (SELECT unnest($%d::text[])))", argNum)
			} else {
				query += fmt.Sprintf(" AND publish_status IN (SELECT unnest($%d::text[]))", argNum)
			}
			args = append(args, statusList)
			argNum++
		}

		// Type filters; "empty" is not a stored type but an item_count test.
		if len(typeList) > 0 {
			hasEmpty := false
			var regularTypes []string
			for _, t := range typeList {
				if t == "empty" {
					hasEmpty = true
				} else {
					regularTypes = append(regularTypes, t)
				}
			}
			if feedMode == "exclude" {
				// Exclude mode: drop feeds of these types.
				if len(regularTypes) > 0 && hasEmpty {
					query += fmt.Sprintf(" AND type NOT IN (SELECT unnest($%d::text[])) AND item_count > 0", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if len(regularTypes) > 0 {
					query += fmt.Sprintf(" AND (type IS NULL OR type NOT IN (SELECT unnest($%d::text[])))", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if hasEmpty {
					query += " AND item_count > 0"
				}
			} else {
				// Include mode: keep only feeds of these types.
				if len(regularTypes) > 0 && hasEmpty {
					query += fmt.Sprintf(" AND (type IN (SELECT unnest($%d::text[])) OR item_count IS NULL OR item_count = 0)", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if len(regularTypes) > 0 {
					query += fmt.Sprintf(" AND type IN (SELECT unnest($%d::text[]))", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if hasEmpty {
					query += " AND (item_count IS NULL OR item_count = 0)"
				}
			}
		}

		// The search clause is the last one added, so argNum is not advanced
		// after it.
		if search != "" {
			sq := parseSearchPrefix(search)
			searchPattern := "%" + strings.ToLower(sq.Pattern) + "%"

			switch sq.Type {
			case "domain":
				// A pattern that carries a TLD (d:npr.org) is matched exactly
				// against the full domain; otherwise it is a substring match.
				if _, detectedTLD := parseSearchTerm(sq.Pattern); detectedTLD != "" {
					query += fmt.Sprintf(" AND LOWER(domain_host || '.' || domain_tld) = $%d", argNum)
					args = append(args, strings.ToLower(sq.Pattern))
				} else {
					query += fmt.Sprintf(" AND LOWER(domain_host || '.' || domain_tld) LIKE $%d", argNum)
					args = append(args, searchPattern)
				}
			case "url":
				query += fmt.Sprintf(" AND LOWER(url) LIKE $%d", argNum)
				args = append(args, searchPattern)
			case "title":
				query += fmt.Sprintf(" AND LOWER(title) LIKE $%d", argNum)
				args = append(args, searchPattern)
			case "description":
				query += fmt.Sprintf(" AND LOWER(description) LIKE $%d", argNum)
				args = append(args, searchPattern)
			case "item":
				query += fmt.Sprintf(" AND EXISTS (SELECT 1 FROM items i WHERE i.feed_url = feeds.url AND LOWER(i.title) LIKE $%d)", argNum)
				args = append(args, searchPattern)
			default:
				// "all" - search domains and feeds (NOT items - use i: prefix for item search).
				// Also include an exact domain match if the pattern looks like a domain.
				if sq.DomainHost != "" && sq.DomainTLD != "" {
					fullDomain := strings.ToLower(sq.DomainHost + "." + sq.DomainTLD)
					query += fmt.Sprintf(` AND (
LOWER(domain_host || '.' || domain_tld) LIKE $%d OR
LOWER(url) LIKE $%d OR
LOWER(title) LIKE $%d OR
LOWER(description) LIKE $%d OR
LOWER(domain_host || '.' || domain_tld) = $%d
)`, argNum, argNum, argNum, argNum, argNum+1)
					args = append(args, searchPattern, fullDomain)
				} else {
					query += fmt.Sprintf(` AND (
LOWER(domain_host || '.' || domain_tld) LIKE $%d OR
LOWER(url) LIKE $%d OR
LOWER(title) LIKE $%d OR
LOWER(description) LIKE $%d
)`, argNum, argNum, argNum, argNum)
					args = append(args, searchPattern)
				}
			}
		}

		query += " GROUP BY domain_tld ORDER BY domain_tld ASC"
		rows, err = c.db.Query(query, args...)
	} else if search != "" {
		// Search without feed filters: the helper builds the type-specific query.
		query, args := buildTLDSearchQuery(parseSearchPrefix(search))
		rows, err = c.db.Query(query, args...)
	} else if status != "" {
		// TLDs filtered by domain status.
		rows, err = c.db.Query(`
SELECT tld::text as tld, COUNT(*) as domain_count
FROM domains
WHERE tld IS NOT NULL AND status = $1
GROUP BY tld
HAVING COUNT(*) > 0
ORDER BY tld ASC
`, status)
	} else {
		// All TLDs from the enum with domain counts (0 for unused labels).
		rows, err = c.db.Query(`
SELECT e.enumlabel as tld, COALESCE(d.cnt, 0) as domain_count
FROM pg_enum e
LEFT JOIN (
SELECT tld::text as tld, COUNT(*) as cnt
FROM domains
GROUP BY tld
) d ON e.enumlabel = d.tld
WHERE e.enumtypid = 'tld_enum'::regtype
ORDER BY e.enumlabel ASC
`)
	}
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type TLDInfo struct {
		TLD         string `json:"tld"`
		DomainCount int    `json:"domain_count"`
	}

	var tlds []TLDInfo
	for rows.Next() {
		var t TLDInfo
		if err := rows.Scan(&t.TLD, &t.DomainCount); err != nil {
			// Best effort: a row that cannot be decoded is left out.
			continue
		}
		tlds = append(tlds, t)
	}
	// Next returning false can mean "error", not just "no more rows"; do not
	// report a truncated list as success.
	if err := rows.Err(); err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(tlds)
}
// handleAPITLDStats writes domain and feed statistics for one TLD as a JSON
// object.
//
// Query parameters:
//   - tld: required; a missing value yields 400
//   - search: optional; narrows the counted domains/feeds within the TLD
//
// The response contains "tld", "search" (when given), total_domains and
// total_feeds, per-status counts ({pass,skip,hold,dead}_{domains,feeds}),
// empty_feeds, and per-type counts ({rss,atom,json,unknown}_feeds). Any
// failing query yields 500.
func (c *Crawler) handleAPITLDStats(w http.ResponseWriter, r *http.Request) {
	tld := r.URL.Query().Get("tld")
	if tld == "" {
		http.Error(w, "tld parameter required", http.StatusBadRequest)
		return
	}
	search := r.URL.Query().Get("search")
	stats := map[string]interface{}{
		"tld": tld,
	}

	// Default: filter by TLD only.
	domainWhere, feedWhere := "tld = $1", "domain_tld = $1"
	domainArgs := []interface{}{tld}
	feedArgs := []interface{}{tld}

	if search != "" {
		// Parse search prefix for type-specific searching.
		sq := parseSearchPrefix(search)
		searchPattern := "%" + strings.ToLower(sq.Pattern) + "%"

		// Only domain searches are checked for an embedded TLD.
		var hostPart, detectedTLD string
		if sq.Type == "domain" {
			hostPart, detectedTLD = parseSearchTerm(sq.Pattern)
		}

		if detectedTLD != "" {
			// d:npr.org -> exact match for host "npr" in the requested TLD.
			domainWhere = "tld = $1 AND lower(host) = $2"
			domainArgs = []interface{}{tld, strings.ToLower(hostPart)}
			feedWhere = "domain_tld = $1 AND lower(domain_host || '.' || domain_tld) = $2"
			feedArgs = []interface{}{tld, strings.ToLower(sq.Pattern)}
		} else {
			// d:npr and every other search type -> substring match on the
			// domain name within the requested TLD.
			// NOTE(review): url/title/description/item searches are also
			// matched against the domain name here — confirm this is intended.
			domainWhere = "tld = $1 AND lower(host) LIKE $2"
			domainArgs = []interface{}{tld, searchPattern}
			feedWhere = "domain_tld = $1 AND lower(domain_host || '.' || domain_tld) LIKE $2"
			feedArgs = []interface{}{tld, searchPattern}
		}
		stats["search"] = search
	}

	fail := func(err error) {
		http.Error(w, err.Error(), http.StatusInternalServerError)
	}

	// countGrouped runs a two-column (key, count) query and passes each row to
	// visit. Rows that cannot be decoded are skipped; query and iteration
	// errors are returned.
	countGrouped := func(query string, args []interface{}, visit func(key string, n int)) error {
		rows, err := c.db.Query(query, args...)
		if err != nil {
			return err
		}
		defer rows.Close()
		for rows.Next() {
			var key string
			var n int
			if err := rows.Scan(&key, &n); err != nil {
				continue
			}
			visit(key, n)
		}
		return rows.Err()
	}

	// Domain stats by status.
	var totalDomains int
	if err := c.db.QueryRow(`SELECT COUNT(*) FROM domains WHERE `+domainWhere, domainArgs...).Scan(&totalDomains); err != nil {
		fail(err)
		return
	}
	stats["total_domains"] = totalDomains

	domainsByStatus := map[string]int{}
	err := countGrouped(`SELECT status, COUNT(*) FROM domains WHERE `+domainWhere+` GROUP BY status`, domainArgs,
		func(status string, n int) { domainsByStatus[status] += n })
	if err != nil {
		fail(err)
		return
	}
	stats["pass_domains"] = domainsByStatus["pass"]
	stats["skip_domains"] = domainsByStatus["skip"]
	stats["hold_domains"] = domainsByStatus["hold"]
	stats["dead_domains"] = domainsByStatus["dead"]

	// Feed stats.
	var totalFeeds int
	if err := c.db.QueryRow(`SELECT COUNT(*) FROM feeds WHERE `+feedWhere, feedArgs...).Scan(&totalFeeds); err != nil {
		fail(err)
		return
	}
	stats["total_feeds"] = totalFeeds

	// Feed status counts. COALESCE maps NULL to 'hold' but the grouping is on
	// the raw column, so NULL and 'hold' can arrive as two rows with the same
	// key; counts are accumulated so neither row overwrites the other.
	feedsByStatus := map[string]int{}
	err = countGrouped(`SELECT COALESCE(status, 'hold'), COUNT(*) FROM feeds WHERE `+feedWhere+` GROUP BY status`, feedArgs,
		func(status string, n int) { feedsByStatus[status] += n })
	if err != nil {
		fail(err)
		return
	}
	stats["pass_feeds"] = feedsByStatus["pass"]
	stats["skip_feeds"] = feedsByStatus["skip"]
	stats["hold_feeds"] = feedsByStatus["hold"]
	stats["dead_feeds"] = feedsByStatus["dead"]

	// Empty feeds count.
	var emptyFeeds int
	if err := c.db.QueryRow(`SELECT COUNT(*) FROM feeds WHERE (`+feedWhere+`) AND (item_count IS NULL OR item_count = 0)`, feedArgs...).Scan(&emptyFeeds); err != nil {
		fail(err)
		return
	}
	stats["empty_feeds"] = emptyFeeds

	// Feed type counts; anything other than rss/atom/json counts as unknown.
	var rssFeeds, atomFeeds, jsonFeeds, unknownFeeds int
	err = countGrouped(`SELECT COALESCE(type, 'unknown'), COUNT(*) FROM feeds WHERE `+feedWhere+` GROUP BY type`, feedArgs,
		func(feedType string, n int) {
			switch feedType {
			case "rss":
				rssFeeds += n
			case "atom":
				atomFeeds += n
			case "json":
				jsonFeeds += n
			default:
				unknownFeeds += n
			}
		})
	if err != nil {
		fail(err)
		return
	}
	stats["rss_feeds"] = rssFeeds
	stats["atom_feeds"] = atomFeeds
	stats["json_feeds"] = jsonFeeds
	stats["unknown_feeds"] = unknownFeeds

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(stats)
}
// handleAPISearchStats writes domain and feed statistics for everything that
// matches the required "search" query parameter (400 when missing).
//
// The search string is interpreted by parseSearchPrefix; its type selects
// which columns are matched:
//   - domain: exact match when the pattern carries a TLD (d:npr.org),
//     substring match on the host otherwise
//   - url, title, description: substring match on that feed column
//   - item: substring match on item titles
//   - anything else: substring match on domain, url, title and description,
//     plus an exact domain match when the parsed query names a host and TLD
//
// The response contains total_domains and total_feeds, per-status counts
// ({pass,skip,hold,dead}_{domains,feeds}), empty_feeds, and per-type counts
// ({rss,atom,json,unknown}_feeds). Any failing query yields 500.
func (c *Crawler) handleAPISearchStats(w http.ResponseWriter, r *http.Request) {
	search := r.URL.Query().Get("search")
	if search == "" {
		http.Error(w, "search parameter required", http.StatusBadRequest)
		return
	}

	// Parse search prefix for type-specific searching.
	sq := parseSearchPrefix(search)
	searchPattern := "%" + strings.ToLower(sq.Pattern) + "%"
	stats := map[string]interface{}{}

	// Build the WHERE clauses for the domains and feeds tables.
	var domainWhere, feedWhere string
	var domainArgs, feedArgs []interface{}
	switch sq.Type {
	case "domain":
		hostPart, tldFilter := parseSearchTerm(sq.Pattern)
		if tldFilter != "" {
			// d:npr.org -> exact match
			domainWhere = "tld = $1 AND LOWER(host) = $2"
			domainArgs = []interface{}{tldFilter, strings.ToLower(hostPart)}
			feedWhere = "LOWER(domain_host || '.' || domain_tld) = $1"
			feedArgs = []interface{}{strings.ToLower(sq.Pattern)}
		} else {
			domainWhere = "LOWER(host) LIKE $1"
			domainArgs = []interface{}{searchPattern}
			feedWhere = "LOWER(domain_host || '.' || domain_tld) LIKE $1"
			feedArgs = []interface{}{searchPattern}
		}
	case "url", "title", "description":
		// The search type doubles as the feed column name; the case labels
		// above restrict it to these three literals.
		column := sq.Type
		domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.domain_host = host AND f.domain_tld = tld AND LOWER(f." + column + ") LIKE $1)"
		domainArgs = []interface{}{searchPattern}
		feedWhere = "LOWER(" + column + ") LIKE $1"
		feedArgs = []interface{}{searchPattern}
	case "item":
		domainWhere = "EXISTS (SELECT 1 FROM feeds f INNER JOIN items i ON i.feed_url = f.url WHERE f.domain_host = host AND f.domain_tld = tld AND LOWER(i.title) LIKE $1)"
		domainArgs = []interface{}{searchPattern}
		feedWhere = "EXISTS (SELECT 1 FROM items i WHERE i.feed_url = url AND LOWER(i.title) LIKE $1)"
		feedArgs = []interface{}{searchPattern}
	default:
		// "all" - search domains and feeds (NOT items - use i: prefix for item search).
		// Also include an exact domain match if the pattern looks like a domain.
		if sq.DomainHost != "" && sq.DomainTLD != "" {
			domainWhere = `(
LOWER(host) LIKE $1 OR
(LOWER(host) = $2 AND tld::text = $3) OR
EXISTS (SELECT 1 FROM feeds f WHERE f.domain_host = host AND f.domain_tld = tld AND (
LOWER(f.url) LIKE $1 OR LOWER(f.title) LIKE $1 OR LOWER(f.description) LIKE $1
))
)`
			domainArgs = []interface{}{searchPattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)}
			fullDomain := strings.ToLower(sq.DomainHost + "." + sq.DomainTLD)
			feedWhere = `(
LOWER(domain_host || '.' || domain_tld) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1 OR LOWER(domain_host || '.' || domain_tld) = $2
)`
			feedArgs = []interface{}{searchPattern, fullDomain}
		} else {
			domainWhere = `(
LOWER(host) LIKE $1 OR
EXISTS (SELECT 1 FROM feeds f WHERE f.domain_host = host AND f.domain_tld = tld AND (
LOWER(f.url) LIKE $1 OR LOWER(f.title) LIKE $1 OR LOWER(f.description) LIKE $1
))
)`
			domainArgs = []interface{}{searchPattern}
			feedWhere = `(
LOWER(domain_host || '.' || domain_tld) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1
)`
			feedArgs = []interface{}{searchPattern}
		}
	}

	fail := func(err error) {
		http.Error(w, err.Error(), http.StatusInternalServerError)
	}

	// countGrouped runs a two-column (key, count) query and passes each row to
	// visit. Rows that cannot be decoded are skipped; query and iteration
	// errors are returned.
	countGrouped := func(query string, args []interface{}, visit func(key string, n int)) error {
		rows, err := c.db.Query(query, args...)
		if err != nil {
			return err
		}
		defer rows.Close()
		for rows.Next() {
			var key string
			var n int
			if err := rows.Scan(&key, &n); err != nil {
				continue
			}
			visit(key, n)
		}
		return rows.Err()
	}

	// Count matching domains by status; the total is the sum over all groups.
	totalDomains := 0
	domainsByStatus := map[string]int{}
	err := countGrouped(`SELECT status, COUNT(*) FROM domains WHERE `+domainWhere+` GROUP BY status`, domainArgs,
		func(status string, n int) {
			totalDomains += n
			domainsByStatus[status] += n
		})
	if err != nil {
		fail(err)
		return
	}
	stats["total_domains"] = totalDomains
	stats["pass_domains"] = domainsByStatus["pass"]
	stats["skip_domains"] = domainsByStatus["skip"]
	stats["hold_domains"] = domainsByStatus["hold"]
	stats["dead_domains"] = domainsByStatus["dead"]

	// Count matching feeds by status. COALESCE maps NULL to 'hold' but the
	// grouping is on the raw column, so NULL and 'hold' can arrive as two rows
	// with the same key; counts are accumulated so neither row overwrites the
	// other.
	totalFeeds := 0
	feedsByStatus := map[string]int{}
	err = countGrouped(`SELECT COALESCE(status, 'hold'), COUNT(*) FROM feeds WHERE `+feedWhere+` GROUP BY status`, feedArgs,
		func(status string, n int) {
			totalFeeds += n
			feedsByStatus[status] += n
		})
	if err != nil {
		fail(err)
		return
	}
	stats["total_feeds"] = totalFeeds
	stats["pass_feeds"] = feedsByStatus["pass"]
	stats["skip_feeds"] = feedsByStatus["skip"]
	stats["hold_feeds"] = feedsByStatus["hold"]
	stats["dead_feeds"] = feedsByStatus["dead"]

	// Count empty feeds.
	var emptyFeeds int
	if err := c.db.QueryRow(`SELECT COUNT(*) FROM feeds WHERE (`+feedWhere+`) AND (item_count IS NULL OR item_count = 0)`, feedArgs...).Scan(&emptyFeeds); err != nil {
		fail(err)
		return
	}
	stats["empty_feeds"] = emptyFeeds

	// Feed type counts; anything other than rss/atom/json counts as unknown.
	var rssFeeds, atomFeeds, jsonFeeds, unknownFeeds int
	err = countGrouped(`SELECT COALESCE(type, 'unknown'), COUNT(*) FROM feeds WHERE `+feedWhere+` GROUP BY type`, feedArgs,
		func(feedType string, n int) {
			switch feedType {
			case "rss":
				rssFeeds += n
			case "atom":
				atomFeeds += n
			case "json":
				jsonFeeds += n
			default:
				unknownFeeds += n
			}
		})
	if err != nil {
		fail(err)
		return
	}
	stats["rss_feeds"] = rssFeeds
	stats["atom_feeds"] = atomFeeds
	stats["json_feeds"] = jsonFeeds
	stats["unknown_feeds"] = unknownFeeds

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(stats)
}
// handleAPIDenyDomain marks the domain given by the "host" query parameter as
// skipped (PDS accounts are taken down, stored data is preserved).
//
// Responds 400 when host is missing, 500 with the error text when skipDomain
// reports an error, and otherwise the DomainActionResult as JSON.
func (c *Crawler) handleAPIDenyDomain(w http.ResponseWriter, r *http.Request) {
	target := r.URL.Query().Get("host")
	if len(target) == 0 {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	outcome := c.skipDomain(target)
	if msg := outcome.Error; msg != "" {
		http.Error(w, msg, http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	enc := json.NewEncoder(w)
	enc.Encode(outcome)
}
// DomainActionResult contains the results of a domain action (skip, drop or
// restore) and is what the corresponding API handlers encode as JSON.
type DomainActionResult struct {
	// Success is set to true only after every database update of the action
	// has completed without error.
	Success bool `json:"success"`
	// Host is the domain the action was applied to, as passed by the caller.
	Host string `json:"host"`
	// Action names the operation: "skip", "drop" or "restore".
	Action string `json:"action"`
	// FeedsAffected is the count returned by the feeds UPDATE/DELETE statement.
	FeedsAffected int64 `json:"feeds_affected,omitempty"`
	// ItemsDeleted is the number of items removed; only the drop action sets it.
	ItemsDeleted int64 `json:"items_deleted,omitempty"`
	// AccountsAffected counts the PDS accounts for which the account operation
	// succeeded.
	AccountsAffected int `json:"accounts_affected,omitempty"`
	// AccountErrors holds one "<did>: <error>" entry per PDS account operation
	// that failed; such failures do not abort the action.
	AccountErrors []string `json:"account_errors,omitempty"`
	// Error describes the database failure that aborted the action, if any.
	Error string `json:"error,omitempty"`
}
// getPDSCredentials returns the PDS host and admin password.
//
// Values are taken from the PDS_HOST and PDS_ADMIN_PASSWORD environment
// variables. If either is empty, the file "pds.env" in the working directory
// is scanned for lines starting with "PDS_HOST=" or "PDS_ADMIN_PASSWORD=" and
// used to fill in only the values the environment did not provide; a value
// that is set in the environment is never replaced by the file. When a key
// occurs more than once in the file, the last occurrence wins.
//
// A missing or unreadable file is not an error: whatever was found is
// returned, so either result may be the empty string.
func getPDSCredentials() (pdsHost, pdsAdminPassword string) {
	pdsHost = os.Getenv("PDS_HOST")
	pdsAdminPassword = os.Getenv("PDS_ADMIN_PASSWORD")
	if pdsHost != "" && pdsAdminPassword != "" {
		return pdsHost, pdsAdminPassword
	}

	file, err := os.Open("pds.env")
	if err != nil {
		// No fallback file available; return what the environment had.
		return pdsHost, pdsAdminPassword
	}
	defer file.Close()

	var fileHost, filePassword string
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		line := scanner.Text()
		switch {
		case strings.HasPrefix(line, "PDS_HOST="):
			fileHost = strings.TrimPrefix(line, "PDS_HOST=")
		case strings.HasPrefix(line, "PDS_ADMIN_PASSWORD="):
			filePassword = strings.TrimPrefix(line, "PDS_ADMIN_PASSWORD=")
		}
	}
	// A read error part-way through the file is tolerated: the values parsed
	// up to that point are still used.

	if pdsHost == "" {
		pdsHost = fileHost
	}
	if pdsAdminPassword == "" {
		pdsAdminPassword = filePassword
	}
	return pdsHost, pdsAdminPassword
}
// getDomainDIDs returns the distinct, non-empty publish_account values of the
// feeds belonging to host. The host is split into its domain_host/domain_tld
// parts with stripTLD and getTLD. If the query fails, nil is returned; rows
// that cannot be scanned are skipped.
func (c *Crawler) getDomainDIDs(host string) []string {
	hostPart, tldPart := stripTLD(host), getTLD(host)

	rows, err := c.db.Query(`
SELECT DISTINCT publish_account FROM feeds
WHERE domain_host = $1 AND domain_tld = $2 AND publish_account IS NOT NULL AND publish_account != ''
`, hostPart, tldPart)
	if err != nil {
		return nil
	}
	defer rows.Close()

	var found []string
	for rows.Next() {
		var account string
		if scanErr := rows.Scan(&account); scanErr != nil || account == "" {
			continue
		}
		found = append(found, account)
	}
	return found
}
// skipDomain puts a domain into the "skip" state without deleting anything.
//
// Steps, in order:
//  1. when PDS credentials are available and the domain has publishing
//     accounts, each account is taken down (reason "domain-skip"); failures
//     are collected in AccountErrors and do not stop the action;
//  2. every feed of the domain gets status and publish_status 'skip';
//  3. the domain row itself gets status 'skip'.
//
// A database failure in step 2 or 3 is reported through result.Error and
// leaves Success false.
func (c *Crawler) skipDomain(host string) DomainActionResult {
	outcome := DomainActionResult{Host: host, Action: "skip"}

	pdsHost, pdsAdminPassword := getPDSCredentials()
	accounts := c.getDomainDIDs(host)

	// Takedown PDS accounts (hide content but preserve data).
	if len(accounts) > 0 && pdsHost != "" && pdsAdminPassword != "" {
		publisher := NewPublisher(pdsHost)
		for _, did := range accounts {
			takedownErr := publisher.TakedownAccount(pdsAdminPassword, did, "domain-skip")
			if takedownErr == nil {
				outcome.AccountsAffected++
				continue
			}
			outcome.AccountErrors = append(outcome.AccountErrors, fmt.Sprintf("%s: %v", did, takedownErr))
		}
	}

	// Mark feeds as skipped (but don't delete).
	domainHost, domainTLD := stripTLD(host), getTLD(host)
	updated, err := c.db.Exec(`
UPDATE feeds SET status = 'skip', publish_status = 'skip'
WHERE domain_host = $1 AND domain_tld = $2
`, domainHost, domainTLD)
	if err != nil {
		outcome.Error = fmt.Sprintf("failed to update feeds: %v", err)
		return outcome
	}
	outcome.FeedsAffected = updated

	// Update domain status to skip.
	if _, err = c.db.Exec(`UPDATE domains SET status = 'skip' WHERE host = $1 AND tld = $2`, domainHost, domainTLD); err != nil {
		outcome.Error = fmt.Sprintf("failed to update domain status: %v", err)
		return outcome
	}

	outcome.Success = true
	return outcome
}
// handleAPIDropDomain permanently deletes all data for the domain given by
// the "host" query parameter. Only a domain whose status is "skip" may be
// dropped.
//
// Responds 400 when host is missing or the domain is not skipped, 404 when
// the status lookup fails, 500 with the error text when dropDomain reports an
// error, and otherwise the DomainActionResult as JSON.
func (c *Crawler) handleAPIDropDomain(w http.ResponseWriter, r *http.Request) {
	target := r.URL.Query().Get("host")
	if len(target) == 0 {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	// Verify the domain is currently skipped.
	var current string
	lookupErr := c.db.QueryRow(`SELECT status FROM domains WHERE host = $1 AND tld = $2`, stripTLD(target), getTLD(target)).Scan(&current)
	switch {
	case lookupErr != nil:
		http.Error(w, "domain not found", http.StatusNotFound)
		return
	case current != "skip":
		http.Error(w, "domain must be skipped before dropping", http.StatusBadRequest)
		return
	}

	outcome := c.dropDomain(target)
	if msg := outcome.Error; msg != "" {
		http.Error(w, msg, http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	enc := json.NewEncoder(w)
	enc.Encode(outcome)
}
// dropDomain permanently deletes all data for a domain (feeds, items, PDS
// accounts) and marks the domain row as 'drop'.
//
// Steps, in order:
//  1. when PDS credentials are available and the domain has publishing
//     accounts, each account is deleted; failures are collected in
//     AccountErrors and do not stop the action;
//  2. all items belonging to the domain's feeds are deleted in one statement;
//  3. the domain's feeds are deleted;
//  4. the domain row gets status 'drop'.
//
// A database failure in steps 2-4 aborts the action and is reported through
// result.Error with Success left false. In particular, feeds are not deleted
// when deleting their items failed, so the items can still be found on a
// retry.
func (c *Crawler) dropDomain(host string) DomainActionResult {
	result := DomainActionResult{Host: host, Action: "drop"}

	pdsHost, pdsAdminPassword := getPDSCredentials()
	dids := c.getDomainDIDs(host)

	// Delete PDS accounts.
	if pdsHost != "" && pdsAdminPassword != "" && len(dids) > 0 {
		publisher := NewPublisher(pdsHost)
		for _, did := range dids {
			if err := publisher.DeleteAccount(pdsAdminPassword, did); err != nil {
				result.AccountErrors = append(result.AccountErrors, fmt.Sprintf("%s: %v", did, err))
			} else {
				result.AccountsAffected++
			}
		}
	}

	domainHost := stripTLD(host)
	domainTLD := getTLD(host)

	// Delete items for all feeds of this domain. The feeds are selected by a
	// subquery so this is a single round trip regardless of the feed count.
	itemsDeleted, err := c.db.Exec(`
DELETE FROM items
WHERE feed_url IN (SELECT url FROM feeds WHERE domain_host = $1 AND domain_tld = $2)
`, domainHost, domainTLD)
	if err != nil {
		result.Error = fmt.Sprintf("failed to delete items: %v", err)
		return result
	}
	result.ItemsDeleted = itemsDeleted

	// Delete all feeds from this domain.
	feedsDeleted, err := c.db.Exec(`DELETE FROM feeds WHERE domain_host = $1 AND domain_tld = $2`, domainHost, domainTLD)
	if err != nil {
		result.Error = fmt.Sprintf("failed to delete feeds: %v", err)
		return result
	}
	result.FeedsAffected = feedsDeleted

	// Update domain status to drop.
	_, err = c.db.Exec(`UPDATE domains SET status = 'drop' WHERE host = $1 AND tld = $2`, domainHost, domainTLD)
	if err != nil {
		result.Error = fmt.Sprintf("failed to update domain status: %v", err)
		return result
	}

	result.Success = true
	return result
}
// handleAPIUndenyDomain lifts the skip status from the domain given by the
// "host" query parameter and restores its accounts. Only a domain whose
// status is "skip" can be restored.
//
// Responds 400 when host is missing or the domain is not skipped, 404 when
// the status lookup fails, 500 with the error text when restoreDomain reports
// an error, and otherwise the DomainActionResult as JSON.
func (c *Crawler) handleAPIUndenyDomain(w http.ResponseWriter, r *http.Request) {
	target := r.URL.Query().Get("host")
	if len(target) == 0 {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	// Verify the domain is currently skipped.
	var current string
	lookupErr := c.db.QueryRow(`SELECT status FROM domains WHERE host = $1 AND tld = $2`, stripTLD(target), getTLD(target)).Scan(&current)
	switch {
	case lookupErr != nil:
		http.Error(w, "domain not found", http.StatusNotFound)
		return
	case current != "skip":
		http.Error(w, "domain is not skipped", http.StatusBadRequest)
		return
	}

	outcome := c.restoreDomain(target)
	if msg := outcome.Error; msg != "" {
		http.Error(w, msg, http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	enc := json.NewEncoder(w)
	enc.Encode(outcome)
}
// restoreDomain reverses a skip.
//
// Steps, in order:
//  1. when PDS credentials are available and the domain has publishing
//     accounts, each account is restored (takedown removed); failures are
//     collected in AccountErrors and do not stop the action;
//  2. every feed of the domain gets status and publish_status 'pass';
//  3. the domain row gets status 'pass' and its last_error is cleared.
//
// A database failure in step 2 or 3 is reported through result.Error and
// leaves Success false.
func (c *Crawler) restoreDomain(host string) DomainActionResult {
	outcome := DomainActionResult{Host: host, Action: "restore"}

	pdsHost, pdsAdminPassword := getPDSCredentials()
	accounts := c.getDomainDIDs(host)

	// Restore PDS accounts (remove takedown).
	if len(accounts) > 0 && pdsHost != "" && pdsAdminPassword != "" {
		publisher := NewPublisher(pdsHost)
		for _, did := range accounts {
			restoreErr := publisher.RestoreAccount(pdsAdminPassword, did)
			if restoreErr == nil {
				outcome.AccountsAffected++
				continue
			}
			outcome.AccountErrors = append(outcome.AccountErrors, fmt.Sprintf("%s: %v", did, restoreErr))
		}
	}

	// Restore feeds to pass status.
	domainHost, domainTLD := stripTLD(host), getTLD(host)
	updated, err := c.db.Exec(`
UPDATE feeds SET status = 'pass', publish_status = 'pass'
WHERE domain_host = $1 AND domain_tld = $2
`, domainHost, domainTLD)
	if err != nil {
		outcome.Error = fmt.Sprintf("failed to update feeds: %v", err)
		return outcome
	}
	outcome.FeedsAffected = updated

	// Update domain status back to pass.
	if _, err = c.db.Exec(`
UPDATE domains SET status = 'pass', last_error = NULL
WHERE host = $1 AND tld = $2
`, domainHost, domainTLD); err != nil {
		outcome.Error = fmt.Sprintf("failed to update domain status: %v", err)
		return outcome
	}

	outcome.Success = true
	return outcome
}