Files
crawler/api_domains.go
primal 81146fd572 Fix domain search when pattern looks like domain
When searching for "npr.org" and viewing the .org TLD, use the host part
("npr") for matching instead of the full pattern ("npr.org").

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 19:19:21 -05:00

2057 lines
62 KiB
Go

package main
import (
"bufio"
"encoding/json"
"fmt"
"net/http"
"os"
"strings"
"github.com/jackc/pgx/v5"
)
// buildTLDSearchQuery builds a query that aggregates matching domains per TLD
// for the given search. It returns the SQL text plus its positional args.
func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
	like := "%" + strings.ToLower(sq.Pattern) + "%"
	switch sq.Type {
	case "domain":
		// A pattern such as "npr.org" carries its own TLD: match that host exactly.
		host, tld := parseSearchTerm(sq.Pattern)
		if tld != "" {
			return `
SELECT tld::text as tld, COUNT(*) as domain_count
FROM domains
WHERE tld = $1 AND LOWER(host) = $2
GROUP BY tld
ORDER BY tld ASC
`, []interface{}{tld, strings.ToLower(host)}
		}
		// No TLD in the pattern: LIKE-match hosts across every TLD.
		return `
SELECT tld::text as tld, COUNT(*) as domain_count
FROM domains
WHERE LOWER(host) LIKE $1
GROUP BY tld
ORDER BY tld ASC
`, []interface{}{like}
	case "url", "title", "description":
		// These three differ only in the feed column being matched.
		col := map[string]string{"url": "url", "title": "title", "description": "description"}[sq.Type]
		return fmt.Sprintf(`
SELECT tld, COUNT(DISTINCT source_host) as domain_count
FROM feeds
WHERE tld IS NOT NULL AND LOWER(%s) LIKE $1
GROUP BY tld
ORDER BY tld ASC
`, col), []interface{}{like}
	case "item":
		// Item-title search requires joining items onto feeds.
		return `
SELECT f.tld, COUNT(DISTINCT f.source_host) as domain_count
FROM feeds f
INNER JOIN items i ON i.feed_url = f.url
WHERE f.tld IS NOT NULL AND LOWER(i.title) LIKE $1
GROUP BY f.tld
ORDER BY f.tld ASC
`, []interface{}{like}
	default:
		// "all" - search domains and feeds (NOT items - use i: prefix for item search)
		if sq.DomainHost == "" || sq.DomainTLD == "" {
			return `
SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM (
-- Domains matching host
SELECT tld::text as tld, host || '.' || tld as source_host
FROM domains WHERE LOWER(host) LIKE $1
UNION
-- Feeds matching URL
SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(url) LIKE $1
UNION
-- Feeds matching title
SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(title) LIKE $1
UNION
-- Feeds matching description
SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(description) LIKE $1
) combined
GROUP BY tld
ORDER BY tld ASC
`, []interface{}{like}
		}
		// Pattern looks like a domain: additionally include the exact host/TLD hit.
		return `
SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM (
-- Domains matching host pattern
SELECT tld::text as tld, host || '.' || tld as source_host
FROM domains WHERE LOWER(host) LIKE $1
UNION
-- Exact domain match
SELECT tld::text as tld, host || '.' || tld as source_host
FROM domains WHERE LOWER(host) = $2 AND tld::text = $3
UNION
-- Feeds matching URL
SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(url) LIKE $1
UNION
-- Feeds matching title
SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(title) LIKE $1
UNION
-- Feeds matching description
SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(description) LIKE $1
) combined
GROUP BY tld
ORDER BY tld ASC
`, []interface{}{like, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)}
	}
}
// buildDomainSearchQuery builds the WHERE-clause fragment (and its args) that
// narrows a domain listing query according to the search type.
//
// Parameters:
//   sq        - parsed search (type, pattern, optional domain-shaped parts)
//   tldFilter - optional TLD to pin results to ("" = all TLDs)
//   argNum    - next free positional-parameter index ($N) in the outer query
//
// Returns the clause (starting with " AND ..."), the args it consumes, and the
// next free parameter index for the caller to continue from.
func buildDomainSearchQuery(sq SearchQuery, tldFilter string, argNum int) (string, []interface{}, int) {
	pattern := "%" + strings.ToLower(sq.Pattern) + "%"
	var where string
	var args []interface{}

	// appendTLD appends the shared "restrict to one TLD" condition; used by
	// every case except "domain" (which folds the TLD in differently).
	appendTLD := func() {
		if tldFilter != "" {
			where += fmt.Sprintf(" AND d.tld = $%d", argNum)
			args = append(args, tldFilter)
			argNum++
		}
	}

	switch sq.Type {
	case "domain":
		if sq.ExactMatch && tldFilter != "" {
			// d:npr.org -> exact match
			where = fmt.Sprintf(" AND d.tld = $%d AND LOWER(d.host) = $%d", argNum, argNum+1)
			args = []interface{}{tldFilter, strings.ToLower(sq.Pattern)}
			argNum += 2
		} else if tldFilter != "" {
			where = fmt.Sprintf(" AND d.tld = $%d AND LOWER(d.host) LIKE $%d", argNum, argNum+1)
			args = []interface{}{tldFilter, pattern}
			argNum += 2
		} else {
			where = fmt.Sprintf(" AND LOWER(d.host) LIKE $%d", argNum)
			args = []interface{}{pattern}
			argNum++
		}
	case "url", "title", "description":
		// These cases are identical except for the feed column searched;
		// sq.Type doubles as the column name (f.url / f.title / f.description).
		where = fmt.Sprintf(" AND LOWER(f.%s) LIKE $%d", sq.Type, argNum)
		args = []interface{}{pattern}
		argNum++
		appendTLD()
	case "item":
		// Item-title search needs an EXISTS subquery against the items table.
		where = fmt.Sprintf(" AND EXISTS (SELECT 1 FROM items i WHERE i.feed_url = f.url AND LOWER(i.title) LIKE $%d)", argNum)
		args = []interface{}{pattern}
		argNum++
		appendTLD()
	default:
		// "all" - search everything, also include exact domain match if pattern looks like a domain
		if tldFilter != "" {
			if sq.DomainHost != "" && sq.DomainTLD != "" {
				where = fmt.Sprintf(` AND d.tld = $%d AND (
LOWER(d.host) LIKE $%d OR
LOWER(f.url) LIKE $%d OR
LOWER(f.title) LIKE $%d OR
LOWER(f.description) LIKE $%d OR
(LOWER(d.host) = $%d AND d.tld::text = $%d)
)`, argNum, argNum+1, argNum+1, argNum+1, argNum+1, argNum+2, argNum+3)
				args = []interface{}{tldFilter, pattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)}
				argNum += 4
			} else {
				where = fmt.Sprintf(` AND d.tld = $%d AND (
LOWER(d.host) LIKE $%d OR
LOWER(f.url) LIKE $%d OR
LOWER(f.title) LIKE $%d OR
LOWER(f.description) LIKE $%d
)`, argNum, argNum+1, argNum+1, argNum+1, argNum+1)
				args = []interface{}{tldFilter, pattern}
				argNum += 2
			}
		} else {
			if sq.DomainHost != "" && sq.DomainTLD != "" {
				where = fmt.Sprintf(` AND (
LOWER(d.host) LIKE $%d OR
LOWER(f.url) LIKE $%d OR
LOWER(f.title) LIKE $%d OR
LOWER(f.description) LIKE $%d OR
(LOWER(d.host) = $%d AND d.tld::text = $%d)
)`, argNum, argNum, argNum, argNum, argNum+1, argNum+2)
				args = []interface{}{pattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)}
				argNum += 3
			} else {
				where = fmt.Sprintf(` AND (
LOWER(d.host) LIKE $%d OR
LOWER(f.url) LIKE $%d OR
LOWER(f.title) LIKE $%d OR
LOWER(f.description) LIKE $%d
)`, argNum, argNum, argNum, argNum)
				args = []interface{}{pattern}
				argNum++
			}
		}
	}
	return where, args, argNum
}
// handleAPIAllDomains serves a page of the cached, pre-sorted domain list.
// The cache (c.cachedAllDomains) is refreshed once per minute in the
// background, so this handler never touches the database.
// Query params: offset (default 0), limit (default 100, capped at 100).
func (c *Crawler) handleAPIAllDomains(w http.ResponseWriter, r *http.Request) {
	offset := 0
	limit := 100
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 100 {
			limit = 100
		}
	}
	// Clamp negative values: a negative offset or limit from the query string
	// would otherwise panic when slicing the cache below.
	if offset < 0 {
		offset = 0
	}
	if limit < 0 {
		limit = 0
	}
	// Serve from cache (updated once per minute in background)
	c.statsMu.RLock()
	cached := c.cachedAllDomains
	c.statsMu.RUnlock()
	var domains []DomainStat
	if cached != nil && offset < len(cached) {
		end := offset + limit
		if end > len(cached) {
			end = len(cached)
		}
		domains = cached[offset:end]
	}
	// Encode [] rather than null when the page is empty.
	if domains == nil {
		domains = []DomainStat{}
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}
// handleAPIDomains lists domains with optional status filter, including their feeds
//
// Query params:
//   status       - domain status filter (e.g. pass/skip/hold)
//   has_feeds    - "true" limits results to domains with at least one feed with items
//   search       - free text, optionally prefixed (parsed by parseSearchPrefix)
//   tld          - restrict to one TLD
//   feedMode     - "include" or "exclude" semantics for feedStatuses/feedTypes
//   feedStatuses - comma-separated publish statuses
//   feedTypes    - comma-separated feed types; special value "empty" means zero items
//   limit/offset - pagination (limit capped at 500)
func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
status := r.URL.Query().Get("status")
hasFeeds := r.URL.Query().Get("has_feeds") == "true"
search := r.URL.Query().Get("search")
tldFilter := r.URL.Query().Get("tld")
feedMode := r.URL.Query().Get("feedMode") // include or exclude
feedStatuses := r.URL.Query().Get("feedStatuses") // comma-separated
feedTypes := r.URL.Query().Get("feedTypes") // comma-separated
limit := 100
offset := 0
if l := r.URL.Query().Get("limit"); l != "" {
fmt.Sscanf(l, "%d", &limit)
if limit > 500 {
limit = 500
}
}
if o := r.URL.Query().Get("offset"); o != "" {
fmt.Sscanf(o, "%d", &offset)
}
// Parse comma-separated values
var statusList, typeList []string
if feedStatuses != "" {
statusList = strings.Split(feedStatuses, ",")
}
if feedTypes != "" {
typeList = strings.Split(feedTypes, ",")
}
// Parse search prefix for type-specific searching
var searchQuery SearchQuery
if search != "" {
searchQuery = parseSearchPrefix(search)
// Only extract TLD for domain searches (d:npr.org -> exact match for npr.org)
// All other searches use the literal pattern
if searchQuery.Type == "domain" {
hostPart, detectedTLD := parseSearchTerm(searchQuery.Pattern)
if detectedTLD != "" {
searchQuery.Pattern = hostPart
searchQuery.ExactMatch = true // d:npr.org matches exactly npr.org
if tldFilter == "" {
tldFilter = detectedTLD
}
}
}
}
// First get domains
var rows pgx.Rows
var err error
// If feed filter is specified, query domains that have matching feeds
// (dynamic SQL: argNum tracks the next free $N placeholder; the order of
// the blocks below must match the order args are appended).
if len(statusList) > 0 || len(typeList) > 0 || feedMode != "" {
// Build dynamic query to get domains with matching feeds
query := `
SELECT DISTINCT d.host, d.tld, d.status, d.last_error, d.feeds_found
FROM domains d
INNER JOIN feeds f ON f.source_host = (d.host || '.' || d.tld)
WHERE 1=1`
args := []interface{}{}
argNum := 1
if tldFilter != "" {
query += fmt.Sprintf(" AND d.tld = $%d", argNum)
args = append(args, tldFilter)
argNum++
}
if status != "" {
query += fmt.Sprintf(" AND d.status = $%d", argNum)
args = append(args, status)
argNum++
}
// Handle status filters (publish_status for pass/skip/hold/dead)
if len(statusList) > 0 {
if feedMode == "exclude" {
query += fmt.Sprintf(" AND (f.publish_status IS NULL OR f.publish_status NOT IN (SELECT unnest($%d::text[])))", argNum)
} else {
query += fmt.Sprintf(" AND f.publish_status IN (SELECT unnest($%d::text[]))", argNum)
}
args = append(args, statusList)
argNum++
}
// Handle type filters (including special "empty" type)
if len(typeList) > 0 {
hasEmpty := false
var regularTypes []string
for _, t := range typeList {
if t == "empty" {
hasEmpty = true
} else {
regularTypes = append(regularTypes, t)
}
}
if feedMode == "exclude" {
// Exclude mode
if len(regularTypes) > 0 && hasEmpty {
query += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[]))) AND f.item_count > 0", argNum)
args = append(args, regularTypes)
argNum++
} else if len(regularTypes) > 0 {
query += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[])))", argNum)
args = append(args, regularTypes)
argNum++
} else if hasEmpty {
query += " AND f.item_count > 0"
}
} else {
// Include mode
if len(regularTypes) > 0 && hasEmpty {
query += fmt.Sprintf(" AND (f.type IN (SELECT unnest($%d::text[])) OR f.item_count IS NULL OR f.item_count = 0)", argNum)
args = append(args, regularTypes)
argNum++
} else if len(regularTypes) > 0 {
query += fmt.Sprintf(" AND f.type IN (SELECT unnest($%d::text[]))", argNum)
args = append(args, regularTypes)
argNum++
} else if hasEmpty {
query += " AND (f.item_count IS NULL OR f.item_count = 0)"
}
}
}
if search != "" && searchQuery.Pattern != "" {
searchPattern := "%" + strings.ToLower(searchQuery.Pattern) + "%"
switch searchQuery.Type {
case "domain":
if searchQuery.ExactMatch {
// d:npr.org -> exact match for host "npr" (tld already filtered above)
query += fmt.Sprintf(" AND LOWER(d.host) = $%d", argNum)
args = append(args, strings.ToLower(searchQuery.Pattern))
} else {
// d:npr -> pattern match
query += fmt.Sprintf(" AND LOWER(d.host) LIKE $%d", argNum)
args = append(args, searchPattern)
}
argNum++
case "url":
query += fmt.Sprintf(" AND LOWER(f.url) LIKE $%d", argNum)
args = append(args, searchPattern)
argNum++
case "title":
query += fmt.Sprintf(" AND LOWER(f.title) LIKE $%d", argNum)
args = append(args, searchPattern)
argNum++
case "description":
query += fmt.Sprintf(" AND LOWER(f.description) LIKE $%d", argNum)
args = append(args, searchPattern)
argNum++
case "item":
query += fmt.Sprintf(" AND EXISTS (SELECT 1 FROM items i WHERE i.feed_url = f.url AND LOWER(i.title) LIKE $%d)", argNum)
args = append(args, searchPattern)
argNum++
default:
// "all" - search domains and feeds (NOT items - use i: prefix for item search)
// Also include exact domain match if pattern looks like a domain
if searchQuery.DomainHost != "" && searchQuery.DomainTLD != "" {
query += fmt.Sprintf(` AND (
LOWER(d.host) LIKE $%d OR
LOWER(f.url) LIKE $%d OR
LOWER(f.title) LIKE $%d OR
LOWER(f.description) LIKE $%d OR
(LOWER(d.host) = $%d AND d.tld::text = $%d)
)`, argNum, argNum, argNum, argNum, argNum+1, argNum+2)
args = append(args, searchPattern, strings.ToLower(searchQuery.DomainHost), strings.ToLower(searchQuery.DomainTLD))
argNum += 3
} else {
query += fmt.Sprintf(` AND (
LOWER(d.host) LIKE $%d OR
LOWER(f.url) LIKE $%d OR
LOWER(f.title) LIKE $%d OR
LOWER(f.description) LIKE $%d
)`, argNum, argNum, argNum, argNum)
args = append(args, searchPattern)
argNum++
}
}
}
query += fmt.Sprintf(" ORDER BY d.host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1)
args = append(args, limit, offset)
rows, err = c.db.Query(query, args...)
} else if hasFeeds {
// Only domains with feeds
// NOTE(review): searchPattern below is built from the raw search string,
// prefix included (e.g. "d:npr" would match literally) — presumably fine
// because prefixed searches normally take the filter branch above; confirm.
searchPattern := "%" + strings.ToLower(search) + "%"
if tldFilter != "" && status != "" {
// Filter by specific TLD and status
rows, err = c.db.Query(`
SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
FROM domains d
INNER JOIN (
SELECT source_host, COUNT(*) as feed_count
FROM feeds
WHERE item_count > 0
GROUP BY source_host
) f ON (d.host || '.' || d.tld) = f.source_host
WHERE d.tld = $1 AND d.status = $2
ORDER BY d.host ASC
LIMIT $3 OFFSET $4
`, tldFilter, status, limit, offset)
} else if tldFilter != "" {
// Filter by specific TLD only (exclude 'skip' by default)
rows, err = c.db.Query(`
SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
FROM domains d
INNER JOIN (
SELECT source_host, COUNT(*) as feed_count
FROM feeds
WHERE item_count > 0
GROUP BY source_host
) f ON (d.host || '.' || d.tld) = f.source_host
WHERE d.status != 'skip' AND d.tld = $1
ORDER BY d.host ASC
LIMIT $2 OFFSET $3
`, tldFilter, limit, offset)
} else if search != "" {
// Search in domain host only (uses trigram index)
rows, err = c.db.Query(`
SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
FROM domains d
INNER JOIN (
SELECT source_host, COUNT(*) as feed_count
FROM feeds
WHERE item_count > 0
GROUP BY source_host
) f ON (d.host || '.' || d.tld) = f.source_host
WHERE d.status != 'skip' AND LOWER(d.host) LIKE $1
ORDER BY d.tld ASC, d.host ASC
LIMIT $2 OFFSET $3
`, searchPattern, limit, offset)
} else if status != "" {
rows, err = c.db.Query(`
SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
FROM domains d
INNER JOIN (
SELECT source_host, COUNT(*) as feed_count
FROM feeds
WHERE item_count > 0
GROUP BY source_host
) f ON (d.host || '.' || d.tld) = f.source_host
WHERE d.status = $1
ORDER BY d.tld ASC, d.host ASC
LIMIT $2 OFFSET $3
`, status, limit, offset)
} else {
// Default: exclude 'skip' status domains
rows, err = c.db.Query(`
SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
FROM domains d
INNER JOIN (
SELECT source_host, COUNT(*) as feed_count
FROM feeds
WHERE item_count > 0
GROUP BY source_host
) f ON (d.host || '.' || d.tld) = f.source_host
WHERE d.status != 'skip'
ORDER BY d.tld ASC, d.host ASC
LIMIT $1 OFFSET $2
`, limit, offset)
}
} else if tldFilter != "" && search != "" && status != "" {
// Filter by TLD, status, and search
if searchQuery.ExactMatch {
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE tld = $1 AND status = $2 AND LOWER(host) = $3
ORDER BY host ASC
LIMIT $4 OFFSET $5
`, tldFilter, status, strings.ToLower(searchQuery.Pattern), limit, offset)
} else if searchQuery.DomainHost != "" && strings.ToLower(searchQuery.DomainTLD) == strings.ToLower(tldFilter) {
// Domain-like search with matching TLD - search for exact host
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE tld = $1 AND status = $2 AND LOWER(host) = $3
ORDER BY host ASC
LIMIT $4 OFFSET $5
`, tldFilter, status, strings.ToLower(searchQuery.DomainHost), limit, offset)
} else {
searchPattern := "%" + strings.ToLower(searchQuery.Pattern) + "%"
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE tld = $1 AND status = $2 AND LOWER(host) LIKE $3
ORDER BY host ASC
LIMIT $4 OFFSET $5
`, tldFilter, status, searchPattern, limit, offset)
}
} else if tldFilter != "" && search != "" {
// Filter by TLD and search
// If search looks like a domain with matching TLD, use DomainHost for exact/pattern match
if searchQuery.ExactMatch {
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE tld = $1 AND LOWER(host) = $2
ORDER BY host ASC
LIMIT $3 OFFSET $4
`, tldFilter, strings.ToLower(searchQuery.Pattern), limit, offset)
} else if searchQuery.DomainHost != "" && strings.ToLower(searchQuery.DomainTLD) == strings.ToLower(tldFilter) {
// Domain-like search with matching TLD - search for exact host or pattern
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE tld = $1 AND LOWER(host) = $2
ORDER BY host ASC
LIMIT $3 OFFSET $4
`, tldFilter, strings.ToLower(searchQuery.DomainHost), limit, offset)
} else {
searchPattern := "%" + strings.ToLower(searchQuery.Pattern) + "%"
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE tld = $1 AND LOWER(host) LIKE $2
ORDER BY host ASC
LIMIT $3 OFFSET $4
`, tldFilter, searchPattern, limit, offset)
}
} else if tldFilter != "" && status != "" {
// Filter by TLD and status
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE tld = $1 AND status = $2
ORDER BY host ASC
LIMIT $3 OFFSET $4
`, tldFilter, status, limit, offset)
} else if tldFilter != "" {
// Filter by TLD only (show all statuses)
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE tld = $1
ORDER BY host ASC
LIMIT $2 OFFSET $3
`, tldFilter, limit, offset)
} else if status != "" {
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE status = $1
ORDER BY tld ASC, host ASC
LIMIT $2 OFFSET $3
`, status, limit, offset)
} else {
// Default: exclude 'skip' status domains
rows, err = c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE status != 'skip'
ORDER BY tld ASC, host ASC
LIMIT $1 OFFSET $2
`, limit, offset)
}
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
defer rows.Close()
// JSON shapes for the response; FeedInfo rows are nested under each domain.
type FeedInfo struct {
URL string `json:"url"`
Title string `json:"title,omitempty"`
Type string `json:"type,omitempty"`
Status string `json:"status,omitempty"`
PublishStatus string `json:"publish_status,omitempty"`
Language string `json:"language,omitempty"`
ItemCount int `json:"item_count,omitempty"`
}
type DomainInfo struct {
Host string `json:"host"`
TLD string `json:"tld"`
Status string `json:"status"`
LastError string `json:"last_error,omitempty"`
FeedCount int `json:"feed_count"`
Feeds []FeedInfo `json:"feeds,omitempty"`
}
var domains []DomainInfo
var hosts []string
// Collect the page of domains; scan failures skip the row rather than abort.
for rows.Next() {
var d DomainInfo
var tld, lastError *string
if err := rows.Scan(&d.Host, &tld, &d.Status, &lastError, &d.FeedCount); err != nil {
continue
}
d.TLD = StringValue(tld)
d.LastError = StringValue(lastError)
domains = append(domains, d)
// Build full domain for feed lookup (source_host = host.tld)
fullDomain := d.Host
if d.TLD != "" {
fullDomain = d.Host + "." + d.TLD
}
hosts = append(hosts, fullDomain)
}
// Now get feeds for these domains (with actual item count from items table)
// Apply the same feed filters used for domain selection
if len(hosts) > 0 {
feedQuery := `
SELECT f.source_host, f.url, f.title, f.type, f.status, f.publish_status, f.language,
(SELECT COUNT(*) FROM items WHERE feed_url = f.url) as item_count
FROM feeds f
WHERE f.source_host = ANY($1)`
feedArgs := []interface{}{hosts}
feedArgNum := 2
// Apply feed status filters (publish_status for pass/skip/hold/dead)
if len(statusList) > 0 {
if feedMode == "exclude" {
feedQuery += fmt.Sprintf(" AND (f.publish_status IS NULL OR f.publish_status NOT IN (SELECT unnest($%d::text[])))", feedArgNum)
} else {
feedQuery += fmt.Sprintf(" AND f.publish_status IN (SELECT unnest($%d::text[]))", feedArgNum)
}
feedArgs = append(feedArgs, statusList)
feedArgNum++
}
// Apply feed type filters (including special "empty" type)
if len(typeList) > 0 {
hasEmpty := false
var regularTypes []string
for _, t := range typeList {
if t == "empty" {
hasEmpty = true
} else {
regularTypes = append(regularTypes, t)
}
}
if feedMode == "exclude" {
if len(regularTypes) > 0 && hasEmpty {
feedQuery += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[]))) AND f.item_count > 0", feedArgNum)
feedArgs = append(feedArgs, regularTypes)
feedArgNum++
} else if len(regularTypes) > 0 {
feedQuery += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[])))", feedArgNum)
feedArgs = append(feedArgs, regularTypes)
feedArgNum++
} else if hasEmpty {
feedQuery += " AND f.item_count > 0"
}
} else {
if len(regularTypes) > 0 && hasEmpty {
feedQuery += fmt.Sprintf(" AND (f.type IN (SELECT unnest($%d::text[])) OR f.item_count IS NULL OR f.item_count = 0)", feedArgNum)
feedArgs = append(feedArgs, regularTypes)
feedArgNum++
} else if len(regularTypes) > 0 {
feedQuery += fmt.Sprintf(" AND f.type IN (SELECT unnest($%d::text[]))", feedArgNum)
feedArgs = append(feedArgs, regularTypes)
feedArgNum++
} else if hasEmpty {
feedQuery += " AND (f.item_count IS NULL OR f.item_count = 0)"
}
}
}
feedQuery += " ORDER BY f.source_host, f.url"
// Feed lookup is best-effort: on error the domains are returned without feeds.
feedRows, err := c.db.Query(feedQuery, feedArgs...)
if err == nil {
defer feedRows.Close()
feedsByHost := make(map[string][]FeedInfo)
for feedRows.Next() {
var host string
var f FeedInfo
var title, feedType, status, publishStatus, language *string
var itemCount *int
if err := feedRows.Scan(&host, &f.URL, &title, &feedType, &status, &publishStatus, &language, &itemCount); err != nil {
continue
}
f.Title = StringValue(title)
f.Type = StringValue(feedType)
f.Status = StringValue(status)
f.PublishStatus = StringValue(publishStatus)
f.Language = StringValue(language)
if itemCount != nil {
f.ItemCount = *itemCount
}
feedsByHost[host] = append(feedsByHost[host], f)
}
// Attach feeds to domains (feedsByHost is keyed by full domain)
for i := range domains {
fullHost := domains[i].Host
if domains[i].TLD != "" {
fullHost = domains[i].Host + "." + domains[i].TLD
}
if feeds, ok := feedsByHost[fullHost]; ok {
domains[i].Feeds = feeds
}
}
}
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(domains)
}
// handleAPIDomainsByStatus lists domains that have a given status (required),
// ordered by TLD then host, with limit/offset pagination (limit capped at 500).
func (c *Crawler) handleAPIDomainsByStatus(w http.ResponseWriter, r *http.Request) {
	q := r.URL.Query()
	status := q.Get("status")
	if status == "" {
		http.Error(w, "status parameter required", http.StatusBadRequest)
		return
	}
	limit, offset := 100, 0
	if v := q.Get("limit"); v != "" {
		fmt.Sscanf(v, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if v := q.Get("offset"); v != "" {
		fmt.Sscanf(v, "%d", &offset)
	}
	rows, err := c.db.Query(`
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE status = $1
ORDER BY tld ASC, host ASC
LIMIT $2 OFFSET $3
`, status, limit, offset)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()
	// JSON shape of one result row.
	type DomainInfo struct {
		Host      string `json:"host"`
		TLD       string `json:"tld"`
		Status    string `json:"status"`
		LastError string `json:"last_error,omitempty"`
		FeedCount int    `json:"feed_count"`
	}
	var list []DomainInfo
	for rows.Next() {
		var d DomainInfo
		var tldVal, errVal *string
		// Skip rows that fail to scan instead of aborting the response.
		if err := rows.Scan(&d.Host, &tldVal, &d.Status, &errVal, &d.FeedCount); err != nil {
			continue
		}
		d.TLD = StringValue(tldVal)
		d.LastError = StringValue(errVal)
		list = append(list, d)
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(list)
}
// handleAPIDomainFeeds returns every feed recorded for one source host
// (required), ordered by URL, with limit/offset pagination (limit capped at 500).
func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) {
	q := r.URL.Query()
	host := q.Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	limit, offset := 100, 0
	if v := q.Get("limit"); v != "" {
		fmt.Sscanf(v, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if v := q.Get("offset"); v != "" {
		fmt.Sscanf(v, "%d", &offset)
	}
	rows, err := c.db.Query(`
SELECT url, title, type, status, last_error, item_count, publish_status, language
FROM feeds
WHERE source_host = $1
ORDER BY url ASC
LIMIT $2 OFFSET $3
`, host, limit, offset)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()
	// JSON shape of one feed row.
	type FeedInfo struct {
		URL           string `json:"url"`
		Title         string `json:"title"`
		Type          string `json:"type"`
		Status        string `json:"status,omitempty"`
		LastError     string `json:"last_error,omitempty"`
		ItemCount     int    `json:"item_count,omitempty"`
		PublishStatus string `json:"publish_status,omitempty"`
		Language      string `json:"language,omitempty"`
	}
	var feeds []FeedInfo
	for rows.Next() {
		var f FeedInfo
		var title, status, lastErr, pubStatus, lang *string
		var count *int
		// Skip unreadable rows rather than failing the whole response.
		if err := rows.Scan(&f.URL, &title, &f.Type, &status, &lastErr, &count, &pubStatus, &lang); err != nil {
			continue
		}
		f.Title = StringValue(title)
		f.Status = StringValue(status)
		f.LastError = StringValue(lastErr)
		f.PublishStatus = StringValue(pubStatus)
		f.Language = StringValue(lang)
		if count != nil {
			f.ItemCount = *count
		}
		feeds = append(feeds, f)
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(feeds)
}
// handleAPISetDomainStatus sets the status for a domain
// status must be 'hold', 'pass', or 'skip' (use /api/dropDomain for 'drop')
func (c *Crawler) handleAPISetDomainStatus(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	status := r.URL.Query().Get("status")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	switch status {
	case "hold", "pass", "skip":
		// accepted values
	default:
		http.Error(w, "status must be 'hold', 'pass', or 'skip' (use /api/dropDomain for permanent deletion)", http.StatusBadRequest)
		return
	}
	host = normalizeHost(host)
	// 'skip' is a takedown: content is hidden but the stored data is kept.
	if status == "skip" {
		result := c.skipDomain(host)
		if result.Error != "" {
			http.Error(w, result.Error, http.StatusInternalServerError)
			return
		}
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(result)
		return
	}
	strippedHost := stripTLD(host)
	tld := getTLD(host)
	// Moving to 'pass' additionally clears any stale last_error.
	query := `
UPDATE domains SET status = $1
WHERE host = $2 AND tld = $3
`
	if status == "pass" {
		query = `
UPDATE domains SET status = $1, last_error = NULL
WHERE host = $2 AND tld = $3
`
	}
	if _, err := c.db.Exec(query, status, strippedHost, tld); err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]string{
		"host":   host,
		"status": status,
	})
}
// handleAPIRevisitDomain re-queues a domain for crawling by resetting its
// status to 'pass', zeroing its crawl timestamp, and clearing any last error.
func (c *Crawler) handleAPIRevisitDomain(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	if _, err := c.db.Exec(`
UPDATE domains SET status = 'pass', crawled_at = '0001-01-01 00:00:00', last_error = NULL
WHERE host = $1 AND tld = $2
`, stripTLD(host), getTLD(host)); err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]string{"status": "queued", "host": host})
}
// handleAPIPriorityCrawl crawls a single host right away, inserting the
// domain row first if it is not already known, and returns the feeds found.
func (c *Crawler) handleAPIPriorityCrawl(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	host = normalizeHost(host)
	// Upsert the domain row and reset it so the crawler treats it as fresh.
	if _, err := c.db.Exec(`
INSERT INTO domains (host, status, tld)
VALUES ($1, 'pass', $2)
ON CONFLICT(host, tld) DO UPDATE SET status = 'pass', crawled_at = '0001-01-01 00:00:00', last_error = NULL
`, stripTLD(host), getTLD(host)); err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	// Crawl synchronously so the response can report what was found.
	fmt.Printf("Priority crawl: %s\n", host)
	found, crawlErr := c.feedCrawl(host)
	msg := ""
	if crawlErr != nil {
		msg = crawlErr.Error()
	}
	// Record the crawl outcome either way.
	c.markDomainCrawled(stripTLD(host), getTLD(host), found, msg)
	// Best-effort feed listing; an error here just yields an empty list.
	feeds, _ := c.GetFeedsByHost(host)
	type FeedSummary struct {
		URL      string `json:"url"`
		Title    string `json:"title"`
		Type     string `json:"type"`
		Category string `json:"category"`
		Status   string `json:"status"`
	}
	var summaries []FeedSummary
	for _, f := range feeds {
		summaries = append(summaries, FeedSummary{
			URL:      f.URL,
			Title:    f.Title,
			Type:     f.Type,
			Category: f.Category,
			Status:   f.Status,
		})
	}
	resp := map[string]interface{}{
		"host":        host,
		"feeds_found": found,
		"feeds":       summaries,
	}
	if crawlErr != nil {
		resp["error"] = crawlErr.Error()
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(resp)
}
// handleAPIFilter handles flexible filtering with stackable parameters and
// dispatches to either the feed-level or the domain-level view.
func (c *Crawler) handleAPIFilter(w http.ResponseWriter, r *http.Request) {
	q := r.URL.Query()
	tld := q.Get("tld")
	domain := q.Get("domain")
	feedStatus := q.Get("feedStatus")
	domainStatus := q.Get("domainStatus")
	languages := q.Get("languages") // comma-separated list
	show := q.Get("show")           // "feeds" or "domains"
	sort := q.Get("sort")           // "alpha" or "feeds"
	limit, offset := 100, 0
	if v := q.Get("limit"); v != "" {
		fmt.Sscanf(v, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if v := q.Get("offset"); v != "" {
		fmt.Sscanf(v, "%d", &offset)
	}
	// Split the language list, dropping blank entries.
	var langList []string
	if languages != "" {
		for _, lang := range strings.Split(languages, ",") {
			if lang = strings.TrimSpace(lang); lang != "" {
				langList = append(langList, lang)
			}
		}
	}
	// Feed-specific filters imply the feed view; otherwise default to domains.
	if show == "" {
		show = "domains"
		if feedStatus != "" || domain != "" || len(langList) > 0 {
			show = "feeds"
		}
	}
	if show == "feeds" {
		c.filterFeeds(w, tld, domain, feedStatus, langList, limit, offset)
		return
	}
	c.filterDomains(w, tld, domainStatus, sort, limit, offset)
}
// filterDomains writes the domain-level result set for handleAPIFilter,
// optionally narrowed by TLD and status, sorted by feed count ("feeds")
// or alphabetically (anything else).
func (c *Crawler) filterDomains(w http.ResponseWriter, tld, status, sort string, limit, offset int) {
	var args []interface{}
	n := 1
	query := `
SELECT host, tld, status, last_error, feeds_found
FROM domains
WHERE 1=1`
	if tld != "" {
		query += fmt.Sprintf(" AND tld = $%d", n)
		args = append(args, tld)
		n++
	}
	if status != "" {
		query += fmt.Sprintf(" AND status = $%d", n)
		args = append(args, status)
		n++
	}
	// "feeds" sorts busiest domains first; anything else is alphabetical.
	orderFmt := " ORDER BY tld ASC, host ASC LIMIT $%d OFFSET $%d"
	if sort == "feeds" {
		orderFmt = " ORDER BY feeds_found DESC, host ASC LIMIT $%d OFFSET $%d"
	}
	query += fmt.Sprintf(orderFmt, n, n+1)
	args = append(args, limit, offset)
	rows, err := c.db.Query(query, args...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()
	// JSON shape of one result row.
	type DomainInfo struct {
		Host      string `json:"host"`
		TLD       string `json:"tld"`
		Status    string `json:"status"`
		LastError string `json:"last_error,omitempty"`
		FeedCount int    `json:"feed_count"`
	}
	var list []DomainInfo
	for rows.Next() {
		var d DomainInfo
		var tldVal, errVal *string
		if err := rows.Scan(&d.Host, &tldVal, &d.Status, &errVal, &d.FeedCount); err != nil {
			continue
		}
		d.TLD = StringValue(tldVal)
		d.LastError = StringValue(errVal)
		list = append(list, d)
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"type": "domains",
		"data": list,
	})
}
// handleAPITLDDomains lists the domains under a single TLD as a JSON array.
// Requires a "tld" query parameter; supports limit/offset paging with the
// limit clamped to [1, 500] and offset to >= 0.
func (c *Crawler) handleAPITLDDomains(w http.ResponseWriter, r *http.Request) {
	tld := r.URL.Query().Get("tld")
	if tld == "" {
		http.Error(w, "tld parameter required", http.StatusBadRequest)
		return
	}
	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if limit < 1 {
		// Guard against zero/negative limits, which Postgres rejects.
		limit = 100
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}
	if offset < 0 {
		// A negative OFFSET is a Postgres error; clamp to the start.
		offset = 0
	}
	rows, err := c.db.Query(`
		SELECT host, status, last_error, feeds_found
		FROM domains
		WHERE tld = $1
		ORDER BY host ASC
		LIMIT $2 OFFSET $3
	`, tld, limit, offset)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()
	type DomainInfo struct {
		Host      string `json:"host"`
		Status    string `json:"status"`
		LastError string `json:"last_error,omitempty"`
		FeedCount int    `json:"feed_count"`
	}
	// Non-nil so an empty page encodes as [] rather than null.
	domains := []DomainInfo{}
	for rows.Next() {
		var d DomainInfo
		var lastError *string
		if err := rows.Scan(&d.Host, &d.Status, &lastError, &d.FeedCount); err != nil {
			continue // skip malformed rows rather than aborting the response
		}
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}
// handleAPITLDs returns a JSON array of {tld, domain_count} objects.
// Which query is run depends on the parameters present, in this priority:
//
//  1. feedStatuses/feedTypes set, or feedMode=exclude: count DISTINCT
//     source hosts in the feeds table matching the feed-level filters
//     (optionally narrowed further by "search")
//  2. search alone: delegate to buildTLDSearchQuery for prefix-typed search
//  3. status alone: count domains with that status, per TLD
//  4. no filters: every label of the tld_enum type, left-joined to counts,
//     so TLDs with zero domains still appear
func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
	status := r.URL.Query().Get("status") // domain status: pass, skip, hold, dead
	feedMode := r.URL.Query().Get("feedMode") // include or exclude
	feedStatuses := r.URL.Query().Get("feedStatuses") // comma-separated: pass,skip,hold,dead
	feedTypes := r.URL.Query().Get("feedTypes") // comma-separated: rss,atom,json,unknown,empty
	search := r.URL.Query().Get("search") // search query
	// Parse comma-separated values
	var statusList, typeList []string
	if feedStatuses != "" {
		statusList = strings.Split(feedStatuses, ",")
	}
	if feedTypes != "" {
		typeList = strings.Split(feedTypes, ",")
	}
	var rows pgx.Rows
	var err error
	// If feed filter is specified, query from feeds table instead
	if len(statusList) > 0 || len(typeList) > 0 || feedMode == "exclude" {
		// Build query to get TLDs from feeds.
		// argNum tracks the next positional placeholder ($1, $2, ...); it must
		// stay in lockstep with appends to args.
		query := `SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM feeds WHERE tld IS NOT NULL`
		args := []interface{}{}
		argNum := 1
		// Handle status filters (publish_status for pass/skip/hold/dead)
		if len(statusList) > 0 {
			if feedMode == "exclude" {
				// Exclude keeps NULL publish_status rows as well.
				query += fmt.Sprintf(" AND (publish_status IS NULL OR publish_status NOT IN (SELECT unnest($%d::text[])))", argNum)
			} else {
				query += fmt.Sprintf(" AND publish_status IN (SELECT unnest($%d::text[]))", argNum)
			}
			args = append(args, statusList)
			argNum++
		}
		// Handle type filters (including special "empty" type)
		// "empty" is not a real feeds.type value; it means item_count is 0/NULL,
		// so it is split off and expressed as an item_count predicate.
		if len(typeList) > 0 {
			hasEmpty := false
			var regularTypes []string
			for _, t := range typeList {
				if t == "empty" {
					hasEmpty = true
				} else {
					regularTypes = append(regularTypes, t)
				}
			}
			if feedMode == "exclude" {
				// Exclude mode: exclude these types
				if len(regularTypes) > 0 && hasEmpty {
					query += fmt.Sprintf(" AND type NOT IN (SELECT unnest($%d::text[])) AND item_count > 0", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if len(regularTypes) > 0 {
					query += fmt.Sprintf(" AND (type IS NULL OR type NOT IN (SELECT unnest($%d::text[])))", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if hasEmpty {
					query += " AND item_count > 0"
				}
			} else {
				// Include mode: include these types
				if len(regularTypes) > 0 && hasEmpty {
					query += fmt.Sprintf(" AND (type IN (SELECT unnest($%d::text[])) OR item_count IS NULL OR item_count = 0)", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if len(regularTypes) > 0 {
					query += fmt.Sprintf(" AND type IN (SELECT unnest($%d::text[]))", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if hasEmpty {
					query += " AND (item_count IS NULL OR item_count = 0)"
				}
			}
		}
		if search != "" {
			sq := parseSearchPrefix(search)
			searchPattern := "%" + strings.ToLower(sq.Pattern) + "%"
			// Only extract TLD for domain searches (d:npr.org -> exact match for npr.org)
			var tldFilter string
			var exactMatch bool
			hostSearchPattern := searchPattern
			if sq.Type == "domain" {
				hostPattern, detectedTLD := parseSearchTerm(sq.Pattern)
				if detectedTLD != "" {
					tldFilter = detectedTLD
					exactMatch = true
					hostSearchPattern = "%" + strings.ToLower(hostPattern) + "%"
				}
			}
			switch sq.Type {
			case "domain":
				// Search domain names
				if exactMatch && tldFilter != "" {
					// d:npr.org -> exact match (source_host = 'npr.org')
					query += fmt.Sprintf(" AND LOWER(source_host) = $%d", argNum)
					args = append(args, strings.ToLower(sq.Pattern))
				} else if tldFilter != "" {
					query += fmt.Sprintf(" AND tld = $%d AND LOWER(source_host) LIKE $%d", argNum, argNum+1)
					args = append(args, tldFilter, hostSearchPattern)
				} else {
					query += fmt.Sprintf(" AND LOWER(source_host) LIKE $%d", argNum)
					args = append(args, hostSearchPattern)
				}
			case "url":
				query += fmt.Sprintf(" AND LOWER(url) LIKE $%d", argNum)
				args = append(args, searchPattern)
			case "title":
				query += fmt.Sprintf(" AND LOWER(title) LIKE $%d", argNum)
				args = append(args, searchPattern)
			case "description":
				query += fmt.Sprintf(" AND LOWER(description) LIKE $%d", argNum)
				args = append(args, searchPattern)
			case "item":
				query += fmt.Sprintf(" AND EXISTS (SELECT 1 FROM items i WHERE i.feed_url = feeds.url AND LOWER(i.title) LIKE $%d)", argNum)
				args = append(args, searchPattern)
			default:
				// "all" - search domains and feeds (NOT items - use i: prefix for item search)
				// Also include exact domain match if pattern looks like a domain
				if sq.DomainHost != "" && sq.DomainTLD != "" {
					fullDomain := strings.ToLower(sq.DomainHost + "." + sq.DomainTLD)
					query += fmt.Sprintf(` AND (
						LOWER(source_host) LIKE $%d OR
						LOWER(url) LIKE $%d OR
						LOWER(title) LIKE $%d OR
						LOWER(description) LIKE $%d OR
						LOWER(source_host) = $%d
					)`, argNum, argNum, argNum, argNum, argNum+1)
					args = append(args, searchPattern, fullDomain)
				} else {
					query += fmt.Sprintf(` AND (
						LOWER(source_host) LIKE $%d OR
						LOWER(url) LIKE $%d OR
						LOWER(title) LIKE $%d OR
						LOWER(description) LIKE $%d
					)`, argNum, argNum, argNum, argNum)
					args = append(args, searchPattern)
				}
			}
		}
		query += " GROUP BY tld ORDER BY tld ASC"
		rows, err = c.db.Query(query, args...)
	} else if search != "" {
		// Parse search prefix for type-specific searching
		sq := parseSearchPrefix(search)
		// Use the helper to build the TLD search query
		query, args := buildTLDSearchQuery(sq)
		rows, err = c.db.Query(query, args...)
	} else if status != "" {
		// TLDs filtered by domain status
		rows, err = c.db.Query(`
			SELECT tld::text as tld, COUNT(*) as domain_count
			FROM domains
			WHERE tld IS NOT NULL AND status = $1
			GROUP BY tld
			HAVING COUNT(*) > 0
			ORDER BY tld ASC
		`, status)
	} else {
		// All TLDs from enum with domain counts
		rows, err = c.db.Query(`
			SELECT e.enumlabel as tld, COALESCE(d.cnt, 0) as domain_count
			FROM pg_enum e
			LEFT JOIN (
				SELECT tld::text as tld, COUNT(*) as cnt
				FROM domains
				GROUP BY tld
			) d ON e.enumlabel = d.tld
			WHERE e.enumtypid = 'tld_enum'::regtype
			ORDER BY e.enumlabel ASC
		`)
	}
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()
	type TLDInfo struct {
		TLD string `json:"tld"`
		DomainCount int `json:"domain_count"`
	}
	var tlds []TLDInfo
	for rows.Next() {
		var t TLDInfo
		if err := rows.Scan(&t.TLD, &t.DomainCount); err != nil {
			continue
		}
		tlds = append(tlds, t)
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(tlds)
}
// handleAPITLDStats returns aggregate domain and feed statistics for a single
// TLD as a JSON object: totals and per-status counts for domains and feeds,
// an empty-feed count, and per-type feed counts. An optional "search" query
// parameter narrows both the domain and feed counts using the same prefix
// syntax as the other search endpoints (d:, u:, t:, ...).
func (c *Crawler) handleAPITLDStats(w http.ResponseWriter, r *http.Request) {
	tld := r.URL.Query().Get("tld")
	if tld == "" {
		http.Error(w, "tld parameter required", http.StatusBadRequest)
		return
	}
	search := r.URL.Query().Get("search")
	stats := map[string]interface{}{
		"tld": tld,
	}
	// Build WHERE clause based on whether search is provided.
	// domainWhere/domainArgs apply to the domains table (split host + tld
	// columns); feedWhere/feedArgs apply to the feeds table (full source_host).
	var domainWhere, feedWhere string
	var domainArgs, feedArgs []interface{}
	if search != "" {
		// Parse search prefix for type-specific searching
		sq := parseSearchPrefix(search)
		searchPattern := "%" + strings.ToLower(sq.Pattern) + "%"
		// For domain searches, check for exact match
		if sq.Type == "domain" {
			hostPart, detectedTLD := parseSearchTerm(sq.Pattern)
			if detectedTLD != "" {
				// d:npr.org -> exact match for host "npr" in specified TLD.
				// The domains table stores the bare host, so it matches on
				// hostPart; feeds.source_host holds the full domain, so it
				// matches on the whole pattern ("npr.org").
				domainWhere = "tld = $1 AND lower(host) = $2"
				domainArgs = []interface{}{tld, strings.ToLower(hostPart)}
				feedWhere = "tld = $1 AND lower(source_host) = $2"
				feedArgs = []interface{}{tld, strings.ToLower(sq.Pattern)}
			} else {
				// d:npr -> pattern match in specified TLD
				domainWhere = "tld = $1 AND lower(host) LIKE $2"
				domainArgs = []interface{}{tld, searchPattern}
				feedWhere = "tld = $1 AND lower(source_host) LIKE $2"
				feedArgs = []interface{}{tld, searchPattern}
			}
		} else {
			// Other search types - pattern match
			domainWhere = "tld = $1 AND lower(host) LIKE $2"
			domainArgs = []interface{}{tld, searchPattern}
			feedWhere = "tld = $1 AND lower(source_host) LIKE $2"
			feedArgs = []interface{}{tld, searchPattern}
		}
		stats["search"] = search
	} else {
		// Filter by TLD only
		domainWhere = "tld = $1"
		domainArgs = []interface{}{tld}
		feedWhere = "tld = $1"
		feedArgs = []interface{}{tld}
	}
	// Domain stats by status
	var totalDomains, passDomains, skipDomains, holdDomains, deadDomains int
	err := c.db.QueryRow(`SELECT COUNT(*) FROM domains WHERE `+domainWhere, domainArgs...).Scan(&totalDomains)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	stats["total_domains"] = totalDomains
	rows, err := c.db.Query(`SELECT status, COUNT(*) FROM domains WHERE `+domainWhere+` GROUP BY status`, domainArgs...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	for rows.Next() {
		var status string
		var count int
		if err := rows.Scan(&status, &count); err != nil {
			continue
		}
		switch status {
		case "pass":
			passDomains = count
		case "skip":
			skipDomains = count
		case "hold":
			holdDomains = count
		case "dead":
			deadDomains = count
		}
	}
	rows.Close()
	stats["pass_domains"] = passDomains
	stats["skip_domains"] = skipDomains
	stats["hold_domains"] = holdDomains
	stats["dead_domains"] = deadDomains
	// Feed stats
	var totalFeeds, passFeeds, skipFeeds, holdFeeds, deadFeeds, emptyFeeds int
	var rssFeeds, atomFeeds, jsonFeeds, unknownFeeds int
	err = c.db.QueryRow(`SELECT COUNT(*) FROM feeds WHERE `+feedWhere, feedArgs...).Scan(&totalFeeds)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	stats["total_feeds"] = totalFeeds
	// Feed status counts.
	// NOTE(review): the label is COALESCE(status, 'hold') but the grouping is
	// by the raw status column, so a NULL-status group and a real 'hold' group
	// can both arrive labeled 'hold'; the assignment below then overwrites one
	// count with the other instead of summing. Confirm whether feeds.status
	// can be NULL here.
	statusRows, err := c.db.Query(`SELECT COALESCE(status, 'hold'), COUNT(*) FROM feeds WHERE `+feedWhere+` GROUP BY status`, feedArgs...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	for statusRows.Next() {
		var status string
		var count int
		if err := statusRows.Scan(&status, &count); err != nil {
			continue
		}
		switch status {
		case "pass":
			passFeeds = count
		case "skip":
			skipFeeds = count
		case "hold":
			holdFeeds = count
		case "dead":
			deadFeeds = count
		}
	}
	statusRows.Close()
	stats["pass_feeds"] = passFeeds
	stats["skip_feeds"] = skipFeeds
	stats["hold_feeds"] = holdFeeds
	stats["dead_feeds"] = deadFeeds
	// Empty feeds count (best-effort: on query error the count stays 0)
	c.db.QueryRow(`SELECT COUNT(*) FROM feeds WHERE (`+feedWhere+`) AND (item_count IS NULL OR item_count = 0)`, feedArgs...).Scan(&emptyFeeds)
	stats["empty_feeds"] = emptyFeeds
	// Feed type counts; anything that is not rss/atom/json (including NULL,
	// reported as 'unknown') is accumulated into unknownFeeds.
	typeRows, err := c.db.Query(`SELECT COALESCE(type, 'unknown'), COUNT(*) FROM feeds WHERE `+feedWhere+` GROUP BY type`, feedArgs...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	for typeRows.Next() {
		var feedType string
		var count int
		if err := typeRows.Scan(&feedType, &count); err != nil {
			continue
		}
		switch feedType {
		case "rss":
			rssFeeds = count
		case "atom":
			atomFeeds = count
		case "json":
			jsonFeeds = count
		default:
			unknownFeeds += count
		}
	}
	typeRows.Close()
	stats["rss_feeds"] = rssFeeds
	stats["atom_feeds"] = atomFeeds
	stats["json_feeds"] = jsonFeeds
	stats["unknown_feeds"] = unknownFeeds
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(stats)
}
// handleAPISearchStats returns aggregate domain and feed statistics for a
// search query (across all TLDs) as a JSON object. The "search" parameter is
// required and supports the same prefix syntax as the other endpoints:
// d: (domain), u: (url), t: (title), and so on; no prefix searches domains
// and feed url/title/description (items require the explicit i: prefix).
func (c *Crawler) handleAPISearchStats(w http.ResponseWriter, r *http.Request) {
	search := r.URL.Query().Get("search")
	if search == "" {
		http.Error(w, "search parameter required", http.StatusBadRequest)
		return
	}
	// Parse search prefix for type-specific searching
	sq := parseSearchPrefix(search)
	searchPattern := "%" + strings.ToLower(sq.Pattern) + "%"
	// Only extract TLD for domain searches (d:npr.org -> exact match for npr.org)
	var tldFilter, hostPart string
	var exactMatch bool
	if sq.Type == "domain" {
		hostPart, tldFilter = parseSearchTerm(sq.Pattern)
		if tldFilter != "" {
			// The domains table stores the bare host, so the LIKE pattern is
			// built from the host part only.
			searchPattern = "%" + strings.ToLower(hostPart) + "%"
			exactMatch = true
		}
	}
	stats := map[string]interface{}{}
	// Build WHERE clause based on search type.
	// domainWhere/domainArgs apply to the domains table (split host + tld);
	// feedWhere/feedArgs apply to the feeds table. For url/title/description/
	// item searches the domain side is an EXISTS over feeds joined on
	// source_host = host || '.' || tld.
	var domainWhere, feedWhere string
	var domainArgs, feedArgs []interface{}
	switch sq.Type {
	case "domain":
		if exactMatch && tldFilter != "" {
			// d:npr.org -> exact match
			domainWhere = "tld = $1 AND LOWER(host) = $2"
			domainArgs = []interface{}{tldFilter, strings.ToLower(hostPart)}
			feedWhere = "LOWER(source_host) = $1"
			feedArgs = []interface{}{strings.ToLower(sq.Pattern)}
		} else if tldFilter != "" {
			domainWhere = "tld = $1 AND LOWER(host) LIKE $2"
			domainArgs = []interface{}{tldFilter, searchPattern}
			feedWhere = "tld = $1 AND LOWER(source_host) LIKE $2"
			feedArgs = []interface{}{tldFilter, searchPattern}
		} else {
			domainWhere = "LOWER(host) LIKE $1"
			domainArgs = []interface{}{searchPattern}
			feedWhere = "LOWER(source_host) LIKE $1"
			feedArgs = []interface{}{searchPattern}
		}
	case "url":
		domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.url) LIKE $1)"
		domainArgs = []interface{}{searchPattern}
		feedWhere = "LOWER(url) LIKE $1"
		feedArgs = []interface{}{searchPattern}
	case "title":
		domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.title) LIKE $1)"
		domainArgs = []interface{}{searchPattern}
		feedWhere = "LOWER(title) LIKE $1"
		feedArgs = []interface{}{searchPattern}
	case "description":
		domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.description) LIKE $1)"
		domainArgs = []interface{}{searchPattern}
		feedWhere = "LOWER(description) LIKE $1"
		feedArgs = []interface{}{searchPattern}
	case "item":
		domainWhere = "EXISTS (SELECT 1 FROM feeds f INNER JOIN items i ON i.feed_url = f.url WHERE f.source_host = (host || '.' || tld) AND LOWER(i.title) LIKE $1)"
		domainArgs = []interface{}{searchPattern}
		feedWhere = "EXISTS (SELECT 1 FROM items i WHERE i.feed_url = url AND LOWER(i.title) LIKE $1)"
		feedArgs = []interface{}{searchPattern}
	default:
		// "all" - search domains and feeds (NOT items - use i: prefix for item search)
		// Also include exact domain match if pattern looks like a domain
		if sq.DomainHost != "" && sq.DomainTLD != "" {
			domainWhere = `(
				LOWER(host) LIKE $1 OR
				(LOWER(host) = $2 AND tld::text = $3) OR
				EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND (
					LOWER(f.url) LIKE $1 OR LOWER(f.title) LIKE $1 OR LOWER(f.description) LIKE $1
				))
			)`
			domainArgs = []interface{}{searchPattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)}
			fullDomain := strings.ToLower(sq.DomainHost + "." + sq.DomainTLD)
			feedWhere = `(
				LOWER(source_host) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1 OR LOWER(source_host) = $2
			)`
			feedArgs = []interface{}{searchPattern, fullDomain}
		} else {
			domainWhere = `(
				LOWER(host) LIKE $1 OR
				EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND (
					LOWER(f.url) LIKE $1 OR LOWER(f.title) LIKE $1 OR LOWER(f.description) LIKE $1
				))
			)`
			domainArgs = []interface{}{searchPattern}
			feedWhere = `(
				LOWER(source_host) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1
			)`
			feedArgs = []interface{}{searchPattern}
		}
	}
	// Count matching domains by status
	var totalDomains, passDomains, skipDomains, holdDomains, deadDomains int
	rows, err := c.db.Query(`SELECT status, COUNT(*) FROM domains WHERE `+domainWhere+` GROUP BY status`, domainArgs...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	for rows.Next() {
		var status string
		var count int
		if err := rows.Scan(&status, &count); err != nil {
			continue
		}
		totalDomains += count
		switch status {
		case "pass":
			passDomains = count
		case "skip":
			skipDomains = count
		case "hold":
			holdDomains = count
		case "dead":
			deadDomains = count
		}
	}
	rows.Close()
	stats["total_domains"] = totalDomains
	stats["pass_domains"] = passDomains
	stats["skip_domains"] = skipDomains
	stats["hold_domains"] = holdDomains
	stats["dead_domains"] = deadDomains
	// Count matching feeds by status.
	// NOTE(review): the label is COALESCE(status, 'hold') but the grouping is
	// by the raw status column, so a NULL-status group and a real 'hold' group
	// can both arrive labeled 'hold'; the assignment below then overwrites one
	// count with the other instead of summing. Confirm whether feeds.status
	// can be NULL here.
	var totalFeeds, passFeeds, skipFeeds, holdFeeds, deadFeeds, emptyFeeds int
	var rssFeeds, atomFeeds, jsonFeeds, unknownFeeds int
	statusRows, err := c.db.Query(`SELECT COALESCE(status, 'hold'), COUNT(*) FROM feeds WHERE `+feedWhere+` GROUP BY status`, feedArgs...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	for statusRows.Next() {
		var status string
		var count int
		if err := statusRows.Scan(&status, &count); err != nil {
			continue
		}
		totalFeeds += count
		switch status {
		case "pass":
			passFeeds = count
		case "skip":
			skipFeeds = count
		case "hold":
			holdFeeds = count
		case "dead":
			deadFeeds = count
		}
	}
	statusRows.Close()
	stats["total_feeds"] = totalFeeds
	stats["pass_feeds"] = passFeeds
	stats["skip_feeds"] = skipFeeds
	stats["hold_feeds"] = holdFeeds
	stats["dead_feeds"] = deadFeeds
	// Count empty feeds (best-effort: on query error the count stays 0)
	c.db.QueryRow(`SELECT COUNT(*) FROM feeds WHERE (`+feedWhere+`) AND (item_count IS NULL OR item_count = 0)`, feedArgs...).Scan(&emptyFeeds)
	stats["empty_feeds"] = emptyFeeds
	// Feed type counts; anything not rss/atom/json (including NULL, reported
	// as 'unknown') is accumulated into unknownFeeds.
	typeRows, err := c.db.Query(`SELECT COALESCE(type, 'unknown'), COUNT(*) FROM feeds WHERE `+feedWhere+` GROUP BY type`, feedArgs...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	for typeRows.Next() {
		var feedType string
		var count int
		if err := typeRows.Scan(&feedType, &count); err != nil {
			continue
		}
		switch feedType {
		case "rss":
			rssFeeds = count
		case "atom":
			atomFeeds = count
		case "json":
			jsonFeeds = count
		default:
			unknownFeeds += count
		}
	}
	typeRows.Close()
	stats["rss_feeds"] = rssFeeds
	stats["atom_feeds"] = atomFeeds
	stats["json_feeds"] = jsonFeeds
	stats["unknown_feeds"] = unknownFeeds
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(stats)
}
// handleAPIDenyDomain skips a domain (takedown accounts, preserve data).
// Requires a "host" query parameter and responds with the action result
// as JSON.
func (c *Crawler) handleAPIDenyDomain(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	res := c.skipDomain(host)
	if res.Error != "" {
		http.Error(w, res.Error, http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(res)
}
// DomainActionResult contains the results of a domain action
// (skip, drop, or restore) and is serialized to JSON for API responses.
type DomainActionResult struct {
	Success bool `json:"success"` // true when the action completed without a fatal error
	Host string `json:"host"` // the domain the action was applied to
	Action string `json:"action"` // "skip", "drop", or "restore"
	FeedsAffected int64 `json:"feeds_affected,omitempty"` // feed rows updated or deleted
	ItemsDeleted int64 `json:"items_deleted,omitempty"` // item rows deleted (drop only)
	AccountsAffected int `json:"accounts_affected,omitempty"` // PDS accounts successfully processed
	AccountErrors []string `json:"account_errors,omitempty"` // per-DID PDS failures ("did: error")
	Error string `json:"error,omitempty"` // fatal error message; empty on success
}
// getPDSCredentials resolves the PDS host and admin password, preferring the
// PDS_HOST / PDS_ADMIN_PASSWORD environment variables and falling back to a
// local pds.env file when either is missing. Values found in the file
// override what was read from the environment for those keys.
func getPDSCredentials() (pdsHost, pdsAdminPassword string) {
	pdsHost = os.Getenv("PDS_HOST")
	pdsAdminPassword = os.Getenv("PDS_ADMIN_PASSWORD")
	if pdsHost != "" && pdsAdminPassword != "" {
		return
	}
	file, err := os.Open("pds.env")
	if err != nil {
		// No fallback file; return whatever the environment provided.
		return
	}
	defer file.Close()
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		line := scanner.Text()
		switch {
		case strings.HasPrefix(line, "PDS_HOST="):
			pdsHost = strings.TrimPrefix(line, "PDS_HOST=")
		case strings.HasPrefix(line, "PDS_ADMIN_PASSWORD="):
			pdsAdminPassword = strings.TrimPrefix(line, "PDS_ADMIN_PASSWORD=")
		}
	}
	return
}
// getDomainDIDs returns all unique publish_account DIDs for a domain's feeds.
// Query failures and blank values are silently skipped; the result may be nil.
func (c *Crawler) getDomainDIDs(host string) []string {
	rows, err := c.db.Query(`
		SELECT DISTINCT publish_account FROM feeds
		WHERE source_host = $1 AND publish_account IS NOT NULL AND publish_account != ''
	`, host)
	if err != nil {
		return nil
	}
	defer rows.Close()
	var dids []string
	for rows.Next() {
		var did string
		if scanErr := rows.Scan(&did); scanErr != nil || did == "" {
			continue
		}
		dids = append(dids, did)
	}
	return dids
}
// skipDomain sets a domain to skip, takes down PDS accounts but preserves all data.
// Order matters: PDS accounts are taken down first (failures are recorded in
// AccountErrors but do not abort), then the domain's feeds are marked skipped,
// then the domain row itself. A database failure at either UPDATE stops the
// sequence and reports the error in the result.
func (c *Crawler) skipDomain(host string) DomainActionResult {
	result := DomainActionResult{Host: host, Action: "skip"}
	pdsHost, pdsAdminPassword := getPDSCredentials()
	dids := c.getDomainDIDs(host)
	// Takedown PDS accounts (hide content but preserve data).
	// Skipped entirely when PDS credentials are unavailable.
	if pdsHost != "" && pdsAdminPassword != "" && len(dids) > 0 {
		publisher := NewPublisher(pdsHost)
		for _, did := range dids {
			if err := publisher.TakedownAccount(pdsAdminPassword, did, "domain-skip"); err != nil {
				result.AccountErrors = append(result.AccountErrors, fmt.Sprintf("%s: %v", did, err))
			} else {
				result.AccountsAffected++
			}
		}
	}
	// Mark feeds as skipped (but don't delete)
	feedsAffected, err := c.db.Exec(`
		UPDATE feeds SET status = 'skip', publish_status = 'skip'
		WHERE source_host = $1
	`, host)
	if err != nil {
		result.Error = fmt.Sprintf("failed to update feeds: %v", err)
		return result
	}
	result.FeedsAffected = feedsAffected
	// Update domain status to skip; the domains table keys on (host, tld),
	// so the full hostname is split with stripTLD/getTLD.
	_, err = c.db.Exec(`UPDATE domains SET status = 'skip' WHERE host = $1 AND tld = $2`, stripTLD(host), getTLD(host))
	if err != nil {
		result.Error = fmt.Sprintf("failed to update domain status: %v", err)
		return result
	}
	result.Success = true
	return result
}
// handleAPIDropDomain permanently deletes all data for a skipped domain.
// The domain must already be in the "skip" state — dropping is destructive,
// so it is only allowed as a second, explicit step.
func (c *Crawler) handleAPIDropDomain(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	// Verify the domain exists and is currently skipped before destroying data.
	var status string
	if err := c.db.QueryRow(`SELECT status FROM domains WHERE host = $1 AND tld = $2`, stripTLD(host), getTLD(host)).Scan(&status); err != nil {
		http.Error(w, "domain not found", http.StatusNotFound)
		return
	}
	if status != "skip" {
		http.Error(w, "domain must be skipped before dropping", http.StatusBadRequest)
		return
	}
	res := c.dropDomain(host)
	if res.Error != "" {
		http.Error(w, res.Error, http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(res)
}
// dropDomain permanently deletes all data for a domain (feeds, items, PDS
// accounts). PDS deletion failures are recorded per-DID but do not abort;
// database failures stop the sequence and are reported in the result.
func (c *Crawler) dropDomain(host string) DomainActionResult {
	result := DomainActionResult{Host: host, Action: "drop"}
	pdsHost, pdsAdminPassword := getPDSCredentials()
	dids := c.getDomainDIDs(host)
	// Delete PDS accounts (skipped when credentials are unavailable)
	if pdsHost != "" && pdsAdminPassword != "" && len(dids) > 0 {
		publisher := NewPublisher(pdsHost)
		for _, did := range dids {
			if err := publisher.DeleteAccount(pdsAdminPassword, did); err != nil {
				result.AccountErrors = append(result.AccountErrors, fmt.Sprintf("%s: %v", did, err))
			} else {
				result.AccountsAffected++
			}
		}
	}
	// Get feed URLs for this domain (needed to delete items). If this query
	// fails, abort instead of deleting the feeds anyway — doing so would
	// leave the domain's item rows orphaned with no feed to find them by.
	var feedURLs []string
	feedRows, err := c.db.Query(`SELECT url FROM feeds WHERE source_host = $1`, host)
	if err != nil {
		result.Error = fmt.Sprintf("failed to list feeds: %v", err)
		return result
	}
	for feedRows.Next() {
		var url string
		if err := feedRows.Scan(&url); err == nil {
			feedURLs = append(feedURLs, url)
		}
	}
	// Close promptly: more queries run on this connection below.
	feedRows.Close()
	// Delete items for all feeds from this domain
	for _, feedURL := range feedURLs {
		deleted, err := c.db.Exec(`DELETE FROM items WHERE feed_url = $1`, feedURL)
		if err == nil {
			result.ItemsDeleted += deleted
		}
	}
	// Delete all feeds from this domain
	feedsDeleted, err := c.db.Exec(`DELETE FROM feeds WHERE source_host = $1`, host)
	if err != nil {
		result.Error = fmt.Sprintf("failed to delete feeds: %v", err)
		return result
	}
	result.FeedsAffected = feedsDeleted
	// Update domain status to drop; the domains table keys on (host, tld)
	_, err = c.db.Exec(`UPDATE domains SET status = 'drop' WHERE host = $1 AND tld = $2`, stripTLD(host), getTLD(host))
	if err != nil {
		result.Error = fmt.Sprintf("failed to update domain status: %v", err)
		return result
	}
	result.Success = true
	return result
}
// handleAPIUndenyDomain removes skip status from a domain (restores accounts).
// Only domains currently in the "skip" state can be restored.
func (c *Crawler) handleAPIUndenyDomain(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	// Verify the domain exists and is currently skipped.
	var status string
	if err := c.db.QueryRow(`SELECT status FROM domains WHERE host = $1 AND tld = $2`, stripTLD(host), getTLD(host)).Scan(&status); err != nil {
		http.Error(w, "domain not found", http.StatusNotFound)
		return
	}
	if status != "skip" {
		http.Error(w, "domain is not skipped", http.StatusBadRequest)
		return
	}
	res := c.restoreDomain(host)
	if res.Error != "" {
		http.Error(w, res.Error, http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(res)
}
// restoreDomain removes skip status and restores PDS accounts.
// PDS restore failures are recorded per-DID but do not abort; database
// failures stop the sequence and are reported in the result.
func (c *Crawler) restoreDomain(host string) DomainActionResult {
	res := DomainActionResult{Host: host, Action: "restore"}
	pds, adminPw := getPDSCredentials()
	accountDIDs := c.getDomainDIDs(host)
	// Lift the PDS takedown on every account that publishes for this domain
	// (skipped entirely when credentials are unavailable).
	if pds != "" && adminPw != "" && len(accountDIDs) > 0 {
		pub := NewPublisher(pds)
		for _, accountDID := range accountDIDs {
			err := pub.RestoreAccount(adminPw, accountDID)
			if err != nil {
				res.AccountErrors = append(res.AccountErrors, fmt.Sprintf("%s: %v", accountDID, err))
				continue
			}
			res.AccountsAffected++
		}
	}
	// Flip the domain's feeds back to publishable.
	affected, err := c.db.Exec(`
		UPDATE feeds SET status = 'pass', publish_status = 'pass'
		WHERE source_host = $1
	`, host)
	if err != nil {
		res.Error = fmt.Sprintf("failed to update feeds: %v", err)
		return res
	}
	res.FeedsAffected = affected
	// Clear the skip status (and any stale error) on the domain row itself.
	if _, err = c.db.Exec(`
		UPDATE domains SET status = 'pass', last_error = NULL
		WHERE host = $1 AND tld = $2
	`, stripTLD(host), getTLD(host)); err != nil {
		res.Error = fmt.Sprintf("failed to update domain status: %v", err)
		return res
	}
	res.Success = true
	return res
}