// When searching for patterns like "npr.org", the search also matches the
// exact domain (host=npr, tld=org) in addition to the text search across
// domain names, feed URLs, titles, and descriptions.
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"net/http"
	"os"
	"strings"

	"github.com/jackc/pgx/v5"
)

// buildTLDSearchQuery builds a query to get TLDs based on search type
// Returns (query, args) for the database query
func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
	pattern := "%" + strings.ToLower(sq.Pattern) + "%"

	switch sq.Type {
	case "domain":
		// Check if pattern includes TLD (e.g., d:npr.org -> exact match)
		hostPart, tldFilter := parseSearchTerm(sq.Pattern)
		if tldFilter != "" {
			// Exact match - return just the matching TLD
			return `
				SELECT tld::text as tld, COUNT(*) as domain_count
				FROM domains
				WHERE tld = $1 AND LOWER(host) = $2
				GROUP BY tld
				ORDER BY tld ASC
			`, []interface{}{tldFilter, strings.ToLower(hostPart)}
		}
		// Pattern match - search all TLDs
		return `
			SELECT tld::text as tld, COUNT(*) as domain_count
			FROM domains
			WHERE LOWER(host) LIKE $1
			GROUP BY tld
			ORDER BY tld ASC
		`, []interface{}{pattern}

	case "url":
		// Search feed URL paths (after domain)
		return `
			SELECT tld, COUNT(DISTINCT source_host) as domain_count
			FROM feeds
			WHERE tld IS NOT NULL AND LOWER(url) LIKE $1
			GROUP BY tld
			ORDER BY tld ASC
		`, []interface{}{pattern}

	case "title":
		// Search feed titles
		return `
			SELECT tld, COUNT(DISTINCT source_host) as domain_count
			FROM feeds
			WHERE tld IS NOT NULL AND LOWER(title) LIKE $1
			GROUP BY tld
			ORDER BY tld ASC
		`, []interface{}{pattern}

	case "description":
		// Search feed descriptions
		return `
			SELECT tld, COUNT(DISTINCT source_host) as domain_count
			FROM feeds
			WHERE tld IS NOT NULL AND LOWER(description) LIKE $1
			GROUP BY tld
			ORDER BY tld ASC
		`, []interface{}{pattern}

	case "item":
		// Search item titles
		return `
			SELECT f.tld, COUNT(DISTINCT f.source_host) as domain_count
			FROM feeds f
			INNER JOIN items i ON i.feed_url = f.url
			WHERE f.tld IS NOT NULL AND LOWER(i.title) LIKE $1
			GROUP BY f.tld
			ORDER BY f.tld ASC
		`, []interface{}{pattern}

	default:
		// "all" - search domains and feeds (NOT items - use i: prefix for item search)
		// Also include exact domain match if pattern looks like a domain
		if sq.DomainHost != "" && sq.DomainTLD != "" {
			return `
				SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM (
					-- Domains matching host pattern
					SELECT tld::text as tld, host || '.' || tld as source_host
					FROM domains WHERE LOWER(host) LIKE $1
					UNION
					-- Exact domain match
					SELECT tld::text as tld, host || '.' || tld as source_host
					FROM domains WHERE LOWER(host) = $2 AND LOWER(tld) = $3
					UNION
					-- Feeds matching URL
					SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(url) LIKE $1
					UNION
					-- Feeds matching title
					SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(title) LIKE $1
					UNION
					-- Feeds matching description
					SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(description) LIKE $1
				) combined
				GROUP BY tld
				ORDER BY tld ASC
			`, []interface{}{pattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)}
		}
		return `
			SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM (
				-- Domains matching host
				SELECT tld::text as tld, host || '.' || tld as source_host
				FROM domains WHERE LOWER(host) LIKE $1
				UNION
				-- Feeds matching URL
				SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(url) LIKE $1
				UNION
				-- Feeds matching title
				SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(title) LIKE $1
				UNION
				-- Feeds matching description
				SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(description) LIKE $1
			) combined
			GROUP BY tld
			ORDER BY tld ASC
		`, []interface{}{pattern}
	}
}
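
// exampleTLDSearch is a minimal usage sketch (not called anywhere): it shows
// how the (query, args) pair from buildTLDSearchQuery feeds into c.db.Query
// and how the two selected columns are scanned. The "d:npr.org" input is an
// illustrative assumption.
func (c *Crawler) exampleTLDSearch() error {
	query, args := buildTLDSearchQuery(parseSearchPrefix("d:npr.org"))
	rows, err := c.db.Query(query, args...)
	if err != nil {
		return err
	}
	defer rows.Close()
	for rows.Next() {
		var tld string
		var domainCount int
		if err := rows.Scan(&tld, &domainCount); err != nil {
			return err
		}
		fmt.Printf("%s: %d domains\n", tld, domainCount)
	}
	return rows.Err()
}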

// buildDomainSearchQuery builds a query to get domains based on search type
// Returns (whereClause, args, argNum) to append to the base query
func buildDomainSearchQuery(sq SearchQuery, tldFilter string, argNum int) (string, []interface{}, int) {
	pattern := "%" + strings.ToLower(sq.Pattern) + "%"
	var where string
	var args []interface{}

	switch sq.Type {
	case "domain":
		if sq.ExactMatch && tldFilter != "" {
			// d:npr.org -> exact match
			where = fmt.Sprintf(" AND d.tld = $%d AND LOWER(d.host) = $%d", argNum, argNum+1)
			args = []interface{}{tldFilter, strings.ToLower(sq.Pattern)}
			argNum += 2
		} else if tldFilter != "" {
			where = fmt.Sprintf(" AND d.tld = $%d AND LOWER(d.host) LIKE $%d", argNum, argNum+1)
			args = []interface{}{tldFilter, pattern}
			argNum += 2
		} else {
			where = fmt.Sprintf(" AND LOWER(d.host) LIKE $%d", argNum)
			args = []interface{}{pattern}
			argNum++
		}

	case "url":
		where = fmt.Sprintf(" AND LOWER(f.url) LIKE $%d", argNum)
		args = []interface{}{pattern}
		argNum++
		if tldFilter != "" {
			where += fmt.Sprintf(" AND d.tld = $%d", argNum)
			args = append(args, tldFilter)
			argNum++
		}

	case "title":
		where = fmt.Sprintf(" AND LOWER(f.title) LIKE $%d", argNum)
		args = []interface{}{pattern}
		argNum++
		if tldFilter != "" {
			where += fmt.Sprintf(" AND d.tld = $%d", argNum)
			args = append(args, tldFilter)
			argNum++
		}

	case "description":
		where = fmt.Sprintf(" AND LOWER(f.description) LIKE $%d", argNum)
		args = []interface{}{pattern}
		argNum++
		if tldFilter != "" {
			where += fmt.Sprintf(" AND d.tld = $%d", argNum)
			args = append(args, tldFilter)
			argNum++
		}

	case "item":
		// Need to join items - handled via EXISTS
		where = fmt.Sprintf(" AND EXISTS (SELECT 1 FROM items i WHERE i.feed_url = f.url AND LOWER(i.title) LIKE $%d)", argNum)
		args = []interface{}{pattern}
		argNum++
		if tldFilter != "" {
			where += fmt.Sprintf(" AND d.tld = $%d", argNum)
			args = append(args, tldFilter)
			argNum++
		}

	default:
		// "all" - search everything, also include exact domain match if pattern looks like a domain
		if tldFilter != "" {
			if sq.DomainHost != "" && sq.DomainTLD != "" {
				where = fmt.Sprintf(` AND d.tld = $%d AND (
					LOWER(d.host) LIKE $%d OR
					LOWER(f.url) LIKE $%d OR
					LOWER(f.title) LIKE $%d OR
					LOWER(f.description) LIKE $%d OR
					(LOWER(d.host) = $%d AND LOWER(d.tld) = $%d)
				)`, argNum, argNum+1, argNum+1, argNum+1, argNum+1, argNum+2, argNum+3)
				args = []interface{}{tldFilter, pattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)}
				argNum += 4
			} else {
				where = fmt.Sprintf(` AND d.tld = $%d AND (
					LOWER(d.host) LIKE $%d OR
					LOWER(f.url) LIKE $%d OR
					LOWER(f.title) LIKE $%d OR
					LOWER(f.description) LIKE $%d
				)`, argNum, argNum+1, argNum+1, argNum+1, argNum+1)
				args = []interface{}{tldFilter, pattern}
				argNum += 2
			}
		} else {
			if sq.DomainHost != "" && sq.DomainTLD != "" {
				where = fmt.Sprintf(` AND (
					LOWER(d.host) LIKE $%d OR
					LOWER(f.url) LIKE $%d OR
					LOWER(f.title) LIKE $%d OR
					LOWER(f.description) LIKE $%d OR
					(LOWER(d.host) = $%d AND LOWER(d.tld) = $%d)
				)`, argNum, argNum, argNum, argNum, argNum+1, argNum+2)
				args = []interface{}{pattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)}
				argNum += 3
			} else {
				where = fmt.Sprintf(` AND (
					LOWER(d.host) LIKE $%d OR
					LOWER(f.url) LIKE $%d OR
					LOWER(f.title) LIKE $%d OR
					LOWER(f.description) LIKE $%d
				)`, argNum, argNum, argNum, argNum)
				args = []interface{}{pattern}
				argNum++
			}
		}
	}

	return where, args, argNum
}
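
// Minimal sketch of how the returned (where, args, argNum) triple threads
// positional parameters into a larger query. The base query and the "org"
// tldFilter value are illustrative assumptions:
//
//	query := `SELECT DISTINCT d.host FROM domains d JOIN feeds f ON f.source_host = (d.host || '.' || d.tld) WHERE 1=1`
//	var args []interface{}
//	where, extra, next := buildDomainSearchQuery(sq, "org", 1) // placeholders start at $1
//	query += where
//	args = append(args, extra...)
//	query += fmt.Sprintf(" LIMIT $%d", next)
//	args = append(args, 100)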

func (c *Crawler) handleAPIAllDomains(w http.ResponseWriter, r *http.Request) {
	offset := 0
	limit := 100
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 100 {
			limit = 100
		}
	}
	// Guard against negative paging values, which would panic when slicing
	// the cache below.
	if offset < 0 {
		offset = 0
	}
	if limit < 0 {
		limit = 0
	}

	// Serve from cache (updated once per minute in background)
	c.statsMu.RLock()
	cached := c.cachedAllDomains
	c.statsMu.RUnlock()

	var domains []DomainStat
	if cached != nil && offset < len(cached) {
		end := offset + limit
		if end > len(cached) {
			end = len(cached)
		}
		domains = cached[offset:end]
	}
	if domains == nil {
		domains = []DomainStat{}
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}

// handleAPIDomains lists domains with optional status filter, including their feeds
func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
	status := r.URL.Query().Get("status")
	hasFeeds := r.URL.Query().Get("has_feeds") == "true"
	search := r.URL.Query().Get("search")
	tldFilter := r.URL.Query().Get("tld")
	feedMode := r.URL.Query().Get("feedMode")         // include or exclude
	feedStatuses := r.URL.Query().Get("feedStatuses") // comma-separated
	feedTypes := r.URL.Query().Get("feedTypes")       // comma-separated
	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}

	// Parse comma-separated values
	var statusList, typeList []string
	if feedStatuses != "" {
		statusList = strings.Split(feedStatuses, ",")
	}
	if feedTypes != "" {
		typeList = strings.Split(feedTypes, ",")
	}

	// Parse search prefix for type-specific searching
	var searchQuery SearchQuery
	if search != "" {
		searchQuery = parseSearchPrefix(search)
		// Only extract TLD for domain searches (d:npr.org -> exact match for npr.org)
		// All other searches use the literal pattern
		if searchQuery.Type == "domain" {
			hostPart, detectedTLD := parseSearchTerm(searchQuery.Pattern)
			if detectedTLD != "" {
				searchQuery.Pattern = hostPart
				searchQuery.ExactMatch = true // d:npr.org matches exactly npr.org
				if tldFilter == "" {
					tldFilter = detectedTLD
				}
			}
		}
	}
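
	// Illustrative behavior of the two parsers above (derived from how they
	// are used in this file):
	//   parseSearchPrefix("d:npr.org") -> SearchQuery{Type: "domain", Pattern: "npr.org"}
	//   parseSearchTerm("npr.org")     -> ("npr", "org")
	// so "d:npr.org" becomes an exact host match for "npr" within the "org" TLD.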

	// First get domains
	var rows pgx.Rows
	var err error

	// If feed filter is specified, query domains that have matching feeds
	if len(statusList) > 0 || len(typeList) > 0 || feedMode != "" {
		// Build dynamic query to get domains with matching feeds
		query := `
			SELECT DISTINCT d.host, d.tld, d.status, d.last_error, d.feeds_found
			FROM domains d
			INNER JOIN feeds f ON f.source_host = (d.host || '.' || d.tld)
			WHERE 1=1`
		args := []interface{}{}
		argNum := 1

		if tldFilter != "" {
			query += fmt.Sprintf(" AND d.tld = $%d", argNum)
			args = append(args, tldFilter)
			argNum++
		}
		if status != "" {
			query += fmt.Sprintf(" AND d.status = $%d", argNum)
			args = append(args, status)
			argNum++
		}

		// Handle status filters (publish_status for pass/skip/hold/dead)
		if len(statusList) > 0 {
			if feedMode == "exclude" {
				query += fmt.Sprintf(" AND (f.publish_status IS NULL OR f.publish_status NOT IN (SELECT unnest($%d::text[])))", argNum)
			} else {
				query += fmt.Sprintf(" AND f.publish_status IN (SELECT unnest($%d::text[]))", argNum)
			}
			args = append(args, statusList)
			argNum++
		}

		// Handle type filters (including special "empty" type)
		if len(typeList) > 0 {
			hasEmpty := false
			var regularTypes []string
			for _, t := range typeList {
				if t == "empty" {
					hasEmpty = true
				} else {
					regularTypes = append(regularTypes, t)
				}
			}

			if feedMode == "exclude" {
				// Exclude mode
				if len(regularTypes) > 0 && hasEmpty {
					query += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[]))) AND f.item_count > 0", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if len(regularTypes) > 0 {
					query += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[])))", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if hasEmpty {
					query += " AND f.item_count > 0"
				}
			} else {
				// Include mode
				if len(regularTypes) > 0 && hasEmpty {
					query += fmt.Sprintf(" AND (f.type IN (SELECT unnest($%d::text[])) OR f.item_count IS NULL OR f.item_count = 0)", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if len(regularTypes) > 0 {
					query += fmt.Sprintf(" AND f.type IN (SELECT unnest($%d::text[]))", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if hasEmpty {
					query += " AND (f.item_count IS NULL OR f.item_count = 0)"
				}
			}
		}

		if search != "" && searchQuery.Pattern != "" {
			searchPattern := "%" + strings.ToLower(searchQuery.Pattern) + "%"
			switch searchQuery.Type {
			case "domain":
				if searchQuery.ExactMatch {
					// d:npr.org -> exact match for host "npr" (tld already filtered above)
					query += fmt.Sprintf(" AND LOWER(d.host) = $%d", argNum)
					args = append(args, strings.ToLower(searchQuery.Pattern))
				} else {
					// d:npr -> pattern match
					query += fmt.Sprintf(" AND LOWER(d.host) LIKE $%d", argNum)
					args = append(args, searchPattern)
				}
				argNum++
			case "url":
				query += fmt.Sprintf(" AND LOWER(f.url) LIKE $%d", argNum)
				args = append(args, searchPattern)
				argNum++
			case "title":
				query += fmt.Sprintf(" AND LOWER(f.title) LIKE $%d", argNum)
				args = append(args, searchPattern)
				argNum++
			case "description":
				query += fmt.Sprintf(" AND LOWER(f.description) LIKE $%d", argNum)
				args = append(args, searchPattern)
				argNum++
			case "item":
				query += fmt.Sprintf(" AND EXISTS (SELECT 1 FROM items i WHERE i.feed_url = f.url AND LOWER(i.title) LIKE $%d)", argNum)
				args = append(args, searchPattern)
				argNum++
			default:
				// "all" - search domains and feeds (NOT items - use i: prefix for item search)
				// Also include exact domain match if pattern looks like a domain
				if searchQuery.DomainHost != "" && searchQuery.DomainTLD != "" {
					query += fmt.Sprintf(` AND (
						LOWER(d.host) LIKE $%d OR
						LOWER(f.url) LIKE $%d OR
						LOWER(f.title) LIKE $%d OR
						LOWER(f.description) LIKE $%d OR
						(LOWER(d.host) = $%d AND LOWER(d.tld) = $%d)
					)`, argNum, argNum, argNum, argNum, argNum+1, argNum+2)
					args = append(args, searchPattern, strings.ToLower(searchQuery.DomainHost), strings.ToLower(searchQuery.DomainTLD))
					argNum += 3
				} else {
					query += fmt.Sprintf(` AND (
						LOWER(d.host) LIKE $%d OR
						LOWER(f.url) LIKE $%d OR
						LOWER(f.title) LIKE $%d OR
						LOWER(f.description) LIKE $%d
					)`, argNum, argNum, argNum, argNum)
					args = append(args, searchPattern)
					argNum++
				}
			}
		}
		query += fmt.Sprintf(" ORDER BY d.host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1)
		args = append(args, limit, offset)

		rows, err = c.db.Query(query, args...)
	} else if hasFeeds {
		// Only domains with feeds
		searchPattern := "%" + strings.ToLower(search) + "%"
		if tldFilter != "" && status != "" {
			// Filter by specific TLD and status
			rows, err = c.db.Query(`
				SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
				FROM domains d
				INNER JOIN (
					SELECT source_host, COUNT(*) as feed_count
					FROM feeds
					WHERE item_count > 0
					GROUP BY source_host
				) f ON (d.host || '.' || d.tld) = f.source_host
				WHERE d.tld = $1 AND d.status = $2
				ORDER BY d.host ASC
				LIMIT $3 OFFSET $4
			`, tldFilter, status, limit, offset)
		} else if tldFilter != "" {
			// Filter by specific TLD only (exclude 'skip' by default)
			rows, err = c.db.Query(`
				SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
				FROM domains d
				INNER JOIN (
					SELECT source_host, COUNT(*) as feed_count
					FROM feeds
					WHERE item_count > 0
					GROUP BY source_host
				) f ON (d.host || '.' || d.tld) = f.source_host
				WHERE d.status != 'skip' AND d.tld = $1
				ORDER BY d.host ASC
				LIMIT $2 OFFSET $3
			`, tldFilter, limit, offset)
		} else if search != "" {
			// Search in domain host only (uses trigram index)
			rows, err = c.db.Query(`
				SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
				FROM domains d
				INNER JOIN (
					SELECT source_host, COUNT(*) as feed_count
					FROM feeds
					WHERE item_count > 0
					GROUP BY source_host
				) f ON (d.host || '.' || d.tld) = f.source_host
				WHERE d.status != 'skip' AND LOWER(d.host) LIKE $1
				ORDER BY d.tld ASC, d.host ASC
				LIMIT $2 OFFSET $3
			`, searchPattern, limit, offset)
		} else if status != "" {
			rows, err = c.db.Query(`
				SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
				FROM domains d
				INNER JOIN (
					SELECT source_host, COUNT(*) as feed_count
					FROM feeds
					WHERE item_count > 0
					GROUP BY source_host
				) f ON (d.host || '.' || d.tld) = f.source_host
				WHERE d.status = $1
				ORDER BY d.tld ASC, d.host ASC
				LIMIT $2 OFFSET $3
			`, status, limit, offset)
		} else {
			// Default: exclude 'skip' status domains
			rows, err = c.db.Query(`
				SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
				FROM domains d
				INNER JOIN (
					SELECT source_host, COUNT(*) as feed_count
					FROM feeds
					WHERE item_count > 0
					GROUP BY source_host
				) f ON (d.host || '.' || d.tld) = f.source_host
				WHERE d.status != 'skip'
				ORDER BY d.tld ASC, d.host ASC
				LIMIT $1 OFFSET $2
			`, limit, offset)
		}
	} else if tldFilter != "" && search != "" && status != "" {
		// Filter by TLD, status, and search
		if searchQuery.ExactMatch {
			rows, err = c.db.Query(`
				SELECT host, tld, status, last_error, feeds_found
				FROM domains
				WHERE tld = $1 AND status = $2 AND LOWER(host) = $3
				ORDER BY host ASC
				LIMIT $4 OFFSET $5
			`, tldFilter, status, strings.ToLower(searchQuery.Pattern), limit, offset)
		} else {
			searchPattern := "%" + strings.ToLower(searchQuery.Pattern) + "%"
			rows, err = c.db.Query(`
				SELECT host, tld, status, last_error, feeds_found
				FROM domains
				WHERE tld = $1 AND status = $2 AND LOWER(host) LIKE $3
				ORDER BY host ASC
				LIMIT $4 OFFSET $5
			`, tldFilter, status, searchPattern, limit, offset)
		}
	} else if tldFilter != "" && search != "" {
		// Filter by TLD and search
		if searchQuery.ExactMatch {
			rows, err = c.db.Query(`
				SELECT host, tld, status, last_error, feeds_found
				FROM domains
				WHERE tld = $1 AND LOWER(host) = $2
				ORDER BY host ASC
				LIMIT $3 OFFSET $4
			`, tldFilter, strings.ToLower(searchQuery.Pattern), limit, offset)
		} else {
			searchPattern := "%" + strings.ToLower(searchQuery.Pattern) + "%"
			rows, err = c.db.Query(`
				SELECT host, tld, status, last_error, feeds_found
				FROM domains
				WHERE tld = $1 AND LOWER(host) LIKE $2
				ORDER BY host ASC
				LIMIT $3 OFFSET $4
			`, tldFilter, searchPattern, limit, offset)
		}
	} else if tldFilter != "" && status != "" {
		// Filter by TLD and status
		rows, err = c.db.Query(`
			SELECT host, tld, status, last_error, feeds_found
			FROM domains
			WHERE tld = $1 AND status = $2
			ORDER BY host ASC
			LIMIT $3 OFFSET $4
		`, tldFilter, status, limit, offset)
	} else if tldFilter != "" {
		// Filter by TLD only (show all statuses)
		rows, err = c.db.Query(`
			SELECT host, tld, status, last_error, feeds_found
			FROM domains
			WHERE tld = $1
			ORDER BY host ASC
			LIMIT $2 OFFSET $3
		`, tldFilter, limit, offset)
	} else if status != "" {
		rows, err = c.db.Query(`
			SELECT host, tld, status, last_error, feeds_found
			FROM domains
			WHERE status = $1
			ORDER BY tld ASC, host ASC
			LIMIT $2 OFFSET $3
		`, status, limit, offset)
	} else {
		// Default: exclude 'skip' status domains
		rows, err = c.db.Query(`
			SELECT host, tld, status, last_error, feeds_found
			FROM domains
			WHERE status != 'skip'
			ORDER BY tld ASC, host ASC
			LIMIT $1 OFFSET $2
		`, limit, offset)
	}
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type FeedInfo struct {
		URL           string `json:"url"`
		Title         string `json:"title,omitempty"`
		Type          string `json:"type,omitempty"`
		Status        string `json:"status,omitempty"`
		PublishStatus string `json:"publish_status,omitempty"`
		Language      string `json:"language,omitempty"`
		ItemCount     int    `json:"item_count,omitempty"`
	}

	type DomainInfo struct {
		Host      string     `json:"host"`
		TLD       string     `json:"tld"`
		Status    string     `json:"status"`
		LastError string     `json:"last_error,omitempty"`
		FeedCount int        `json:"feed_count"`
		Feeds     []FeedInfo `json:"feeds,omitempty"`
	}

	var domains []DomainInfo
	var hosts []string
	for rows.Next() {
		var d DomainInfo
		var tld, lastError *string
		if err := rows.Scan(&d.Host, &tld, &d.Status, &lastError, &d.FeedCount); err != nil {
			continue
		}
		d.TLD = StringValue(tld)
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
		// Build full domain for feed lookup (source_host = host.tld)
		fullDomain := d.Host
		if d.TLD != "" {
			fullDomain = d.Host + "." + d.TLD
		}
		hosts = append(hosts, fullDomain)
	}

	// Now get feeds for these domains (with actual item count from items table)
	// Apply the same feed filters used for domain selection
	if len(hosts) > 0 {
		feedQuery := `
			SELECT f.source_host, f.url, f.title, f.type, f.status, f.publish_status, f.language,
				(SELECT COUNT(*) FROM items WHERE feed_url = f.url) as item_count
			FROM feeds f
			WHERE f.source_host = ANY($1)`
		feedArgs := []interface{}{hosts}
		feedArgNum := 2

		// Apply feed status filters (publish_status for pass/skip/hold/dead)
		if len(statusList) > 0 {
			if feedMode == "exclude" {
				feedQuery += fmt.Sprintf(" AND (f.publish_status IS NULL OR f.publish_status NOT IN (SELECT unnest($%d::text[])))", feedArgNum)
			} else {
				feedQuery += fmt.Sprintf(" AND f.publish_status IN (SELECT unnest($%d::text[]))", feedArgNum)
			}
			feedArgs = append(feedArgs, statusList)
			feedArgNum++
		}

		// Apply feed type filters (including special "empty" type)
		if len(typeList) > 0 {
			hasEmpty := false
			var regularTypes []string
			for _, t := range typeList {
				if t == "empty" {
					hasEmpty = true
				} else {
					regularTypes = append(regularTypes, t)
				}
			}

			if feedMode == "exclude" {
				if len(regularTypes) > 0 && hasEmpty {
					feedQuery += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[]))) AND f.item_count > 0", feedArgNum)
					feedArgs = append(feedArgs, regularTypes)
					feedArgNum++
				} else if len(regularTypes) > 0 {
					feedQuery += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[])))", feedArgNum)
					feedArgs = append(feedArgs, regularTypes)
					feedArgNum++
				} else if hasEmpty {
					feedQuery += " AND f.item_count > 0"
				}
			} else {
				if len(regularTypes) > 0 && hasEmpty {
					feedQuery += fmt.Sprintf(" AND (f.type IN (SELECT unnest($%d::text[])) OR f.item_count IS NULL OR f.item_count = 0)", feedArgNum)
					feedArgs = append(feedArgs, regularTypes)
					feedArgNum++
				} else if len(regularTypes) > 0 {
					feedQuery += fmt.Sprintf(" AND f.type IN (SELECT unnest($%d::text[]))", feedArgNum)
					feedArgs = append(feedArgs, regularTypes)
					feedArgNum++
				} else if hasEmpty {
					feedQuery += " AND (f.item_count IS NULL OR f.item_count = 0)"
				}
			}
		}

		feedQuery += " ORDER BY f.source_host, f.url"

		feedRows, err := c.db.Query(feedQuery, feedArgs...)
		if err == nil {
			defer feedRows.Close()
			feedsByHost := make(map[string][]FeedInfo)
			for feedRows.Next() {
				var host string
				var f FeedInfo
				var title, feedType, status, publishStatus, language *string
				var itemCount *int
				if err := feedRows.Scan(&host, &f.URL, &title, &feedType, &status, &publishStatus, &language, &itemCount); err != nil {
					continue
				}
				f.Title = StringValue(title)
				f.Type = StringValue(feedType)
				f.Status = StringValue(status)
				f.PublishStatus = StringValue(publishStatus)
				f.Language = StringValue(language)
				if itemCount != nil {
					f.ItemCount = *itemCount
				}
				feedsByHost[host] = append(feedsByHost[host], f)
			}
			// Attach feeds to domains (feedsByHost is keyed by full domain)
			for i := range domains {
				fullHost := domains[i].Host
				if domains[i].TLD != "" {
					fullHost = domains[i].Host + "." + domains[i].TLD
				}
				if feeds, ok := feedsByHost[fullHost]; ok {
					domains[i].Feeds = feeds
				}
			}
		}
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}
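
// Illustrative request for the handler above (the /api/domains path is an
// assumption; routing is not shown in this file):
//
//	GET /api/domains?feedMode=include&feedStatuses=pass,hold&search=d:npr.org
//
// returns the matching domains, each with the feeds that passed the same
// publish_status filter attached.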

func (c *Crawler) handleAPIDomainsByStatus(w http.ResponseWriter, r *http.Request) {
	status := r.URL.Query().Get("status")
	if status == "" {
		http.Error(w, "status parameter required", http.StatusBadRequest)
		return
	}

	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}

	rows, err := c.db.Query(`
		SELECT host, tld, status, last_error, feeds_found
		FROM domains
		WHERE status = $1
		ORDER BY tld ASC, host ASC
		LIMIT $2 OFFSET $3
	`, status, limit, offset)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type DomainInfo struct {
		Host      string `json:"host"`
		TLD       string `json:"tld"`
		Status    string `json:"status"`
		LastError string `json:"last_error,omitempty"`
		FeedCount int    `json:"feed_count"`
	}

	var domains []DomainInfo
	for rows.Next() {
		var d DomainInfo
		var tld, lastError *string
		if err := rows.Scan(&d.Host, &tld, &d.Status, &lastError, &d.FeedCount); err != nil {
			continue
		}
		d.TLD = StringValue(tld)
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}

func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}

	rows, err := c.db.Query(`
		SELECT url, title, type, status, last_error, item_count, publish_status, language
		FROM feeds
		WHERE source_host = $1
		ORDER BY url ASC
		LIMIT $2 OFFSET $3
	`, host, limit, offset)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type FeedInfo struct {
		URL           string `json:"url"`
		Title         string `json:"title"`
		Type          string `json:"type"`
		Status        string `json:"status,omitempty"`
		LastError     string `json:"last_error,omitempty"`
		ItemCount     int    `json:"item_count,omitempty"`
		PublishStatus string `json:"publish_status,omitempty"`
		Language      string `json:"language,omitempty"`
	}

	var feeds []FeedInfo
	for rows.Next() {
		var f FeedInfo
		var title, status, lastError, publishStatus, language *string
		var itemCount *int
		if err := rows.Scan(&f.URL, &title, &f.Type, &status, &lastError, &itemCount, &publishStatus, &language); err != nil {
			continue
		}
		f.Title = StringValue(title)
		f.Status = StringValue(status)
		f.LastError = StringValue(lastError)
		f.PublishStatus = StringValue(publishStatus)
		f.Language = StringValue(language)
		if itemCount != nil {
			f.ItemCount = *itemCount
		}
		feeds = append(feeds, f)
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(feeds)
}

// handleAPISetDomainStatus sets the status for a domain
// status must be 'hold', 'pass', or 'skip' (use /api/dropDomain for 'drop')
func (c *Crawler) handleAPISetDomainStatus(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	status := r.URL.Query().Get("status")

	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	if status != "hold" && status != "pass" && status != "skip" {
		http.Error(w, "status must be 'hold', 'pass', or 'skip' (use /api/dropDomain for permanent deletion)", http.StatusBadRequest)
		return
	}

	host = normalizeHost(host)

	// Setting to 'skip' triggers takedown (hide content but preserve data)
	if status == "skip" {
		result := c.skipDomain(host)
		if result.Error != "" {
			http.Error(w, result.Error, http.StatusInternalServerError)
			return
		}
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(result)
		return
	}

	// When setting to pass, clear any last_error
	var err error
	strippedHost := stripTLD(host)
	tld := getTLD(host)
	if status == "pass" {
		_, err = c.db.Exec(`
			UPDATE domains SET status = $1, last_error = NULL
			WHERE host = $2 AND tld = $3
		`, status, strippedHost, tld)
	} else {
		_, err = c.db.Exec(`
			UPDATE domains SET status = $1
			WHERE host = $2 AND tld = $3
		`, status, strippedHost, tld)
	}

	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]string{
		"host":   host,
		"status": status,
	})
}

func (c *Crawler) handleAPIRevisitDomain(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	_, err := c.db.Exec(`
		UPDATE domains SET status = 'pass', crawled_at = '0001-01-01 00:00:00', last_error = NULL
		WHERE host = $1 AND tld = $2
	`, stripTLD(host), getTLD(host))
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]string{"status": "queued", "host": host})
}

// handleAPIPriorityCrawl immediately crawls a domain (adds it if not exists)
func (c *Crawler) handleAPIPriorityCrawl(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	host = normalizeHost(host)

	// Add domain if it doesn't exist, or reset to pass for crawling
	_, err := c.db.Exec(`
		INSERT INTO domains (host, status, tld)
		VALUES ($1, 'pass', $2)
		ON CONFLICT(host, tld) DO UPDATE SET status = 'pass', crawled_at = '0001-01-01 00:00:00', last_error = NULL
	`, stripTLD(host), getTLD(host))
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// Crawl synchronously
	fmt.Printf("Priority crawl: %s\n", host)
	feedsFound, crawlErr := c.feedCrawl(host)

	errStr := ""
	if crawlErr != nil {
		errStr = crawlErr.Error()
	}

	// Mark as crawled
	c.markDomainCrawled(stripTLD(host), getTLD(host), feedsFound, errStr)

	// Get the feeds we found
	feeds, _ := c.GetFeedsByHost(host)

	type FeedSummary struct {
		URL      string `json:"url"`
		Title    string `json:"title"`
		Type     string `json:"type"`
		Category string `json:"category"`
		Status   string `json:"status"`
	}
	var feedSummaries []FeedSummary
	for _, f := range feeds {
		feedSummaries = append(feedSummaries, FeedSummary{
			URL:      f.URL,
			Title:    f.Title,
			Type:     f.Type,
			Category: f.Category,
			Status:   f.Status,
		})
	}

	result := map[string]interface{}{
		"host":        host,
		"feeds_found": feedsFound,
		"feeds":       feedSummaries,
	}
	if crawlErr != nil {
		result["error"] = crawlErr.Error()
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(result)
}
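
// Illustrative response shape for the priority-crawl handler above (keys
// mirror the result map built in the handler; the values are assumptions):
//
//	{"host": "npr.org", "feeds_found": 1,
//	 "feeds": [{"url": "https://npr.org/rss.xml", "title": "NPR", "type": "rss", "category": "", "status": "pass"}]}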

// handleAPIFilter handles flexible filtering with stackable parameters
func (c *Crawler) handleAPIFilter(w http.ResponseWriter, r *http.Request) {
	tld := r.URL.Query().Get("tld")
	domain := r.URL.Query().Get("domain")
	feedStatus := r.URL.Query().Get("feedStatus")
	domainStatus := r.URL.Query().Get("domainStatus")
	languages := r.URL.Query().Get("languages") // comma-separated list
	show := r.URL.Query().Get("show")           // "feeds" or "domains"
	sort := r.URL.Query().Get("sort")           // "alpha" or "feeds"

	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}

	// Parse languages into slice
	var langList []string
	if languages != "" {
		for _, lang := range strings.Split(languages, ",") {
			lang = strings.TrimSpace(lang)
			if lang != "" {
				langList = append(langList, lang)
			}
		}
	}

	// Determine what to show based on filters
	if show == "" {
		if feedStatus != "" || domain != "" || len(langList) > 0 {
			show = "feeds"
		} else {
			show = "domains"
		}
	}

	if show == "feeds" {
		c.filterFeeds(w, tld, domain, feedStatus, langList, limit, offset)
	} else {
		c.filterDomains(w, tld, domainStatus, sort, limit, offset)
	}
}
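
// Examples of the show-routing above (query strings are illustrative):
//
//	?feedStatus=pass    -> show defaults to "feeds"   -> filterFeeds
//	?tld=org            -> show defaults to "domains" -> filterDomains
//	?tld=org&show=feeds -> explicit override of the default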

func (c *Crawler) filterDomains(w http.ResponseWriter, tld, status, sort string, limit, offset int) {
	var args []interface{}
	argNum := 1
	query := `
		SELECT host, tld, status, last_error, feeds_found
		FROM domains
		WHERE 1=1`

	if tld != "" {
		query += fmt.Sprintf(" AND tld = $%d", argNum)
		args = append(args, tld)
		argNum++
	}
	if status != "" {
		query += fmt.Sprintf(" AND status = $%d", argNum)
		args = append(args, status)
		argNum++
	}

	// Sort by feed count descending or alphabetically
	if sort == "feeds" {
		query += fmt.Sprintf(" ORDER BY feeds_found DESC, host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1)
	} else {
		query += fmt.Sprintf(" ORDER BY tld ASC, host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1)
	}
	args = append(args, limit, offset)

	rows, err := c.db.Query(query, args...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type DomainInfo struct {
		Host      string `json:"host"`
		TLD       string `json:"tld"`
		Status    string `json:"status"`
		LastError string `json:"last_error,omitempty"`
		FeedCount int    `json:"feed_count"`
	}

	var domains []DomainInfo
	for rows.Next() {
		var d DomainInfo
		var tldVal, lastError *string
		if err := rows.Scan(&d.Host, &tldVal, &d.Status, &lastError, &d.FeedCount); err != nil {
			continue
		}
		d.TLD = StringValue(tldVal)
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"type": "domains",
		"data": domains,
	})
}

func (c *Crawler) handleAPITLDDomains(w http.ResponseWriter, r *http.Request) {
	tld := r.URL.Query().Get("tld")
	if tld == "" {
		http.Error(w, "tld parameter required", http.StatusBadRequest)
		return
	}

	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}

	rows, err := c.db.Query(`
		SELECT host, status, last_error, feeds_found
		FROM domains
		WHERE tld = $1
		ORDER BY host ASC
		LIMIT $2 OFFSET $3
	`, tld, limit, offset)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type DomainInfo struct {
		Host      string `json:"host"`
		Status    string `json:"status"`
		LastError string `json:"last_error,omitempty"`
		FeedCount int    `json:"feed_count"`
	}

	var domains []DomainInfo
	for rows.Next() {
		var d DomainInfo
		var lastError *string
		if err := rows.Scan(&d.Host, &d.Status, &lastError, &d.FeedCount); err != nil {
			continue
		}
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}

func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
	status := r.URL.Query().Get("status")             // domain status: pass, skip, hold, dead
	feedMode := r.URL.Query().Get("feedMode")         // include or exclude
	feedStatuses := r.URL.Query().Get("feedStatuses") // comma-separated: pass,skip,hold,dead
	feedTypes := r.URL.Query().Get("feedTypes")       // comma-separated: rss,atom,json,unknown,empty
	search := r.URL.Query().Get("search")             // search query

	// Parse comma-separated values
	var statusList, typeList []string
	if feedStatuses != "" {
		statusList = strings.Split(feedStatuses, ",")
	}
	if feedTypes != "" {
		typeList = strings.Split(feedTypes, ",")
	}

	var rows pgx.Rows
	var err error

	// If feed filter is specified, query from feeds table instead
	if len(statusList) > 0 || len(typeList) > 0 || feedMode == "exclude" {
		// Build query to get TLDs from feeds
		query := `SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM feeds WHERE tld IS NOT NULL`
		args := []interface{}{}
		argNum := 1

		// Handle status filters (publish_status for pass/skip/hold/dead)
		if len(statusList) > 0 {
			if feedMode == "exclude" {
				query += fmt.Sprintf(" AND (publish_status IS NULL OR publish_status NOT IN (SELECT unnest($%d::text[])))", argNum)
			} else {
				query += fmt.Sprintf(" AND publish_status IN (SELECT unnest($%d::text[]))", argNum)
			}
			args = append(args, statusList)
			argNum++
		}

		// Handle type filters (including special "empty" type)
		if len(typeList) > 0 {
			hasEmpty := false
			var regularTypes []string
			for _, t := range typeList {
				if t == "empty" {
					hasEmpty = true
				} else {
					regularTypes = append(regularTypes, t)
				}
			}

			if feedMode == "exclude" {
				// Exclude mode: exclude these types (NULL-typed feeds are kept,
				// matching the equivalent domain query in handleAPIDomains)
				if len(regularTypes) > 0 && hasEmpty {
					query += fmt.Sprintf(" AND (type IS NULL OR type NOT IN (SELECT unnest($%d::text[]))) AND item_count > 0", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if len(regularTypes) > 0 {
					query += fmt.Sprintf(" AND (type IS NULL OR type NOT IN (SELECT unnest($%d::text[])))", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if hasEmpty {
					query += " AND item_count > 0"
				}
			} else {
				// Include mode: include these types
				if len(regularTypes) > 0 && hasEmpty {
					query += fmt.Sprintf(" AND (type IN (SELECT unnest($%d::text[])) OR item_count IS NULL OR item_count = 0)", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if len(regularTypes) > 0 {
					query += fmt.Sprintf(" AND type IN (SELECT unnest($%d::text[]))", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if hasEmpty {
					query += " AND (item_count IS NULL OR item_count = 0)"
				}
			}
		}

		if search != "" {
			sq := parseSearchPrefix(search)
			searchPattern := "%" + strings.ToLower(sq.Pattern) + "%"

			// Only extract TLD for domain searches (d:npr.org -> exact match for npr.org)
			var tldFilter string
			var exactMatch bool
			hostSearchPattern := searchPattern
			if sq.Type == "domain" {
				hostPattern, detectedTLD := parseSearchTerm(sq.Pattern)
				if detectedTLD != "" {
					tldFilter = detectedTLD
					exactMatch = true
					hostSearchPattern = "%" + strings.ToLower(hostPattern) + "%"
				}
			}

			switch sq.Type {
			case "domain":
				// Search domain names
				if exactMatch && tldFilter != "" {
					// d:npr.org -> exact match (source_host = 'npr.org')
					query += fmt.Sprintf(" AND LOWER(source_host) = $%d", argNum)
					args = append(args, strings.ToLower(sq.Pattern))
				} else if tldFilter != "" {
					query += fmt.Sprintf(" AND tld = $%d AND LOWER(source_host) LIKE $%d", argNum, argNum+1)
					args = append(args, tldFilter, hostSearchPattern)
				} else {
					query += fmt.Sprintf(" AND LOWER(source_host) LIKE $%d", argNum)
					args = append(args, hostSearchPattern)
				}
			case "url":
				query += fmt.Sprintf(" AND LOWER(url) LIKE $%d", argNum)
				args = append(args, searchPattern)
			case "title":
				query += fmt.Sprintf(" AND LOWER(title) LIKE $%d", argNum)
				args = append(args, searchPattern)
			case "description":
				query += fmt.Sprintf(" AND LOWER(description) LIKE $%d", argNum)
				args = append(args, searchPattern)
			case "item":
				query += fmt.Sprintf(" AND EXISTS (SELECT 1 FROM items i WHERE i.feed_url = feeds.url AND LOWER(i.title) LIKE $%d)", argNum)
				args = append(args, searchPattern)
			default:
				// "all" - search domains and feeds (NOT items - use i: prefix for item search)
				// Also include exact domain match if pattern looks like a domain
				if sq.DomainHost != "" && sq.DomainTLD != "" {
					fullDomain := strings.ToLower(sq.DomainHost + "." + sq.DomainTLD)
					query += fmt.Sprintf(` AND (
						LOWER(source_host) LIKE $%d OR
						LOWER(url) LIKE $%d OR
						LOWER(title) LIKE $%d OR
						LOWER(description) LIKE $%d OR
						LOWER(source_host) = $%d
					)`, argNum, argNum, argNum, argNum, argNum+1)
					args = append(args, searchPattern, fullDomain)
				} else {
					query += fmt.Sprintf(` AND (
						LOWER(source_host) LIKE $%d OR
						LOWER(url) LIKE $%d OR
						LOWER(title) LIKE $%d OR
						LOWER(description) LIKE $%d
					)`, argNum, argNum, argNum, argNum)
					args = append(args, searchPattern)
				}
			}
		}
		query += " GROUP BY tld ORDER BY tld ASC"
		rows, err = c.db.Query(query, args...)
	} else if search != "" {
		// Parse search prefix for type-specific searching
		sq := parseSearchPrefix(search)

		// Use the helper to build the TLD search query
		query, args := buildTLDSearchQuery(sq)
		rows, err = c.db.Query(query, args...)
	} else if status != "" {
		// TLDs filtered by domain status
		rows, err = c.db.Query(`
			SELECT tld::text as tld, COUNT(*) as domain_count
			FROM domains
			WHERE tld IS NOT NULL AND status = $1
			GROUP BY tld
			HAVING COUNT(*) > 0
			ORDER BY tld ASC
		`, status)
	} else {
		// All TLDs from enum with domain counts
		rows, err = c.db.Query(`
			SELECT e.enumlabel as tld, COALESCE(d.cnt, 0) as domain_count
			FROM pg_enum e
			LEFT JOIN (
				SELECT tld::text as tld, COUNT(*) as cnt
				FROM domains
				GROUP BY tld
			) d ON e.enumlabel = d.tld
			WHERE e.enumtypid = 'tld_enum'::regtype
			ORDER BY e.enumlabel ASC
		`)
	}
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type TLDInfo struct {
		TLD         string `json:"tld"`
		DomainCount int    `json:"domain_count"`
	}

	var tlds []TLDInfo
	for rows.Next() {
		var t TLDInfo
		if err := rows.Scan(&t.TLD, &t.DomainCount); err != nil {
			continue
		}
		tlds = append(tlds, t)
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(tlds)
}

func (c *Crawler) handleAPITLDStats(w http.ResponseWriter, r *http.Request) {
	tld := r.URL.Query().Get("tld")
	if tld == "" {
		http.Error(w, "tld parameter required", http.StatusBadRequest)
		return
	}
	search := r.URL.Query().Get("search")

	stats := map[string]interface{}{
		"tld": tld,
	}

	// Build WHERE clause based on whether search is provided
	var domainWhere, feedWhere string
	var domainArgs, feedArgs []interface{}

	if search != "" {
		// Parse search prefix for type-specific searching
		sq := parseSearchPrefix(search)
		searchPattern := "%" + strings.ToLower(sq.Pattern) + "%"

		// For domain searches, check for exact match
		if sq.Type == "domain" {
			hostPart, detectedTLD := parseSearchTerm(sq.Pattern)
			if detectedTLD != "" {
				// d:npr.org -> exact match for host "npr" in specified TLD
				domainWhere = "tld = $1 AND lower(host) = $2"
				domainArgs = []interface{}{tld, strings.ToLower(hostPart)}
				feedWhere = "tld = $1 AND lower(source_host) = $2"
				feedArgs = []interface{}{tld, strings.ToLower(sq.Pattern)}
			} else {
				// d:npr -> pattern match in specified TLD
				domainWhere = "tld = $1 AND lower(host) LIKE $2"
				domainArgs = []interface{}{tld, searchPattern}
				feedWhere = "tld = $1 AND lower(source_host) LIKE $2"
				feedArgs = []interface{}{tld, searchPattern}
			}
		} else {
			// Other search types - pattern match
			domainWhere = "tld = $1 AND lower(host) LIKE $2"
			domainArgs = []interface{}{tld, searchPattern}
			feedWhere = "tld = $1 AND lower(source_host) LIKE $2"
			feedArgs = []interface{}{tld, searchPattern}
		}
		stats["search"] = search
	} else {
		// Filter by TLD only
		domainWhere = "tld = $1"
		domainArgs = []interface{}{tld}
		feedWhere = "tld = $1"
		feedArgs = []interface{}{tld}
	}

	// Domain stats by status
	var totalDomains, passDomains, skipDomains, holdDomains, deadDomains int
	err := c.db.QueryRow(`SELECT COUNT(*) FROM domains WHERE `+domainWhere, domainArgs...).Scan(&totalDomains)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	stats["total_domains"] = totalDomains

	rows, err := c.db.Query(`SELECT status, COUNT(*) FROM domains WHERE `+domainWhere+` GROUP BY status`, domainArgs...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	for rows.Next() {
		var status string
		var count int
		if err := rows.Scan(&status, &count); err != nil {
			continue
		}
		switch status {
		case "pass":
			passDomains = count
		case "skip":
			skipDomains = count
		case "hold":
			holdDomains = count
		case "dead":
			deadDomains = count
		}
	}
	rows.Close()
	stats["pass_domains"] = passDomains
	stats["skip_domains"] = skipDomains
	stats["hold_domains"] = holdDomains
	stats["dead_domains"] = deadDomains

	// Feed stats
	var totalFeeds, passFeeds, skipFeeds, holdFeeds, deadFeeds, emptyFeeds int
	var rssFeeds, atomFeeds, jsonFeeds, unknownFeeds int

	err = c.db.QueryRow(`SELECT COUNT(*) FROM feeds WHERE `+feedWhere, feedArgs...).Scan(&totalFeeds)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	stats["total_feeds"] = totalFeeds

	// Feed status counts
	statusRows, err := c.db.Query(`SELECT COALESCE(status, 'hold'), COUNT(*) FROM feeds WHERE `+feedWhere+` GROUP BY status`, feedArgs...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	for statusRows.Next() {
		var status string
		var count int
		if err := statusRows.Scan(&status, &count); err != nil {
			continue
		}
		switch status {
		case "pass":
			passFeeds = count
		case "skip":
			skipFeeds = count
		case "hold":
			holdFeeds = count
		case "dead":
			deadFeeds = count
		}
	}
	statusRows.Close()
	stats["pass_feeds"] = passFeeds
	stats["skip_feeds"] = skipFeeds
	stats["hold_feeds"] = holdFeeds
	stats["dead_feeds"] = deadFeeds

	// Empty feeds count
	c.db.QueryRow(`SELECT COUNT(*) FROM feeds WHERE (`+feedWhere+`) AND (item_count IS NULL OR item_count = 0)`, feedArgs...).Scan(&emptyFeeds)
	stats["empty_feeds"] = emptyFeeds

	// Feed type counts
	typeRows, err := c.db.Query(`SELECT COALESCE(type, 'unknown'), COUNT(*) FROM feeds WHERE `+feedWhere+` GROUP BY type`, feedArgs...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	for typeRows.Next() {
		var feedType string
		var count int
		if err := typeRows.Scan(&feedType, &count); err != nil {
			continue
		}
		switch feedType {
		case "rss":
			rssFeeds = count
		case "atom":
			atomFeeds = count
		case "json":
			jsonFeeds = count
		default:
			unknownFeeds += count
		}
	}
	typeRows.Close()
	stats["rss_feeds"] = rssFeeds
	stats["atom_feeds"] = atomFeeds
	stats["json_feeds"] = jsonFeeds
	stats["unknown_feeds"] = unknownFeeds

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(stats)
}
|
|
|
|
func (c *Crawler) handleAPISearchStats(w http.ResponseWriter, r *http.Request) {
|
|
search := r.URL.Query().Get("search")
|
|
if search == "" {
|
|
http.Error(w, "search parameter required", http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
// Parse search prefix for type-specific searching
|
|
sq := parseSearchPrefix(search)
|
|
searchPattern := "%" + strings.ToLower(sq.Pattern) + "%"
|
|
|
|
// Only extract TLD for domain searches (d:npr.org -> exact match for npr.org)
|
|
var tldFilter, hostPart string
|
|
var exactMatch bool
|
|
if sq.Type == "domain" {
|
|
hostPart, tldFilter = parseSearchTerm(sq.Pattern)
|
|
if tldFilter != "" {
|
|
searchPattern = "%" + strings.ToLower(hostPart) + "%"
|
|
exactMatch = true
|
|
}
|
|
}
|
|
|
|
stats := map[string]interface{}{}
|
|
|
|
// Build WHERE clause based on search type
|
|
var domainWhere, feedWhere string
|
|
var domainArgs, feedArgs []interface{}
|
|
|
|
switch sq.Type {
|
|
case "domain":
|
|
if exactMatch && tldFilter != "" {
|
|
// d:npr.org -> exact match
|
|
domainWhere = "tld = $1 AND LOWER(host) = $2"
|
|
domainArgs = []interface{}{tldFilter, strings.ToLower(hostPart)}
|
|
feedWhere = "LOWER(source_host) = $1"
|
|
feedArgs = []interface{}{strings.ToLower(sq.Pattern)}
|
|
} else if tldFilter != "" {
|
|
domainWhere = "tld = $1 AND LOWER(host) LIKE $2"
|
|
domainArgs = []interface{}{tldFilter, searchPattern}
|
|
feedWhere = "tld = $1 AND LOWER(source_host) LIKE $2"
|
|
feedArgs = []interface{}{tldFilter, searchPattern}
|
|
} else {
|
|
domainWhere = "LOWER(host) LIKE $1"
|
|
domainArgs = []interface{}{searchPattern}
|
|
feedWhere = "LOWER(source_host) LIKE $1"
|
|
feedArgs = []interface{}{searchPattern}
|
|
}
|
|
case "url":
|
|
domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.url) LIKE $1)"
|
|
domainArgs = []interface{}{searchPattern}
|
|
feedWhere = "LOWER(url) LIKE $1"
|
|
feedArgs = []interface{}{searchPattern}
|
|
case "title":
|
|
domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.title) LIKE $1)"
|
|
domainArgs = []interface{}{searchPattern}
|
|
feedWhere = "LOWER(title) LIKE $1"
|
|
feedArgs = []interface{}{searchPattern}
|
|
case "description":
|
|
domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.description) LIKE $1)"
|
|
domainArgs = []interface{}{searchPattern}
|
|
feedWhere = "LOWER(description) LIKE $1"
|
|
feedArgs = []interface{}{searchPattern}
|
|
case "item":
|
|
domainWhere = "EXISTS (SELECT 1 FROM feeds f INNER JOIN items i ON i.feed_url = f.url WHERE f.source_host = (host || '.' || tld) AND LOWER(i.title) LIKE $1)"
|
|
domainArgs = []interface{}{searchPattern}
|
|
feedWhere = "EXISTS (SELECT 1 FROM items i WHERE i.feed_url = url AND LOWER(i.title) LIKE $1)"
|
|
feedArgs = []interface{}{searchPattern}
|
|
default:
|
|
// "all" - search domains and feeds (NOT items - use i: prefix for item search)
|
|
// Also include exact domain match if pattern looks like a domain
|
|
if sq.DomainHost != "" && sq.DomainTLD != "" {
|
|
domainWhere = `(
|
|
LOWER(host) LIKE $1 OR
|
|
(LOWER(host) = $2 AND LOWER(tld) = $3) OR
|
|
EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND (
|
|
LOWER(f.url) LIKE $1 OR LOWER(f.title) LIKE $1 OR LOWER(f.description) LIKE $1
|
|
))
|
|
)`
|
|
domainArgs = []interface{}{searchPattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)}
|
|
			fullDomain := strings.ToLower(sq.DomainHost + "." + sq.DomainTLD)
			feedWhere = `(
				LOWER(source_host) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1 OR LOWER(source_host) = $2
			)`
			feedArgs = []interface{}{searchPattern, fullDomain}
		} else {
			domainWhere = `(
				LOWER(host) LIKE $1 OR
				EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND (
					LOWER(f.url) LIKE $1 OR LOWER(f.title) LIKE $1 OR LOWER(f.description) LIKE $1
				))
			)`
			domainArgs = []interface{}{searchPattern}
			feedWhere = `(
				LOWER(source_host) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1
			)`
			feedArgs = []interface{}{searchPattern}
		}
	}

	// Count matching domains by status
	var totalDomains, passDomains, skipDomains, holdDomains, deadDomains int
	rows, err := c.db.Query(`SELECT status, COUNT(*) FROM domains WHERE `+domainWhere+` GROUP BY status`, domainArgs...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	for rows.Next() {
		var status string
		var count int
		if err := rows.Scan(&status, &count); err != nil {
			continue
		}
		totalDomains += count
		switch status {
		case "pass":
			passDomains = count
		case "skip":
			skipDomains = count
		case "hold":
			holdDomains = count
		case "dead":
			deadDomains = count
		}
	}
	rows.Close()
	stats["total_domains"] = totalDomains
	stats["pass_domains"] = passDomains
	stats["skip_domains"] = skipDomains
	stats["hold_domains"] = holdDomains
	stats["dead_domains"] = deadDomains

	// Count matching feeds by status
	var totalFeeds, passFeeds, skipFeeds, holdFeeds, deadFeeds, emptyFeeds int
	var rssFeeds, atomFeeds, jsonFeeds, unknownFeeds int

	// Group on the coalesced value so NULL-status rows and explicit 'hold'
	// rows collapse into a single group (with GROUP BY status they arrive as
	// two separate 'hold' rows and the assignment below keeps only the last).
	statusRows, err := c.db.Query(`SELECT COALESCE(status, 'hold'), COUNT(*) FROM feeds WHERE `+feedWhere+` GROUP BY COALESCE(status, 'hold')`, feedArgs...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	for statusRows.Next() {
		var status string
		var count int
		if err := statusRows.Scan(&status, &count); err != nil {
			continue
		}
		totalFeeds += count
		switch status {
		case "pass":
			passFeeds = count
		case "skip":
			skipFeeds = count
		case "hold":
			holdFeeds = count
		case "dead":
			deadFeeds = count
		}
	}
	statusRows.Close()
	stats["total_feeds"] = totalFeeds
	stats["pass_feeds"] = passFeeds
	stats["skip_feeds"] = skipFeeds
	stats["hold_feeds"] = holdFeeds
	stats["dead_feeds"] = deadFeeds

	// Count empty feeds (best-effort; a scan error leaves the count at zero)
	c.db.QueryRow(`SELECT COUNT(*) FROM feeds WHERE (`+feedWhere+`) AND (item_count IS NULL OR item_count = 0)`, feedArgs...).Scan(&emptyFeeds)
	stats["empty_feeds"] = emptyFeeds

	// Count matching feeds by type
	typeRows, err := c.db.Query(`SELECT COALESCE(type, 'unknown'), COUNT(*) FROM feeds WHERE `+feedWhere+` GROUP BY type`, feedArgs...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	for typeRows.Next() {
		var feedType string
		var count int
		if err := typeRows.Scan(&feedType, &count); err != nil {
			continue
		}
		switch feedType {
		case "rss":
			rssFeeds = count
		case "atom":
			atomFeeds = count
		case "json":
			jsonFeeds = count
		default:
			// Accumulate: NULL types and unrecognized types both land here.
			unknownFeeds += count
		}
	}
	typeRows.Close()
	stats["rss_feeds"] = rssFeeds
	stats["atom_feeds"] = atomFeeds
	stats["json_feeds"] = jsonFeeds
	stats["unknown_feeds"] = unknownFeeds

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(stats)
}
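
// Example search-stats response (hypothetical counts; only the keys set in
// the handler above are shown, and earlier parts of the handler may add more):
//
//	{
//	  "total_domains": 42, "pass_domains": 30, "skip_domains": 5,
//	  "hold_domains": 6, "dead_domains": 1,
//	  "total_feeds": 120, "pass_feeds": 90, "skip_feeds": 12,
//	  "hold_feeds": 14, "dead_feeds": 4, "empty_feeds": 9,
//	  "rss_feeds": 80, "atom_feeds": 30, "json_feeds": 4, "unknown_feeds": 6
//	}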

// handleAPIDenyDomain sets a domain to skip (takes down PDS accounts, preserves data)
func (c *Crawler) handleAPIDenyDomain(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	result := c.skipDomain(host)
	if result.Error != "" {
		http.Error(w, result.Error, http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(result)
}
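
// Hypothetical invocation, assuming this handler is mounted at a route such
// as /api/deny-domain (route registration happens elsewhere in this file):
//
//	curl 'http://localhost:8080/api/deny-domain?host=example.com'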

// DomainActionResult contains the results of a domain action
type DomainActionResult struct {
	Success          bool     `json:"success"`
	Host             string   `json:"host"`
	Action           string   `json:"action"`
	FeedsAffected    int64    `json:"feeds_affected,omitempty"`
	ItemsDeleted     int64    `json:"items_deleted,omitempty"`
	AccountsAffected int      `json:"accounts_affected,omitempty"`
	AccountErrors    []string `json:"account_errors,omitempty"`
	Error            string   `json:"error,omitempty"`
}
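
// Example JSON encoding of a successful drop (hypothetical values; omitempty
// hides the zero-valued fields):
//
//	{"success":true,"host":"example.com","action":"drop",
//	 "feeds_affected":3,"items_deleted":120,"accounts_affected":2}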

// getPDSCredentials loads PDS credentials from environment or pds.env file
func getPDSCredentials() (pdsHost, pdsAdminPassword string) {
	pdsHost = os.Getenv("PDS_HOST")
	pdsAdminPassword = os.Getenv("PDS_ADMIN_PASSWORD")
	if pdsHost == "" || pdsAdminPassword == "" {
		if file, err := os.Open("pds.env"); err == nil {
			scanner := bufio.NewScanner(file)
			for scanner.Scan() {
				line := scanner.Text()
				if strings.HasPrefix(line, "PDS_HOST=") {
					pdsHost = strings.TrimPrefix(line, "PDS_HOST=")
				} else if strings.HasPrefix(line, "PDS_ADMIN_PASSWORD=") {
					pdsAdminPassword = strings.TrimPrefix(line, "PDS_ADMIN_PASSWORD=")
				}
			}
			file.Close()
		}
	}
	return
}
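
// Example pds.env (hypothetical values), in the exact KEY=VALUE form the
// scanner above matches; values are taken verbatim after the '=', so quotes
// or trailing whitespace would be preserved in the credential:
//
//	PDS_HOST=https://pds.example.com
//	PDS_ADMIN_PASSWORD=s3cret-admin-pw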

// getDomainDIDs returns all unique publish_account DIDs for a domain's feeds
func (c *Crawler) getDomainDIDs(host string) []string {
	var dids []string
	rows, err := c.db.Query(`
		SELECT DISTINCT publish_account FROM feeds
		WHERE source_host = $1 AND publish_account IS NOT NULL AND publish_account != ''
	`, host)
	if err == nil {
		defer rows.Close()
		for rows.Next() {
			var did string
			if err := rows.Scan(&did); err == nil && did != "" {
				dids = append(dids, did)
			}
		}
	}
	return dids
}
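
// Example return value (hypothetical DIDs, in the did:plc form commonly used
// for PDS accounts):
//
//	[]string{"did:plc:abc123xyz", "did:plc:def456uvw"}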

// skipDomain sets a domain to skip: takes down PDS accounts but preserves all data
func (c *Crawler) skipDomain(host string) DomainActionResult {
	result := DomainActionResult{Host: host, Action: "skip"}

	pdsHost, pdsAdminPassword := getPDSCredentials()
	dids := c.getDomainDIDs(host)

	// Take down PDS accounts (hide content but preserve data)
	if pdsHost != "" && pdsAdminPassword != "" && len(dids) > 0 {
		publisher := NewPublisher(pdsHost)
		for _, did := range dids {
			if err := publisher.TakedownAccount(pdsAdminPassword, did, "domain-skip"); err != nil {
				result.AccountErrors = append(result.AccountErrors, fmt.Sprintf("%s: %v", did, err))
			} else {
				result.AccountsAffected++
			}
		}
	}

	// Mark feeds as skipped (but don't delete them)
	feedsAffected, err := c.db.Exec(`
		UPDATE feeds SET status = 'skip', publish_status = 'skip'
		WHERE source_host = $1
	`, host)
	if err != nil {
		result.Error = fmt.Sprintf("failed to update feeds: %v", err)
		return result
	}
	result.FeedsAffected = feedsAffected

	// Update domain status to skip
	_, err = c.db.Exec(`UPDATE domains SET status = 'skip' WHERE host = $1 AND tld = $2`, stripTLD(host), getTLD(host))
	if err != nil {
		result.Error = fmt.Sprintf("failed to update domain status: %v", err)
		return result
	}

	result.Success = true
	return result
}
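
// Moderation lifecycle implied by the handlers in this file: skipDomain is
// the reversible first step (accounts taken down, data preserved); from
// 'skip' a domain can either be restored (restoreDomain) or permanently
// dropped (dropDomain, which requires the prior skip).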

// handleAPIDropDomain permanently deletes all data for a skipped domain
func (c *Crawler) handleAPIDropDomain(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	// Verify domain is currently skipped
	var status string
	err := c.db.QueryRow(`SELECT status FROM domains WHERE host = $1 AND tld = $2`, stripTLD(host), getTLD(host)).Scan(&status)
	if err != nil {
		http.Error(w, "domain not found", http.StatusNotFound)
		return
	}
	if status != "skip" {
		http.Error(w, "domain must be skipped before dropping", http.StatusBadRequest)
		return
	}

	result := c.dropDomain(host)
	if result.Error != "" {
		http.Error(w, result.Error, http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(result)
}

// dropDomain permanently deletes all data for a domain (feeds, items, PDS accounts)
func (c *Crawler) dropDomain(host string) DomainActionResult {
	result := DomainActionResult{Host: host, Action: "drop"}

	pdsHost, pdsAdminPassword := getPDSCredentials()
	dids := c.getDomainDIDs(host)

	// Delete PDS accounts
	if pdsHost != "" && pdsAdminPassword != "" && len(dids) > 0 {
		publisher := NewPublisher(pdsHost)
		for _, did := range dids {
			if err := publisher.DeleteAccount(pdsAdminPassword, did); err != nil {
				result.AccountErrors = append(result.AccountErrors, fmt.Sprintf("%s: %v", did, err))
			} else {
				result.AccountsAffected++
			}
		}
	}

	// Get feed URLs for this domain (needed to delete items)
	var feedURLs []string
	feedRows, err := c.db.Query(`SELECT url FROM feeds WHERE source_host = $1`, host)
	if err == nil {
		defer feedRows.Close()
		for feedRows.Next() {
			var url string
			if err := feedRows.Scan(&url); err == nil {
				feedURLs = append(feedURLs, url)
			}
		}
	}

	// Delete items for all feeds from this domain
	for _, feedURL := range feedURLs {
		deleted, err := c.db.Exec(`DELETE FROM items WHERE feed_url = $1`, feedURL)
		if err == nil {
			result.ItemsDeleted += deleted
		}
	}

	// Delete all feeds from this domain
	feedsDeleted, err := c.db.Exec(`DELETE FROM feeds WHERE source_host = $1`, host)
	if err != nil {
		result.Error = fmt.Sprintf("failed to delete feeds: %v", err)
		return result
	}
	result.FeedsAffected = feedsDeleted

	// Update domain status to drop
	_, err = c.db.Exec(`UPDATE domains SET status = 'drop' WHERE host = $1 AND tld = $2`, stripTLD(host), getTLD(host))
	if err != nil {
		result.Error = fmt.Sprintf("failed to update domain status: %v", err)
		return result
	}

	result.Success = true
	return result
}

// handleAPIUndenyDomain removes skip status from a domain (restores accounts)
func (c *Crawler) handleAPIUndenyDomain(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	// Verify domain is currently skipped
	var status string
	err := c.db.QueryRow(`SELECT status FROM domains WHERE host = $1 AND tld = $2`, stripTLD(host), getTLD(host)).Scan(&status)
	if err != nil {
		http.Error(w, "domain not found", http.StatusNotFound)
		return
	}
	if status != "skip" {
		http.Error(w, "domain is not skipped", http.StatusBadRequest)
		return
	}

	result := c.restoreDomain(host)
	if result.Error != "" {
		http.Error(w, result.Error, http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(result)
}

// restoreDomain removes skip status and restores PDS accounts
func (c *Crawler) restoreDomain(host string) DomainActionResult {
	result := DomainActionResult{Host: host, Action: "restore"}

	pdsHost, pdsAdminPassword := getPDSCredentials()
	dids := c.getDomainDIDs(host)

	// Restore PDS accounts (remove takedown)
	if pdsHost != "" && pdsAdminPassword != "" && len(dids) > 0 {
		publisher := NewPublisher(pdsHost)
		for _, did := range dids {
			if err := publisher.RestoreAccount(pdsAdminPassword, did); err != nil {
				result.AccountErrors = append(result.AccountErrors, fmt.Sprintf("%s: %v", did, err))
			} else {
				result.AccountsAffected++
			}
		}
	}

	// Restore feeds to pass status. Note: this sets every feed on the host to
	// 'pass', including any feed that was individually skipped before the
	// domain-level skip.
	feedsAffected, err := c.db.Exec(`
		UPDATE feeds SET status = 'pass', publish_status = 'pass'
		WHERE source_host = $1
	`, host)
	if err != nil {
		result.Error = fmt.Sprintf("failed to update feeds: %v", err)
		return result
	}
	result.FeedsAffected = feedsAffected

	// Update domain status back to pass
	_, err = c.db.Exec(`
		UPDATE domains SET status = 'pass', last_error = NULL
		WHERE host = $1 AND tld = $2
	`, stripTLD(host), getTLD(host))
	if err != nil {
		result.Error = fmt.Sprintf("failed to update domain status: %v", err)
		return result
	}

	result.Success = true
	return result
}