package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"net/http"
	"os"
	"strings"

	"github.com/jackc/pgx/v5"
)

// buildTLDSearchQuery builds a query to get TLDs based on search type.
// Returns (query, args) for the database query.
func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
	pattern := "%" + strings.ToLower(sq.Pattern) + "%"

	switch sq.Type {
	case "domain":
		// Check if pattern includes TLD (e.g., d:npr.org -> exact match)
		hostPart, tldFilter := parseSearchTerm(sq.Pattern)
		if tldFilter != "" {
			// Exact match - return just the matching TLD
			return `
				SELECT tld::text as tld, COUNT(*) as domain_count
				FROM domains
				WHERE tld = $1 AND LOWER(host) = $2
				GROUP BY tld
				ORDER BY tld ASC
			`, []interface{}{tldFilter, strings.ToLower(hostPart)}
		}
		// Pattern match - search all TLDs
		return `
			SELECT tld::text as tld, COUNT(*) as domain_count
			FROM domains
			WHERE LOWER(host) LIKE $1
			GROUP BY tld
			ORDER BY tld ASC
		`, []interface{}{pattern}

	case "url":
		// Search feed URL paths (after domain)
		return `
			SELECT tld, COUNT(DISTINCT source_host) as domain_count
			FROM feeds
			WHERE tld IS NOT NULL AND LOWER(url) LIKE $1
			GROUP BY tld
			ORDER BY tld ASC
		`, []interface{}{pattern}

	case "title":
		// Search feed titles
		return `
			SELECT tld, COUNT(DISTINCT source_host) as domain_count
			FROM feeds
			WHERE tld IS NOT NULL AND LOWER(title) LIKE $1
			GROUP BY tld
			ORDER BY tld ASC
		`, []interface{}{pattern}

	case "description":
		// Search feed descriptions
		return `
			SELECT tld, COUNT(DISTINCT source_host) as domain_count
			FROM feeds
			WHERE tld IS NOT NULL AND LOWER(description) LIKE $1
			GROUP BY tld
			ORDER BY tld ASC
		`, []interface{}{pattern}

	case "item":
		// Search item titles (requires a join against items)
		return `
			SELECT f.tld, COUNT(DISTINCT f.source_host) as domain_count
			FROM feeds f
			INNER JOIN items i ON i.feed_url = f.url
			WHERE f.tld IS NOT NULL AND LOWER(i.title) LIKE $1
			GROUP BY f.tld
			ORDER BY f.tld ASC
		`, []interface{}{pattern}

	default:
		// "all" - search domains and feeds (NOT items - use i: prefix for item search)
		// Also include exact domain match if pattern looks like a domain
		if sq.DomainHost != "" && sq.DomainTLD != "" {
			return `
				SELECT tld, COUNT(DISTINCT source_host) as domain_count
				FROM (
					-- Domains matching host pattern
					SELECT tld::text as tld, host || '.' || tld as source_host
					FROM domains WHERE LOWER(host) LIKE $1
					UNION
					-- Exact domain match
					SELECT tld::text as tld, host || '.' || tld as source_host
					FROM domains WHERE LOWER(host) = $2 AND LOWER(tld) = $3
					UNION
					-- Feeds matching URL
					SELECT tld, source_host FROM feeds
					WHERE tld IS NOT NULL AND LOWER(url) LIKE $1
					UNION
					-- Feeds matching title
					SELECT tld, source_host FROM feeds
					WHERE tld IS NOT NULL AND LOWER(title) LIKE $1
					UNION
					-- Feeds matching description
					SELECT tld, source_host FROM feeds
					WHERE tld IS NOT NULL AND LOWER(description) LIKE $1
				) combined
				GROUP BY tld
				ORDER BY tld ASC
			`, []interface{}{pattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)}
		}
		return `
			SELECT tld, COUNT(DISTINCT source_host) as domain_count
			FROM (
				-- Domains matching host
				SELECT tld::text as tld, host || '.' || tld as source_host
				FROM domains WHERE LOWER(host) LIKE $1
				UNION
				-- Feeds matching URL
				SELECT tld, source_host FROM feeds
				WHERE tld IS NOT NULL AND LOWER(url) LIKE $1
				UNION
				-- Feeds matching title
				SELECT tld, source_host FROM feeds
				WHERE tld IS NOT NULL AND LOWER(title) LIKE $1
				UNION
				-- Feeds matching description
				SELECT tld, source_host FROM feeds
				WHERE tld IS NOT NULL AND LOWER(description) LIKE $1
			) combined
			GROUP BY tld
			ORDER BY tld ASC
		`, []interface{}{pattern}
	}
}
|| tld as source_host FROM domains WHERE LOWER(host) LIKE $1 UNION -- Feeds matching URL SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(url) LIKE $1 UNION -- Feeds matching title SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(title) LIKE $1 UNION -- Feeds matching description SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(description) LIKE $1 ) combined GROUP BY tld ORDER BY tld ASC `, []interface{}{pattern} } } // buildDomainSearchQuery builds a query to get domains based on search type // Returns (whereClause, args, argNum) to append to the base query func buildDomainSearchQuery(sq SearchQuery, tldFilter string, argNum int) (string, []interface{}, int) { pattern := "%" + strings.ToLower(sq.Pattern) + "%" var where string var args []interface{} switch sq.Type { case "domain": if sq.ExactMatch && tldFilter != "" { // d:npr.org -> exact match where = fmt.Sprintf(" AND d.tld = $%d AND LOWER(d.host) = $%d", argNum, argNum+1) args = []interface{}{tldFilter, strings.ToLower(sq.Pattern)} argNum += 2 } else if tldFilter != "" { where = fmt.Sprintf(" AND d.tld = $%d AND LOWER(d.host) LIKE $%d", argNum, argNum+1) args = []interface{}{tldFilter, pattern} argNum += 2 } else { where = fmt.Sprintf(" AND LOWER(d.host) LIKE $%d", argNum) args = []interface{}{pattern} argNum++ } case "url": where = fmt.Sprintf(" AND LOWER(f.url) LIKE $%d", argNum) args = []interface{}{pattern} argNum++ if tldFilter != "" { where += fmt.Sprintf(" AND d.tld = $%d", argNum) args = append(args, tldFilter) argNum++ } case "title": where = fmt.Sprintf(" AND LOWER(f.title) LIKE $%d", argNum) args = []interface{}{pattern} argNum++ if tldFilter != "" { where += fmt.Sprintf(" AND d.tld = $%d", argNum) args = append(args, tldFilter) argNum++ } case "description": where = fmt.Sprintf(" AND LOWER(f.description) LIKE $%d", argNum) args = []interface{}{pattern} argNum++ if tldFilter != "" { where += fmt.Sprintf(" AND d.tld = $%d", argNum) args = 
append(args, tldFilter) argNum++ } case "item": // Need to join items - handled separately where = fmt.Sprintf(" AND EXISTS (SELECT 1 FROM items i WHERE i.feed_url = f.url AND LOWER(i.title) LIKE $%d)", argNum) args = []interface{}{pattern} argNum++ if tldFilter != "" { where += fmt.Sprintf(" AND d.tld = $%d", argNum) args = append(args, tldFilter) argNum++ } default: // "all" - search everything, also include exact domain match if pattern looks like a domain if tldFilter != "" { if sq.DomainHost != "" && sq.DomainTLD != "" { where = fmt.Sprintf(` AND d.tld = $%d AND ( LOWER(d.host) LIKE $%d OR LOWER(f.url) LIKE $%d OR LOWER(f.title) LIKE $%d OR LOWER(f.description) LIKE $%d OR (LOWER(d.host) = $%d AND LOWER(d.tld) = $%d) )`, argNum, argNum+1, argNum+1, argNum+1, argNum+1, argNum+2, argNum+3) args = []interface{}{tldFilter, pattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)} argNum += 4 } else { where = fmt.Sprintf(` AND d.tld = $%d AND ( LOWER(d.host) LIKE $%d OR LOWER(f.url) LIKE $%d OR LOWER(f.title) LIKE $%d OR LOWER(f.description) LIKE $%d )`, argNum, argNum+1, argNum+1, argNum+1, argNum+1) args = []interface{}{tldFilter, pattern} argNum += 2 } } else { if sq.DomainHost != "" && sq.DomainTLD != "" { where = fmt.Sprintf(` AND ( LOWER(d.host) LIKE $%d OR LOWER(f.url) LIKE $%d OR LOWER(f.title) LIKE $%d OR LOWER(f.description) LIKE $%d OR (LOWER(d.host) = $%d AND LOWER(d.tld) = $%d) )`, argNum, argNum, argNum, argNum, argNum+1, argNum+2) args = []interface{}{pattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)} argNum += 3 } else { where = fmt.Sprintf(` AND ( LOWER(d.host) LIKE $%d OR LOWER(f.url) LIKE $%d OR LOWER(f.title) LIKE $%d OR LOWER(f.description) LIKE $%d )`, argNum, argNum, argNum, argNum) args = []interface{}{pattern} argNum++ } } } return where, args, argNum } func (c *Crawler) handleAPIAllDomains(w http.ResponseWriter, r *http.Request) { offset := 0 limit := 100 if o := r.URL.Query().Get("offset"); o != "" { 
fmt.Sscanf(o, "%d", &offset) } if l := r.URL.Query().Get("limit"); l != "" { fmt.Sscanf(l, "%d", &limit) if limit > 100 { limit = 100 } } // Serve from cache (updated once per minute in background) c.statsMu.RLock() cached := c.cachedAllDomains c.statsMu.RUnlock() var domains []DomainStat if cached != nil && offset < len(cached) { end := offset + limit if end > len(cached) { end = len(cached) } domains = cached[offset:end] } if domains == nil { domains = []DomainStat{} } w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(domains) } // handleAPIDomains lists domains with optional status filter, including their feeds func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) { status := r.URL.Query().Get("status") hasFeeds := r.URL.Query().Get("has_feeds") == "true" search := r.URL.Query().Get("search") tldFilter := r.URL.Query().Get("tld") feedMode := r.URL.Query().Get("feedMode") // include or exclude feedStatuses := r.URL.Query().Get("feedStatuses") // comma-separated feedTypes := r.URL.Query().Get("feedTypes") // comma-separated limit := 100 offset := 0 if l := r.URL.Query().Get("limit"); l != "" { fmt.Sscanf(l, "%d", &limit) if limit > 500 { limit = 500 } } if o := r.URL.Query().Get("offset"); o != "" { fmt.Sscanf(o, "%d", &offset) } // Parse comma-separated values var statusList, typeList []string if feedStatuses != "" { statusList = strings.Split(feedStatuses, ",") } if feedTypes != "" { typeList = strings.Split(feedTypes, ",") } // Parse search prefix for type-specific searching var searchQuery SearchQuery if search != "" { searchQuery = parseSearchPrefix(search) // Only extract TLD for domain searches (d:npr.org -> exact match for npr.org) // All other searches use the literal pattern if searchQuery.Type == "domain" { hostPart, detectedTLD := parseSearchTerm(searchQuery.Pattern) if detectedTLD != "" { searchQuery.Pattern = hostPart searchQuery.ExactMatch = true // d:npr.org matches exactly npr.org if tldFilter == "" 
{ tldFilter = detectedTLD } } } } // First get domains var rows pgx.Rows var err error // If feed filter is specified, query domains that have matching feeds if len(statusList) > 0 || len(typeList) > 0 || feedMode != "" { // Build dynamic query to get domains with matching feeds query := ` SELECT DISTINCT d.host, d.tld, d.status, d.last_error, d.feeds_found FROM domains d INNER JOIN feeds f ON f.source_host = (d.host || '.' || d.tld) WHERE 1=1` args := []interface{}{} argNum := 1 if tldFilter != "" { query += fmt.Sprintf(" AND d.tld = $%d", argNum) args = append(args, tldFilter) argNum++ } if status != "" { query += fmt.Sprintf(" AND d.status = $%d", argNum) args = append(args, status) argNum++ } // Handle status filters (publish_status for pass/skip/hold/dead) if len(statusList) > 0 { if feedMode == "exclude" { query += fmt.Sprintf(" AND (f.publish_status IS NULL OR f.publish_status NOT IN (SELECT unnest($%d::text[])))", argNum) } else { query += fmt.Sprintf(" AND f.publish_status IN (SELECT unnest($%d::text[]))", argNum) } args = append(args, statusList) argNum++ } // Handle type filters (including special "empty" type) if len(typeList) > 0 { hasEmpty := false var regularTypes []string for _, t := range typeList { if t == "empty" { hasEmpty = true } else { regularTypes = append(regularTypes, t) } } if feedMode == "exclude" { // Exclude mode if len(regularTypes) > 0 && hasEmpty { query += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[]))) AND f.item_count > 0", argNum) args = append(args, regularTypes) argNum++ } else if len(regularTypes) > 0 { query += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[])))", argNum) args = append(args, regularTypes) argNum++ } else if hasEmpty { query += " AND f.item_count > 0" } } else { // Include mode if len(regularTypes) > 0 && hasEmpty { query += fmt.Sprintf(" AND (f.type IN (SELECT unnest($%d::text[])) OR f.item_count IS NULL OR f.item_count = 0)", argNum) args = 
append(args, regularTypes) argNum++ } else if len(regularTypes) > 0 { query += fmt.Sprintf(" AND f.type IN (SELECT unnest($%d::text[]))", argNum) args = append(args, regularTypes) argNum++ } else if hasEmpty { query += " AND (f.item_count IS NULL OR f.item_count = 0)" } } } if search != "" && searchQuery.Pattern != "" { searchPattern := "%" + strings.ToLower(searchQuery.Pattern) + "%" switch searchQuery.Type { case "domain": if searchQuery.ExactMatch { // d:npr.org -> exact match for host "npr" (tld already filtered above) query += fmt.Sprintf(" AND LOWER(d.host) = $%d", argNum) args = append(args, strings.ToLower(searchQuery.Pattern)) } else { // d:npr -> pattern match query += fmt.Sprintf(" AND LOWER(d.host) LIKE $%d", argNum) args = append(args, searchPattern) } argNum++ case "url": query += fmt.Sprintf(" AND LOWER(f.url) LIKE $%d", argNum) args = append(args, searchPattern) argNum++ case "title": query += fmt.Sprintf(" AND LOWER(f.title) LIKE $%d", argNum) args = append(args, searchPattern) argNum++ case "description": query += fmt.Sprintf(" AND LOWER(f.description) LIKE $%d", argNum) args = append(args, searchPattern) argNum++ case "item": query += fmt.Sprintf(" AND EXISTS (SELECT 1 FROM items i WHERE i.feed_url = f.url AND LOWER(i.title) LIKE $%d)", argNum) args = append(args, searchPattern) argNum++ default: // "all" - search domains and feeds (NOT items - use i: prefix for item search) // Also include exact domain match if pattern looks like a domain if searchQuery.DomainHost != "" && searchQuery.DomainTLD != "" { query += fmt.Sprintf(` AND ( LOWER(d.host) LIKE $%d OR LOWER(f.url) LIKE $%d OR LOWER(f.title) LIKE $%d OR LOWER(f.description) LIKE $%d OR (LOWER(d.host) = $%d AND LOWER(d.tld) = $%d) )`, argNum, argNum, argNum, argNum, argNum+1, argNum+2) args = append(args, searchPattern, strings.ToLower(searchQuery.DomainHost), strings.ToLower(searchQuery.DomainTLD)) argNum += 3 } else { query += fmt.Sprintf(` AND ( LOWER(d.host) LIKE $%d OR LOWER(f.url) LIKE 
$%d OR LOWER(f.title) LIKE $%d OR LOWER(f.description) LIKE $%d )`, argNum, argNum, argNum, argNum) args = append(args, searchPattern) argNum++ } } } query += fmt.Sprintf(" ORDER BY d.host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1) args = append(args, limit, offset) rows, err = c.db.Query(query, args...) } else if hasFeeds { // Only domains with feeds searchPattern := "%" + strings.ToLower(search) + "%" if tldFilter != "" && status != "" { // Filter by specific TLD and status rows, err = c.db.Query(` SELECT d.host, d.tld, d.status, d.last_error, f.feed_count FROM domains d INNER JOIN ( SELECT source_host, COUNT(*) as feed_count FROM feeds WHERE item_count > 0 GROUP BY source_host ) f ON (d.host || '.' || d.tld) = f.source_host WHERE d.tld = $1 AND d.status = $2 ORDER BY d.host ASC LIMIT $3 OFFSET $4 `, tldFilter, status, limit, offset) } else if tldFilter != "" { // Filter by specific TLD only (exclude 'skip' by default) rows, err = c.db.Query(` SELECT d.host, d.tld, d.status, d.last_error, f.feed_count FROM domains d INNER JOIN ( SELECT source_host, COUNT(*) as feed_count FROM feeds WHERE item_count > 0 GROUP BY source_host ) f ON (d.host || '.' || d.tld) = f.source_host WHERE d.status != 'skip' AND d.tld = $1 ORDER BY d.host ASC LIMIT $2 OFFSET $3 `, tldFilter, limit, offset) } else if search != "" { // Search in domain host only (uses trigram index) rows, err = c.db.Query(` SELECT d.host, d.tld, d.status, d.last_error, f.feed_count FROM domains d INNER JOIN ( SELECT source_host, COUNT(*) as feed_count FROM feeds WHERE item_count > 0 GROUP BY source_host ) f ON (d.host || '.' 
|| d.tld) = f.source_host WHERE d.status != 'skip' AND LOWER(d.host) LIKE $1 ORDER BY d.tld ASC, d.host ASC LIMIT $2 OFFSET $3 `, searchPattern, limit, offset) } else if status != "" { rows, err = c.db.Query(` SELECT d.host, d.tld, d.status, d.last_error, f.feed_count FROM domains d INNER JOIN ( SELECT source_host, COUNT(*) as feed_count FROM feeds WHERE item_count > 0 GROUP BY source_host ) f ON (d.host || '.' || d.tld) = f.source_host WHERE d.status = $1 ORDER BY d.tld ASC, d.host ASC LIMIT $2 OFFSET $3 `, status, limit, offset) } else { // Default: exclude 'skip' status domains rows, err = c.db.Query(` SELECT d.host, d.tld, d.status, d.last_error, f.feed_count FROM domains d INNER JOIN ( SELECT source_host, COUNT(*) as feed_count FROM feeds WHERE item_count > 0 GROUP BY source_host ) f ON (d.host || '.' || d.tld) = f.source_host WHERE d.status != 'skip' ORDER BY d.tld ASC, d.host ASC LIMIT $1 OFFSET $2 `, limit, offset) } } else if tldFilter != "" && search != "" && status != "" { // Filter by TLD, status, and search if searchQuery.ExactMatch { rows, err = c.db.Query(` SELECT host, tld, status, last_error, feeds_found FROM domains WHERE tld = $1 AND status = $2 AND LOWER(host) = $3 ORDER BY host ASC LIMIT $4 OFFSET $5 `, tldFilter, status, strings.ToLower(searchQuery.Pattern), limit, offset) } else { searchPattern := "%" + strings.ToLower(searchQuery.Pattern) + "%" rows, err = c.db.Query(` SELECT host, tld, status, last_error, feeds_found FROM domains WHERE tld = $1 AND status = $2 AND LOWER(host) LIKE $3 ORDER BY host ASC LIMIT $4 OFFSET $5 `, tldFilter, status, searchPattern, limit, offset) } } else if tldFilter != "" && search != "" { // Filter by TLD and search if searchQuery.ExactMatch { rows, err = c.db.Query(` SELECT host, tld, status, last_error, feeds_found FROM domains WHERE tld = $1 AND LOWER(host) = $2 ORDER BY host ASC LIMIT $3 OFFSET $4 `, tldFilter, strings.ToLower(searchQuery.Pattern), limit, offset) } else { searchPattern := "%" + 
strings.ToLower(searchQuery.Pattern) + "%" rows, err = c.db.Query(` SELECT host, tld, status, last_error, feeds_found FROM domains WHERE tld = $1 AND LOWER(host) LIKE $2 ORDER BY host ASC LIMIT $3 OFFSET $4 `, tldFilter, searchPattern, limit, offset) } } else if tldFilter != "" && status != "" { // Filter by TLD and status rows, err = c.db.Query(` SELECT host, tld, status, last_error, feeds_found FROM domains WHERE tld = $1 AND status = $2 ORDER BY host ASC LIMIT $3 OFFSET $4 `, tldFilter, status, limit, offset) } else if tldFilter != "" { // Filter by TLD only (show all statuses) rows, err = c.db.Query(` SELECT host, tld, status, last_error, feeds_found FROM domains WHERE tld = $1 ORDER BY host ASC LIMIT $2 OFFSET $3 `, tldFilter, limit, offset) } else if status != "" { rows, err = c.db.Query(` SELECT host, tld, status, last_error, feeds_found FROM domains WHERE status = $1 ORDER BY tld ASC, host ASC LIMIT $2 OFFSET $3 `, status, limit, offset) } else { // Default: exclude 'skip' status domains rows, err = c.db.Query(` SELECT host, tld, status, last_error, feeds_found FROM domains WHERE status != 'skip' ORDER BY tld ASC, host ASC LIMIT $1 OFFSET $2 `, limit, offset) } if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } defer rows.Close() type FeedInfo struct { URL string `json:"url"` Title string `json:"title,omitempty"` Type string `json:"type,omitempty"` Status string `json:"status,omitempty"` PublishStatus string `json:"publish_status,omitempty"` Language string `json:"language,omitempty"` ItemCount int `json:"item_count,omitempty"` } type DomainInfo struct { Host string `json:"host"` TLD string `json:"tld"` Status string `json:"status"` LastError string `json:"last_error,omitempty"` FeedCount int `json:"feed_count"` Feeds []FeedInfo `json:"feeds,omitempty"` } var domains []DomainInfo var hosts []string for rows.Next() { var d DomainInfo var tld, lastError *string if err := rows.Scan(&d.Host, &tld, &d.Status, &lastError, 
&d.FeedCount); err != nil { continue } d.TLD = StringValue(tld) d.LastError = StringValue(lastError) domains = append(domains, d) // Build full domain for feed lookup (source_host = host.tld) fullDomain := d.Host if d.TLD != "" { fullDomain = d.Host + "." + d.TLD } hosts = append(hosts, fullDomain) } // Now get feeds for these domains (with actual item count from items table) // Apply the same feed filters used for domain selection if len(hosts) > 0 { feedQuery := ` SELECT f.source_host, f.url, f.title, f.type, f.status, f.publish_status, f.language, (SELECT COUNT(*) FROM items WHERE feed_url = f.url) as item_count FROM feeds f WHERE f.source_host = ANY($1)` feedArgs := []interface{}{hosts} feedArgNum := 2 // Apply feed status filters (publish_status for pass/skip/hold/dead) if len(statusList) > 0 { if feedMode == "exclude" { feedQuery += fmt.Sprintf(" AND (f.publish_status IS NULL OR f.publish_status NOT IN (SELECT unnest($%d::text[])))", feedArgNum) } else { feedQuery += fmt.Sprintf(" AND f.publish_status IN (SELECT unnest($%d::text[]))", feedArgNum) } feedArgs = append(feedArgs, statusList) feedArgNum++ } // Apply feed type filters (including special "empty" type) if len(typeList) > 0 { hasEmpty := false var regularTypes []string for _, t := range typeList { if t == "empty" { hasEmpty = true } else { regularTypes = append(regularTypes, t) } } if feedMode == "exclude" { if len(regularTypes) > 0 && hasEmpty { feedQuery += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[]))) AND f.item_count > 0", feedArgNum) feedArgs = append(feedArgs, regularTypes) feedArgNum++ } else if len(regularTypes) > 0 { feedQuery += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[])))", feedArgNum) feedArgs = append(feedArgs, regularTypes) feedArgNum++ } else if hasEmpty { feedQuery += " AND f.item_count > 0" } } else { if len(regularTypes) > 0 && hasEmpty { feedQuery += fmt.Sprintf(" AND (f.type IN (SELECT unnest($%d::text[])) OR 
f.item_count IS NULL OR f.item_count = 0)", feedArgNum) feedArgs = append(feedArgs, regularTypes) feedArgNum++ } else if len(regularTypes) > 0 { feedQuery += fmt.Sprintf(" AND f.type IN (SELECT unnest($%d::text[]))", feedArgNum) feedArgs = append(feedArgs, regularTypes) feedArgNum++ } else if hasEmpty { feedQuery += " AND (f.item_count IS NULL OR f.item_count = 0)" } } } feedQuery += " ORDER BY f.source_host, f.url" feedRows, err := c.db.Query(feedQuery, feedArgs...) if err == nil { defer feedRows.Close() feedsByHost := make(map[string][]FeedInfo) for feedRows.Next() { var host string var f FeedInfo var title, feedType, status, publishStatus, language *string var itemCount *int if err := feedRows.Scan(&host, &f.URL, &title, &feedType, &status, &publishStatus, &language, &itemCount); err != nil { continue } f.Title = StringValue(title) f.Type = StringValue(feedType) f.Status = StringValue(status) f.PublishStatus = StringValue(publishStatus) f.Language = StringValue(language) if itemCount != nil { f.ItemCount = *itemCount } feedsByHost[host] = append(feedsByHost[host], f) } // Attach feeds to domains (feedsByHost is keyed by full domain) for i := range domains { fullHost := domains[i].Host if domains[i].TLD != "" { fullHost = domains[i].Host + "." 
+ domains[i].TLD } if feeds, ok := feedsByHost[fullHost]; ok { domains[i].Feeds = feeds } } } } w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(domains) } func (c *Crawler) handleAPIDomainsByStatus(w http.ResponseWriter, r *http.Request) { status := r.URL.Query().Get("status") if status == "" { http.Error(w, "status parameter required", http.StatusBadRequest) return } limit := 100 offset := 0 if l := r.URL.Query().Get("limit"); l != "" { fmt.Sscanf(l, "%d", &limit) if limit > 500 { limit = 500 } } if o := r.URL.Query().Get("offset"); o != "" { fmt.Sscanf(o, "%d", &offset) } rows, err := c.db.Query(` SELECT host, tld, status, last_error, feeds_found FROM domains WHERE status = $1 ORDER BY tld ASC, host ASC LIMIT $2 OFFSET $3 `, status, limit, offset) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } defer rows.Close() type DomainInfo struct { Host string `json:"host"` TLD string `json:"tld"` Status string `json:"status"` LastError string `json:"last_error,omitempty"` FeedCount int `json:"feed_count"` } var domains []DomainInfo for rows.Next() { var d DomainInfo var tld, lastError *string if err := rows.Scan(&d.Host, &tld, &d.Status, &lastError, &d.FeedCount); err != nil { continue } d.TLD = StringValue(tld) d.LastError = StringValue(lastError) domains = append(domains, d) } w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(domains) } func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) { host := r.URL.Query().Get("host") if host == "" { http.Error(w, "host parameter required", http.StatusBadRequest) return } limit := 100 offset := 0 if l := r.URL.Query().Get("limit"); l != "" { fmt.Sscanf(l, "%d", &limit) if limit > 500 { limit = 500 } } if o := r.URL.Query().Get("offset"); o != "" { fmt.Sscanf(o, "%d", &offset) } rows, err := c.db.Query(` SELECT url, title, type, status, last_error, item_count, publish_status, language FROM feeds WHERE source_host = 
$1 ORDER BY url ASC LIMIT $2 OFFSET $3 `, host, limit, offset) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } defer rows.Close() type FeedInfo struct { URL string `json:"url"` Title string `json:"title"` Type string `json:"type"` Status string `json:"status,omitempty"` LastError string `json:"last_error,omitempty"` ItemCount int `json:"item_count,omitempty"` PublishStatus string `json:"publish_status,omitempty"` Language string `json:"language,omitempty"` } var feeds []FeedInfo for rows.Next() { var f FeedInfo var title, status, lastError, publishStatus, language *string var itemCount *int if err := rows.Scan(&f.URL, &title, &f.Type, &status, &lastError, &itemCount, &publishStatus, &language); err != nil { continue } f.Title = StringValue(title) f.Status = StringValue(status) f.LastError = StringValue(lastError) f.PublishStatus = StringValue(publishStatus) f.Language = StringValue(language) if itemCount != nil { f.ItemCount = *itemCount } feeds = append(feeds, f) } w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(feeds) } // handleAPISetDomainStatus sets the status for a domain // status must be 'hold', 'pass', or 'skip' (use /api/dropDomain for 'drop') func (c *Crawler) handleAPISetDomainStatus(w http.ResponseWriter, r *http.Request) { host := r.URL.Query().Get("host") status := r.URL.Query().Get("status") if host == "" { http.Error(w, "host parameter required", http.StatusBadRequest) return } if status != "hold" && status != "pass" && status != "skip" { http.Error(w, "status must be 'hold', 'pass', or 'skip' (use /api/dropDomain for permanent deletion)", http.StatusBadRequest) return } host = normalizeHost(host) // Setting to 'skip' triggers takedown (hide content but preserve data) if status == "skip" { result := c.skipDomain(host) if result.Error != "" { http.Error(w, result.Error, http.StatusInternalServerError) return } w.Header().Set("Content-Type", "application/json") 
json.NewEncoder(w).Encode(result) return } // When setting to pass, clear any last_error var err error strippedHost := stripTLD(host) tld := getTLD(host) if status == "pass" { _, err = c.db.Exec(` UPDATE domains SET status = $1, last_error = NULL WHERE host = $2 AND tld = $3 `, status, strippedHost, tld) } else { _, err = c.db.Exec(` UPDATE domains SET status = $1 WHERE host = $2 AND tld = $3 `, status, strippedHost, tld) } if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(map[string]string{ "host": host, "status": status, }) } func (c *Crawler) handleAPIRevisitDomain(w http.ResponseWriter, r *http.Request) { host := r.URL.Query().Get("host") if host == "" { http.Error(w, "host parameter required", http.StatusBadRequest) return } _, err := c.db.Exec(` UPDATE domains SET status = 'pass', crawled_at = '0001-01-01 00:00:00', last_error = NULL WHERE host = $1 AND tld = $2 `, stripTLD(host), getTLD(host)) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(map[string]string{"status": "queued", "host": host}) } // handleAPIPriorityCrawl immediately crawls a domain (adds it if not exists) func (c *Crawler) handleAPIPriorityCrawl(w http.ResponseWriter, r *http.Request) { host := r.URL.Query().Get("host") if host == "" { http.Error(w, "host parameter required", http.StatusBadRequest) return } host = normalizeHost(host) // Add domain if it doesn't exist, or reset to pass for crawling _, err := c.db.Exec(` INSERT INTO domains (host, status, tld) VALUES ($1, 'pass', $2) ON CONFLICT(host, tld) DO UPDATE SET status = 'pass', crawled_at = '0001-01-01 00:00:00', last_error = NULL `, stripTLD(host), getTLD(host)) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } // Crawl synchronously fmt.Printf("Priority crawl: %s\n", host) 
feedsFound, crawlErr := c.feedCrawl(host) errStr := "" if crawlErr != nil { errStr = crawlErr.Error() } // Mark as crawled c.markDomainCrawled(stripTLD(host), getTLD(host), feedsFound, errStr) // Get the feeds we found feeds, _ := c.GetFeedsByHost(host) type FeedSummary struct { URL string `json:"url"` Title string `json:"title"` Type string `json:"type"` Category string `json:"category"` Status string `json:"status"` } var feedSummaries []FeedSummary for _, f := range feeds { feedSummaries = append(feedSummaries, FeedSummary{ URL: f.URL, Title: f.Title, Type: f.Type, Category: f.Category, Status: f.Status, }) } result := map[string]interface{}{ "host": host, "feeds_found": feedsFound, "feeds": feedSummaries, } if crawlErr != nil { result["error"] = crawlErr.Error() } w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(result) } // handleAPIFilter handles flexible filtering with stackable parameters func (c *Crawler) handleAPIFilter(w http.ResponseWriter, r *http.Request) { tld := r.URL.Query().Get("tld") domain := r.URL.Query().Get("domain") feedStatus := r.URL.Query().Get("feedStatus") domainStatus := r.URL.Query().Get("domainStatus") languages := r.URL.Query().Get("languages") // comma-separated list show := r.URL.Query().Get("show") // "feeds" or "domains" sort := r.URL.Query().Get("sort") // "alpha" or "feeds" limit := 100 offset := 0 if l := r.URL.Query().Get("limit"); l != "" { fmt.Sscanf(l, "%d", &limit) if limit > 500 { limit = 500 } } if o := r.URL.Query().Get("offset"); o != "" { fmt.Sscanf(o, "%d", &offset) } // Parse languages into slice var langList []string if languages != "" { for _, lang := range strings.Split(languages, ",") { lang = strings.TrimSpace(lang) if lang != "" { langList = append(langList, lang) } } } // Determine what to show based on filters if show == "" { if feedStatus != "" || domain != "" || len(langList) > 0 { show = "feeds" } else { show = "domains" } } if show == "feeds" { c.filterFeeds(w, tld, domain, 
feedStatus, langList, limit, offset) } else { c.filterDomains(w, tld, domainStatus, sort, limit, offset) } } func (c *Crawler) filterDomains(w http.ResponseWriter, tld, status, sort string, limit, offset int) { var args []interface{} argNum := 1 query := ` SELECT host, tld, status, last_error, feeds_found FROM domains WHERE 1=1` if tld != "" { query += fmt.Sprintf(" AND tld = $%d", argNum) args = append(args, tld) argNum++ } if status != "" { query += fmt.Sprintf(" AND status = $%d", argNum) args = append(args, status) argNum++ } // Sort by feed count descending or alphabetically if sort == "feeds" { query += fmt.Sprintf(" ORDER BY feeds_found DESC, host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1) } else { query += fmt.Sprintf(" ORDER BY tld ASC, host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1) } args = append(args, limit, offset) rows, err := c.db.Query(query, args...) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } defer rows.Close() type DomainInfo struct { Host string `json:"host"` TLD string `json:"tld"` Status string `json:"status"` LastError string `json:"last_error,omitempty"` FeedCount int `json:"feed_count"` } var domains []DomainInfo for rows.Next() { var d DomainInfo var tldVal, lastError *string if err := rows.Scan(&d.Host, &tldVal, &d.Status, &lastError, &d.FeedCount); err != nil { continue } d.TLD = StringValue(tldVal) d.LastError = StringValue(lastError) domains = append(domains, d) } w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(map[string]interface{}{ "type": "domains", "data": domains, }) } func (c *Crawler) handleAPITLDDomains(w http.ResponseWriter, r *http.Request) { tld := r.URL.Query().Get("tld") if tld == "" { http.Error(w, "tld parameter required", http.StatusBadRequest) return } limit := 100 offset := 0 if l := r.URL.Query().Get("limit"); l != "" { fmt.Sscanf(l, "%d", &limit) if limit > 500 { limit = 500 } } if o := r.URL.Query().Get("offset"); o != "" { fmt.Sscanf(o, 
"%d", &offset) } rows, err := c.db.Query(` SELECT host, status, last_error, feeds_found FROM domains WHERE tld = $1 ORDER BY host ASC LIMIT $2 OFFSET $3 `, tld, limit, offset) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } defer rows.Close() type DomainInfo struct { Host string `json:"host"` Status string `json:"status"` LastError string `json:"last_error,omitempty"` FeedCount int `json:"feed_count"` } var domains []DomainInfo for rows.Next() { var d DomainInfo var lastError *string if err := rows.Scan(&d.Host, &d.Status, &lastError, &d.FeedCount); err != nil { continue } d.LastError = StringValue(lastError) domains = append(domains, d) } w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(domains) } func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) { status := r.URL.Query().Get("status") // domain status: pass, skip, hold, dead feedMode := r.URL.Query().Get("feedMode") // include or exclude feedStatuses := r.URL.Query().Get("feedStatuses") // comma-separated: pass,skip,hold,dead feedTypes := r.URL.Query().Get("feedTypes") // comma-separated: rss,atom,json,unknown,empty search := r.URL.Query().Get("search") // search query // Parse comma-separated values var statusList, typeList []string if feedStatuses != "" { statusList = strings.Split(feedStatuses, ",") } if feedTypes != "" { typeList = strings.Split(feedTypes, ",") } var rows pgx.Rows var err error // If feed filter is specified, query from feeds table instead if len(statusList) > 0 || len(typeList) > 0 || feedMode == "exclude" { // Build query to get TLDs from feeds query := `SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM feeds WHERE tld IS NOT NULL` args := []interface{}{} argNum := 1 // Handle status filters (publish_status for pass/skip/hold/dead) if len(statusList) > 0 { if feedMode == "exclude" { query += fmt.Sprintf(" AND (publish_status IS NULL OR publish_status NOT IN (SELECT unnest($%d::text[])))", 
// handleAPITLDs returns a JSON list of TLDs with their domain counts.
//
// Four query strategies, chosen in order:
//  1. feed-level filters present (feedStatuses/feedTypes/feedMode=exclude):
//     aggregate from the feeds table, optionally narrowed by a search term;
//  2. search only: delegate query construction to buildTLDSearchQuery;
//  3. domain status only: aggregate from the domains table;
//  4. no filters: every label of the tld_enum type, with zero counts filled in.
func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
	status := r.URL.Query().Get("status")           // domain status: pass, skip, hold, dead
	feedMode := r.URL.Query().Get("feedMode")       // include or exclude
	feedStatuses := r.URL.Query().Get("feedStatuses") // comma-separated: pass,skip,hold,dead
	feedTypes := r.URL.Query().Get("feedTypes")     // comma-separated: rss,atom,json,unknown,empty
	search := r.URL.Query().Get("search")           // search query
	// Parse comma-separated values
	var statusList, typeList []string
	if feedStatuses != "" {
		statusList = strings.Split(feedStatuses, ",")
	}
	if feedTypes != "" {
		typeList = strings.Split(feedTypes, ",")
	}
	var rows pgx.Rows
	var err error
	// If feed filter is specified, query from feeds table instead
	if len(statusList) > 0 || len(typeList) > 0 || feedMode == "exclude" {
		// Build query to get TLDs from feeds. argNum tracks the next free
		// positional placeholder; it must stay in lockstep with args.
		query := `SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM feeds WHERE tld IS NOT NULL`
		args := []interface{}{}
		argNum := 1
		// Handle status filters (publish_status for pass/skip/hold/dead)
		if len(statusList) > 0 {
			if feedMode == "exclude" {
				// NULL publish_status is kept in exclude mode.
				query += fmt.Sprintf(" AND (publish_status IS NULL OR publish_status NOT IN (SELECT unnest($%d::text[])))", argNum)
			} else {
				query += fmt.Sprintf(" AND publish_status IN (SELECT unnest($%d::text[]))", argNum)
			}
			args = append(args, statusList)
			argNum++
		}
		// Handle type filters (including special "empty" type, meaning a feed
		// with no items; it is expressed via item_count, not the type column)
		if len(typeList) > 0 {
			hasEmpty := false
			var regularTypes []string
			for _, t := range typeList {
				if t == "empty" {
					hasEmpty = true
				} else {
					regularTypes = append(regularTypes, t)
				}
			}
			if feedMode == "exclude" {
				// Exclude mode: exclude these types
				if len(regularTypes) > 0 && hasEmpty {
					query += fmt.Sprintf(" AND type NOT IN (SELECT unnest($%d::text[])) AND item_count > 0", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if len(regularTypes) > 0 {
					query += fmt.Sprintf(" AND (type IS NULL OR type NOT IN (SELECT unnest($%d::text[])))", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if hasEmpty {
					query += " AND item_count > 0"
				}
			} else {
				// Include mode: include these types
				if len(regularTypes) > 0 && hasEmpty {
					query += fmt.Sprintf(" AND (type IN (SELECT unnest($%d::text[])) OR item_count IS NULL OR item_count = 0)", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if len(regularTypes) > 0 {
					query += fmt.Sprintf(" AND type IN (SELECT unnest($%d::text[]))", argNum)
					args = append(args, regularTypes)
					argNum++
				} else if hasEmpty {
					query += " AND (item_count IS NULL OR item_count = 0)"
				}
			}
		}
		if search != "" {
			sq := parseSearchPrefix(search)
			searchPattern := "%" + strings.ToLower(sq.Pattern) + "%"
			// Only extract TLD for domain searches (d:npr.org -> exact match for npr.org)
			var tldFilter string
			var exactMatch bool
			hostSearchPattern := searchPattern
			if sq.Type == "domain" {
				hostPattern, detectedTLD := parseSearchTerm(sq.Pattern)
				if detectedTLD != "" {
					tldFilter = detectedTLD
					exactMatch = true
					hostSearchPattern = "%" + strings.ToLower(hostPattern) + "%"
				}
			}
			switch sq.Type {
			case "domain":
				// Search domain names
				if exactMatch && tldFilter != "" {
					// d:npr.org -> exact match (source_host = 'npr.org')
					query += fmt.Sprintf(" AND LOWER(source_host) = $%d", argNum)
					args = append(args, strings.ToLower(sq.Pattern))
				} else if tldFilter != "" {
					query += fmt.Sprintf(" AND tld = $%d AND LOWER(source_host) LIKE $%d", argNum, argNum+1)
					args = append(args, tldFilter, hostSearchPattern)
				} else {
					query += fmt.Sprintf(" AND LOWER(source_host) LIKE $%d", argNum)
					args = append(args, hostSearchPattern)
				}
			case "url":
				query += fmt.Sprintf(" AND LOWER(url) LIKE $%d", argNum)
				args = append(args, searchPattern)
			case "title":
				query += fmt.Sprintf(" AND LOWER(title) LIKE $%d", argNum)
				args = append(args, searchPattern)
			case "description":
				query += fmt.Sprintf(" AND LOWER(description) LIKE $%d", argNum)
				args = append(args, searchPattern)
			case "item":
				query += fmt.Sprintf(" AND EXISTS (SELECT 1 FROM items i WHERE i.feed_url = feeds.url AND LOWER(i.title) LIKE $%d)", argNum)
				args = append(args, searchPattern)
			default:
				// "all" - search domains and feeds (NOT items - use i: prefix for item search)
				// Also include exact domain match if pattern looks like a domain
				if sq.DomainHost != "" && sq.DomainTLD != "" {
					fullDomain := strings.ToLower(sq.DomainHost + "." + sq.DomainTLD)
					query += fmt.Sprintf(` AND ( LOWER(source_host) LIKE $%d OR LOWER(url) LIKE $%d OR LOWER(title) LIKE $%d OR LOWER(description) LIKE $%d OR LOWER(source_host) = $%d )`, argNum, argNum, argNum, argNum, argNum+1)
					args = append(args, searchPattern, fullDomain)
				} else {
					query += fmt.Sprintf(` AND ( LOWER(source_host) LIKE $%d OR LOWER(url) LIKE $%d OR LOWER(title) LIKE $%d OR LOWER(description) LIKE $%d )`, argNum, argNum, argNum, argNum)
					args = append(args, searchPattern)
				}
			}
		}
		query += " GROUP BY tld ORDER BY tld ASC"
		rows, err = c.db.Query(query, args...)
	} else if search != "" {
		// Parse search prefix for type-specific searching
		sq := parseSearchPrefix(search)
		// Use the helper to build the TLD search query
		query, args := buildTLDSearchQuery(sq)
		rows, err = c.db.Query(query, args...)
	} else if status != "" {
		// TLDs filtered by domain status
		rows, err = c.db.Query(`
			SELECT tld::text as tld, COUNT(*) as domain_count
			FROM domains
			WHERE tld IS NOT NULL AND status = $1
			GROUP BY tld
			HAVING COUNT(*) > 0
			ORDER BY tld ASC
		`, status)
	} else {
		// All TLDs from enum with domain counts
		rows, err = c.db.Query(`
			SELECT e.enumlabel as tld, COALESCE(d.cnt, 0) as domain_count
			FROM pg_enum e
			LEFT JOIN (
				SELECT tld::text as tld, COUNT(*) as cnt FROM domains GROUP BY tld
			) d ON e.enumlabel = d.tld
			WHERE e.enumtypid = 'tld_enum'::regtype
			ORDER BY e.enumlabel ASC
		`)
	}
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()
	type TLDInfo struct {
		TLD         string `json:"tld"`
		DomainCount int    `json:"domain_count"`
	}
	var tlds []TLDInfo
	for rows.Next() {
		var t TLDInfo
		if err := rows.Scan(&t.TLD, &t.DomainCount); err != nil {
			continue
		}
		tlds = append(tlds, t)
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(tlds)
}
} else if status != "" { // TLDs filtered by domain status rows, err = c.db.Query(` SELECT tld::text as tld, COUNT(*) as domain_count FROM domains WHERE tld IS NOT NULL AND status = $1 GROUP BY tld HAVING COUNT(*) > 0 ORDER BY tld ASC `, status) } else { // All TLDs from enum with domain counts rows, err = c.db.Query(` SELECT e.enumlabel as tld, COALESCE(d.cnt, 0) as domain_count FROM pg_enum e LEFT JOIN ( SELECT tld::text as tld, COUNT(*) as cnt FROM domains GROUP BY tld ) d ON e.enumlabel = d.tld WHERE e.enumtypid = 'tld_enum'::regtype ORDER BY e.enumlabel ASC `) } if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } defer rows.Close() type TLDInfo struct { TLD string `json:"tld"` DomainCount int `json:"domain_count"` } var tlds []TLDInfo for rows.Next() { var t TLDInfo if err := rows.Scan(&t.TLD, &t.DomainCount); err != nil { continue } tlds = append(tlds, t) } w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(tlds) } func (c *Crawler) handleAPITLDStats(w http.ResponseWriter, r *http.Request) { tld := r.URL.Query().Get("tld") if tld == "" { http.Error(w, "tld parameter required", http.StatusBadRequest) return } search := r.URL.Query().Get("search") stats := map[string]interface{}{ "tld": tld, } // Build WHERE clause based on whether search is provided var domainWhere, feedWhere string var domainArgs, feedArgs []interface{} if search != "" { // Parse search prefix for type-specific searching sq := parseSearchPrefix(search) searchPattern := "%" + strings.ToLower(sq.Pattern) + "%" // For domain searches, check for exact match if sq.Type == "domain" { hostPart, detectedTLD := parseSearchTerm(sq.Pattern) if detectedTLD != "" { // d:npr.org -> exact match for host "npr" in specified TLD domainWhere = "tld = $1 AND lower(host) = $2" domainArgs = []interface{}{tld, strings.ToLower(hostPart)} feedWhere = "tld = $1 AND lower(source_host) = $2" feedArgs = []interface{}{tld, strings.ToLower(sq.Pattern)} } else { // 
d:npr -> pattern match in specified TLD domainWhere = "tld = $1 AND lower(host) LIKE $2" domainArgs = []interface{}{tld, searchPattern} feedWhere = "tld = $1 AND lower(source_host) LIKE $2" feedArgs = []interface{}{tld, searchPattern} } } else { // Other search types - pattern match domainWhere = "tld = $1 AND lower(host) LIKE $2" domainArgs = []interface{}{tld, searchPattern} feedWhere = "tld = $1 AND lower(source_host) LIKE $2" feedArgs = []interface{}{tld, searchPattern} } stats["search"] = search } else { // Filter by TLD only domainWhere = "tld = $1" domainArgs = []interface{}{tld} feedWhere = "tld = $1" feedArgs = []interface{}{tld} } // Domain stats by status var totalDomains, passDomains, skipDomains, holdDomains, deadDomains int err := c.db.QueryRow(`SELECT COUNT(*) FROM domains WHERE `+domainWhere, domainArgs...).Scan(&totalDomains) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } stats["total_domains"] = totalDomains rows, err := c.db.Query(`SELECT status, COUNT(*) FROM domains WHERE `+domainWhere+` GROUP BY status`, domainArgs...) 
if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } for rows.Next() { var status string var count int if err := rows.Scan(&status, &count); err != nil { continue } switch status { case "pass": passDomains = count case "skip": skipDomains = count case "hold": holdDomains = count case "dead": deadDomains = count } } rows.Close() stats["pass_domains"] = passDomains stats["skip_domains"] = skipDomains stats["hold_domains"] = holdDomains stats["dead_domains"] = deadDomains // Feed stats var totalFeeds, passFeeds, skipFeeds, holdFeeds, deadFeeds, emptyFeeds int var rssFeeds, atomFeeds, jsonFeeds, unknownFeeds int err = c.db.QueryRow(`SELECT COUNT(*) FROM feeds WHERE `+feedWhere, feedArgs...).Scan(&totalFeeds) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } stats["total_feeds"] = totalFeeds // Feed status counts statusRows, err := c.db.Query(`SELECT COALESCE(status, 'hold'), COUNT(*) FROM feeds WHERE `+feedWhere+` GROUP BY status`, feedArgs...) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } for statusRows.Next() { var status string var count int if err := statusRows.Scan(&status, &count); err != nil { continue } switch status { case "pass": passFeeds = count case "skip": skipFeeds = count case "hold": holdFeeds = count case "dead": deadFeeds = count } } statusRows.Close() stats["pass_feeds"] = passFeeds stats["skip_feeds"] = skipFeeds stats["hold_feeds"] = holdFeeds stats["dead_feeds"] = deadFeeds // Empty feeds count c.db.QueryRow(`SELECT COUNT(*) FROM feeds WHERE (`+feedWhere+`) AND (item_count IS NULL OR item_count = 0)`, feedArgs...).Scan(&emptyFeeds) stats["empty_feeds"] = emptyFeeds // Feed type counts typeRows, err := c.db.Query(`SELECT COALESCE(type, 'unknown'), COUNT(*) FROM feeds WHERE `+feedWhere+` GROUP BY type`, feedArgs...) 
if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } for typeRows.Next() { var feedType string var count int if err := typeRows.Scan(&feedType, &count); err != nil { continue } switch feedType { case "rss": rssFeeds = count case "atom": atomFeeds = count case "json": jsonFeeds = count default: unknownFeeds += count } } typeRows.Close() stats["rss_feeds"] = rssFeeds stats["atom_feeds"] = atomFeeds stats["json_feeds"] = jsonFeeds stats["unknown_feeds"] = unknownFeeds w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(stats) } func (c *Crawler) handleAPISearchStats(w http.ResponseWriter, r *http.Request) { search := r.URL.Query().Get("search") if search == "" { http.Error(w, "search parameter required", http.StatusBadRequest) return } // Parse search prefix for type-specific searching sq := parseSearchPrefix(search) searchPattern := "%" + strings.ToLower(sq.Pattern) + "%" // Only extract TLD for domain searches (d:npr.org -> exact match for npr.org) var tldFilter, hostPart string var exactMatch bool if sq.Type == "domain" { hostPart, tldFilter = parseSearchTerm(sq.Pattern) if tldFilter != "" { searchPattern = "%" + strings.ToLower(hostPart) + "%" exactMatch = true } } stats := map[string]interface{}{} // Build WHERE clause based on search type var domainWhere, feedWhere string var domainArgs, feedArgs []interface{} switch sq.Type { case "domain": if exactMatch && tldFilter != "" { // d:npr.org -> exact match domainWhere = "tld = $1 AND LOWER(host) = $2" domainArgs = []interface{}{tldFilter, strings.ToLower(hostPart)} feedWhere = "LOWER(source_host) = $1" feedArgs = []interface{}{strings.ToLower(sq.Pattern)} } else if tldFilter != "" { domainWhere = "tld = $1 AND LOWER(host) LIKE $2" domainArgs = []interface{}{tldFilter, searchPattern} feedWhere = "tld = $1 AND LOWER(source_host) LIKE $2" feedArgs = []interface{}{tldFilter, searchPattern} } else { domainWhere = "LOWER(host) LIKE $1" domainArgs = 
[]interface{}{searchPattern} feedWhere = "LOWER(source_host) LIKE $1" feedArgs = []interface{}{searchPattern} } case "url": domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.url) LIKE $1)" domainArgs = []interface{}{searchPattern} feedWhere = "LOWER(url) LIKE $1" feedArgs = []interface{}{searchPattern} case "title": domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.title) LIKE $1)" domainArgs = []interface{}{searchPattern} feedWhere = "LOWER(title) LIKE $1" feedArgs = []interface{}{searchPattern} case "description": domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.description) LIKE $1)" domainArgs = []interface{}{searchPattern} feedWhere = "LOWER(description) LIKE $1" feedArgs = []interface{}{searchPattern} case "item": domainWhere = "EXISTS (SELECT 1 FROM feeds f INNER JOIN items i ON i.feed_url = f.url WHERE f.source_host = (host || '.' || tld) AND LOWER(i.title) LIKE $1)" domainArgs = []interface{}{searchPattern} feedWhere = "EXISTS (SELECT 1 FROM items i WHERE i.feed_url = url AND LOWER(i.title) LIKE $1)" feedArgs = []interface{}{searchPattern} default: // "all" - search domains and feeds (NOT items - use i: prefix for item search) // Also include exact domain match if pattern looks like a domain if sq.DomainHost != "" && sq.DomainTLD != "" { domainWhere = `( LOWER(host) LIKE $1 OR (LOWER(host) = $2 AND LOWER(tld) = $3) OR EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND ( LOWER(f.url) LIKE $1 OR LOWER(f.title) LIKE $1 OR LOWER(f.description) LIKE $1 )) )` domainArgs = []interface{}{searchPattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)} fullDomain := strings.ToLower(sq.DomainHost + "." 
+ sq.DomainTLD) feedWhere = `( LOWER(source_host) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1 OR LOWER(source_host) = $2 )` feedArgs = []interface{}{searchPattern, fullDomain} } else { domainWhere = `( LOWER(host) LIKE $1 OR EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND ( LOWER(f.url) LIKE $1 OR LOWER(f.title) LIKE $1 OR LOWER(f.description) LIKE $1 )) )` domainArgs = []interface{}{searchPattern} feedWhere = `( LOWER(source_host) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1 )` feedArgs = []interface{}{searchPattern} } } // Count matching domains by status var totalDomains, passDomains, skipDomains, holdDomains, deadDomains int rows, err := c.db.Query(`SELECT status, COUNT(*) FROM domains WHERE `+domainWhere+` GROUP BY status`, domainArgs...) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } for rows.Next() { var status string var count int if err := rows.Scan(&status, &count); err != nil { continue } totalDomains += count switch status { case "pass": passDomains = count case "skip": skipDomains = count case "hold": holdDomains = count case "dead": deadDomains = count } } rows.Close() stats["total_domains"] = totalDomains stats["pass_domains"] = passDomains stats["skip_domains"] = skipDomains stats["hold_domains"] = holdDomains stats["dead_domains"] = deadDomains // Count matching feeds by status var totalFeeds, passFeeds, skipFeeds, holdFeeds, deadFeeds, emptyFeeds int var rssFeeds, atomFeeds, jsonFeeds, unknownFeeds int statusRows, err := c.db.Query(`SELECT COALESCE(status, 'hold'), COUNT(*) FROM feeds WHERE `+feedWhere+` GROUP BY status`, feedArgs...) 
if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } for statusRows.Next() { var status string var count int if err := statusRows.Scan(&status, &count); err != nil { continue } totalFeeds += count switch status { case "pass": passFeeds = count case "skip": skipFeeds = count case "hold": holdFeeds = count case "dead": deadFeeds = count } } statusRows.Close() stats["total_feeds"] = totalFeeds stats["pass_feeds"] = passFeeds stats["skip_feeds"] = skipFeeds stats["hold_feeds"] = holdFeeds stats["dead_feeds"] = deadFeeds // Count empty feeds c.db.QueryRow(`SELECT COUNT(*) FROM feeds WHERE (`+feedWhere+`) AND (item_count IS NULL OR item_count = 0)`, feedArgs...).Scan(&emptyFeeds) stats["empty_feeds"] = emptyFeeds typeRows, err := c.db.Query(`SELECT COALESCE(type, 'unknown'), COUNT(*) FROM feeds WHERE `+feedWhere+` GROUP BY type`, feedArgs...) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } for typeRows.Next() { var feedType string var count int if err := typeRows.Scan(&feedType, &count); err != nil { continue } switch feedType { case "rss": rssFeeds = count case "atom": atomFeeds = count case "json": jsonFeeds = count default: unknownFeeds += count } } typeRows.Close() stats["rss_feeds"] = rssFeeds stats["atom_feeds"] = atomFeeds stats["json_feeds"] = jsonFeeds stats["unknown_feeds"] = unknownFeeds w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(stats) } // handleAPIDenyDomain skips a domain (takedown accounts, preserve data) func (c *Crawler) handleAPIDenyDomain(w http.ResponseWriter, r *http.Request) { host := r.URL.Query().Get("host") if host == "" { http.Error(w, "host parameter required", http.StatusBadRequest) return } result := c.skipDomain(host) if result.Error != "" { http.Error(w, result.Error, http.StatusInternalServerError) return } w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(result) } // DomainActionResult contains the results of a 
domain action type DomainActionResult struct { Success bool `json:"success"` Host string `json:"host"` Action string `json:"action"` FeedsAffected int64 `json:"feeds_affected,omitempty"` ItemsDeleted int64 `json:"items_deleted,omitempty"` AccountsAffected int `json:"accounts_affected,omitempty"` AccountErrors []string `json:"account_errors,omitempty"` Error string `json:"error,omitempty"` } // getPDSCredentials loads PDS credentials from environment or pds.env file func getPDSCredentials() (pdsHost, pdsAdminPassword string) { pdsHost = os.Getenv("PDS_HOST") pdsAdminPassword = os.Getenv("PDS_ADMIN_PASSWORD") if pdsHost == "" || pdsAdminPassword == "" { if file, err := os.Open("pds.env"); err == nil { scanner := bufio.NewScanner(file) for scanner.Scan() { line := scanner.Text() if strings.HasPrefix(line, "PDS_HOST=") { pdsHost = strings.TrimPrefix(line, "PDS_HOST=") } else if strings.HasPrefix(line, "PDS_ADMIN_PASSWORD=") { pdsAdminPassword = strings.TrimPrefix(line, "PDS_ADMIN_PASSWORD=") } } file.Close() } } return } // getDomainDIDs returns all unique publish_account DIDs for a domain's feeds func (c *Crawler) getDomainDIDs(host string) []string { var dids []string rows, err := c.db.Query(` SELECT DISTINCT publish_account FROM feeds WHERE source_host = $1 AND publish_account IS NOT NULL AND publish_account != '' `, host) if err == nil { defer rows.Close() for rows.Next() { var did string if err := rows.Scan(&did); err == nil && did != "" { dids = append(dids, did) } } } return dids } // skipDomain sets a domain to skip, takes down PDS accounts but preserves all data func (c *Crawler) skipDomain(host string) DomainActionResult { result := DomainActionResult{Host: host, Action: "skip"} pdsHost, pdsAdminPassword := getPDSCredentials() dids := c.getDomainDIDs(host) // Takedown PDS accounts (hide content but preserve data) if pdsHost != "" && pdsAdminPassword != "" && len(dids) > 0 { publisher := NewPublisher(pdsHost) for _, did := range dids { if err := 
publisher.TakedownAccount(pdsAdminPassword, did, "domain-skip"); err != nil { result.AccountErrors = append(result.AccountErrors, fmt.Sprintf("%s: %v", did, err)) } else { result.AccountsAffected++ } } } // Mark feeds as skipped (but don't delete) feedsAffected, err := c.db.Exec(` UPDATE feeds SET status = 'skip', publish_status = 'skip' WHERE source_host = $1 `, host) if err != nil { result.Error = fmt.Sprintf("failed to update feeds: %v", err) return result } result.FeedsAffected = feedsAffected // Update domain status to skip _, err = c.db.Exec(`UPDATE domains SET status = 'skip' WHERE host = $1 AND tld = $2`, stripTLD(host), getTLD(host)) if err != nil { result.Error = fmt.Sprintf("failed to update domain status: %v", err) return result } result.Success = true return result } // handleAPIDropDomain permanently deletes all data for a skipped domain func (c *Crawler) handleAPIDropDomain(w http.ResponseWriter, r *http.Request) { host := r.URL.Query().Get("host") if host == "" { http.Error(w, "host parameter required", http.StatusBadRequest) return } // Verify domain is currently skipped var status string err := c.db.QueryRow(`SELECT status FROM domains WHERE host = $1 AND tld = $2`, stripTLD(host), getTLD(host)).Scan(&status) if err != nil { http.Error(w, "domain not found", http.StatusNotFound) return } if status != "skip" { http.Error(w, "domain must be skipped before dropping", http.StatusBadRequest) return } result := c.dropDomain(host) if result.Error != "" { http.Error(w, result.Error, http.StatusInternalServerError) return } w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(result) } // dropDomain permanently deletes all data for a domain (feeds, items, PDS accounts) func (c *Crawler) dropDomain(host string) DomainActionResult { result := DomainActionResult{Host: host, Action: "drop"} pdsHost, pdsAdminPassword := getPDSCredentials() dids := c.getDomainDIDs(host) // Delete PDS accounts if pdsHost != "" && pdsAdminPassword != "" && 
len(dids) > 0 { publisher := NewPublisher(pdsHost) for _, did := range dids { if err := publisher.DeleteAccount(pdsAdminPassword, did); err != nil { result.AccountErrors = append(result.AccountErrors, fmt.Sprintf("%s: %v", did, err)) } else { result.AccountsAffected++ } } } // Get feed URLs for this domain (needed to delete items) var feedURLs []string feedRows, err := c.db.Query(`SELECT url FROM feeds WHERE source_host = $1`, host) if err == nil { defer feedRows.Close() for feedRows.Next() { var url string if err := feedRows.Scan(&url); err == nil { feedURLs = append(feedURLs, url) } } } // Delete items for all feeds from this domain for _, feedURL := range feedURLs { deleted, err := c.db.Exec(`DELETE FROM items WHERE feed_url = $1`, feedURL) if err == nil { result.ItemsDeleted += deleted } } // Delete all feeds from this domain feedsDeleted, err := c.db.Exec(`DELETE FROM feeds WHERE source_host = $1`, host) if err != nil { result.Error = fmt.Sprintf("failed to delete feeds: %v", err) return result } result.FeedsAffected = feedsDeleted // Update domain status to drop _, err = c.db.Exec(`UPDATE domains SET status = 'drop' WHERE host = $1 AND tld = $2`, stripTLD(host), getTLD(host)) if err != nil { result.Error = fmt.Sprintf("failed to update domain status: %v", err) return result } result.Success = true return result } // handleAPIUndenyDomain removes skip status from a domain (restores accounts) func (c *Crawler) handleAPIUndenyDomain(w http.ResponseWriter, r *http.Request) { host := r.URL.Query().Get("host") if host == "" { http.Error(w, "host parameter required", http.StatusBadRequest) return } // Verify domain is currently skipped var status string err := c.db.QueryRow(`SELECT status FROM domains WHERE host = $1 AND tld = $2`, stripTLD(host), getTLD(host)).Scan(&status) if err != nil { http.Error(w, "domain not found", http.StatusNotFound) return } if status != "skip" { http.Error(w, "domain is not skipped", http.StatusBadRequest) return } result := 
c.restoreDomain(host) if result.Error != "" { http.Error(w, result.Error, http.StatusInternalServerError) return } w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(result) } // restoreDomain removes skip status and restores PDS accounts func (c *Crawler) restoreDomain(host string) DomainActionResult { result := DomainActionResult{Host: host, Action: "restore"} pdsHost, pdsAdminPassword := getPDSCredentials() dids := c.getDomainDIDs(host) // Restore PDS accounts (remove takedown) if pdsHost != "" && pdsAdminPassword != "" && len(dids) > 0 { publisher := NewPublisher(pdsHost) for _, did := range dids { if err := publisher.RestoreAccount(pdsAdminPassword, did); err != nil { result.AccountErrors = append(result.AccountErrors, fmt.Sprintf("%s: %v", did, err)) } else { result.AccountsAffected++ } } } // Restore feeds to pass status feedsAffected, err := c.db.Exec(` UPDATE feeds SET status = 'pass', publish_status = 'pass' WHERE source_host = $1 `, host) if err != nil { result.Error = fmt.Sprintf("failed to update feeds: %v", err) return result } result.FeedsAffected = feedsAffected // Update domain status back to pass _, err = c.db.Exec(` UPDATE domains SET status = 'pass', last_error = NULL WHERE host = $1 AND tld = $2 `, stripTLD(host), getTLD(host)) if err != nil { result.Error = fmt.Sprintf("failed to update domain status: %v", err) return result } result.Success = true return result }