From c6ec482d1fe22a7ea3e3fc50961170a5fb34d50b Mon Sep 17 00:00:00 2001 From: primal Date: Sun, 1 Feb 2026 19:00:50 -0500 Subject: [PATCH] Add exact domain matching for domain-like search queries When searching for patterns like "npr.org", the search now also matches the exact domain (host=npr, tld=org) in addition to the existing text search across domain names, feed URLs, titles, and descriptions. Co-Authored-By: Claude Opus 4.5 --- api_domains.go | 1224 +++++++++++++++++++++++++++++++++++++++++++----- util.go | 132 ++++++ 2 files changed, 1244 insertions(+), 112 deletions(-) diff --git a/api_domains.go b/api_domains.go index dd5c58a..1aa23c1 100644 --- a/api_domains.go +++ b/api_domains.go @@ -11,6 +11,238 @@ import ( "github.com/jackc/pgx/v5" ) +// buildTLDSearchQuery builds a query to get TLDs based on search type +// Returns (query, args) for the database query +func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) { + pattern := "%" + strings.ToLower(sq.Pattern) + "%" + + switch sq.Type { + case "domain": + // Check if pattern includes TLD (e.g., d:npr.org -> exact match) + hostPart, tldFilter := parseSearchTerm(sq.Pattern) + if tldFilter != "" { + // Exact match - return just the matching TLD + return ` + SELECT tld::text as tld, COUNT(*) as domain_count + FROM domains + WHERE tld = $1 AND LOWER(host) = $2 + GROUP BY tld + ORDER BY tld ASC + `, []interface{}{tldFilter, strings.ToLower(hostPart)} + } + // Pattern match - search all TLDs + return ` + SELECT tld::text as tld, COUNT(*) as domain_count + FROM domains + WHERE LOWER(host) LIKE $1 + GROUP BY tld + ORDER BY tld ASC + `, []interface{}{pattern} + + case "url": + // Search feed URL paths (after domain) + return ` + SELECT tld, COUNT(DISTINCT source_host) as domain_count + FROM feeds + WHERE tld IS NOT NULL AND LOWER(url) LIKE $1 + GROUP BY tld + ORDER BY tld ASC + `, []interface{}{pattern} + + case "title": + // Search feed titles + return ` + SELECT tld, COUNT(DISTINCT source_host) as 
domain_count + FROM feeds + WHERE tld IS NOT NULL AND LOWER(title) LIKE $1 + GROUP BY tld + ORDER BY tld ASC + `, []interface{}{pattern} + + case "description": + // Search feed descriptions + return ` + SELECT tld, COUNT(DISTINCT source_host) as domain_count + FROM feeds + WHERE tld IS NOT NULL AND LOWER(description) LIKE $1 + GROUP BY tld + ORDER BY tld ASC + `, []interface{}{pattern} + + case "item": + // Search item titles + return ` + SELECT f.tld, COUNT(DISTINCT f.source_host) as domain_count + FROM feeds f + INNER JOIN items i ON i.feed_url = f.url + WHERE f.tld IS NOT NULL AND LOWER(i.title) LIKE $1 + GROUP BY f.tld + ORDER BY f.tld ASC + `, []interface{}{pattern} + + default: + // "all" - search domains and feeds (NOT items - use i: prefix for item search) + // Also include exact domain match if pattern looks like a domain + if sq.DomainHost != "" && sq.DomainTLD != "" { + return ` + SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM ( + -- Domains matching host pattern + SELECT tld::text as tld, host || '.' || tld as source_host + FROM domains WHERE LOWER(host) LIKE $1 + UNION + -- Exact domain match + SELECT tld::text as tld, host || '.' || tld as source_host + FROM domains WHERE LOWER(host) = $2 AND LOWER(tld) = $3 + UNION + -- Feeds matching URL + SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(url) LIKE $1 + UNION + -- Feeds matching title + SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(title) LIKE $1 + UNION + -- Feeds matching description + SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(description) LIKE $1 + ) combined + GROUP BY tld + ORDER BY tld ASC + `, []interface{}{pattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)} + } + return ` + SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM ( + -- Domains matching host + SELECT tld::text as tld, host || '.' 
|| tld as source_host + FROM domains WHERE LOWER(host) LIKE $1 + UNION + -- Feeds matching URL + SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(url) LIKE $1 + UNION + -- Feeds matching title + SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(title) LIKE $1 + UNION + -- Feeds matching description + SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(description) LIKE $1 + ) combined + GROUP BY tld + ORDER BY tld ASC + `, []interface{}{pattern} + } +} + +// buildDomainSearchQuery builds a query to get domains based on search type +// Returns (whereClause, args, argNum) to append to the base query +func buildDomainSearchQuery(sq SearchQuery, tldFilter string, argNum int) (string, []interface{}, int) { + pattern := "%" + strings.ToLower(sq.Pattern) + "%" + var where string + var args []interface{} + + switch sq.Type { + case "domain": + if sq.ExactMatch && tldFilter != "" { + // d:npr.org -> exact match + where = fmt.Sprintf(" AND d.tld = $%d AND LOWER(d.host) = $%d", argNum, argNum+1) + args = []interface{}{tldFilter, strings.ToLower(sq.Pattern)} + argNum += 2 + } else if tldFilter != "" { + where = fmt.Sprintf(" AND d.tld = $%d AND LOWER(d.host) LIKE $%d", argNum, argNum+1) + args = []interface{}{tldFilter, pattern} + argNum += 2 + } else { + where = fmt.Sprintf(" AND LOWER(d.host) LIKE $%d", argNum) + args = []interface{}{pattern} + argNum++ + } + + case "url": + where = fmt.Sprintf(" AND LOWER(f.url) LIKE $%d", argNum) + args = []interface{}{pattern} + argNum++ + if tldFilter != "" { + where += fmt.Sprintf(" AND d.tld = $%d", argNum) + args = append(args, tldFilter) + argNum++ + } + + case "title": + where = fmt.Sprintf(" AND LOWER(f.title) LIKE $%d", argNum) + args = []interface{}{pattern} + argNum++ + if tldFilter != "" { + where += fmt.Sprintf(" AND d.tld = $%d", argNum) + args = append(args, tldFilter) + argNum++ + } + + case "description": + where = fmt.Sprintf(" AND LOWER(f.description) LIKE $%d", argNum) + 
args = []interface{}{pattern} + argNum++ + if tldFilter != "" { + where += fmt.Sprintf(" AND d.tld = $%d", argNum) + args = append(args, tldFilter) + argNum++ + } + + case "item": + // Need to join items - handled separately + where = fmt.Sprintf(" AND EXISTS (SELECT 1 FROM items i WHERE i.feed_url = f.url AND LOWER(i.title) LIKE $%d)", argNum) + args = []interface{}{pattern} + argNum++ + if tldFilter != "" { + where += fmt.Sprintf(" AND d.tld = $%d", argNum) + args = append(args, tldFilter) + argNum++ + } + + default: + // "all" - search everything, also include exact domain match if pattern looks like a domain + if tldFilter != "" { + if sq.DomainHost != "" && sq.DomainTLD != "" { + where = fmt.Sprintf(` AND d.tld = $%d AND ( + LOWER(d.host) LIKE $%d OR + LOWER(f.url) LIKE $%d OR + LOWER(f.title) LIKE $%d OR + LOWER(f.description) LIKE $%d OR + (LOWER(d.host) = $%d AND LOWER(d.tld) = $%d) + )`, argNum, argNum+1, argNum+1, argNum+1, argNum+1, argNum+2, argNum+3) + args = []interface{}{tldFilter, pattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)} + argNum += 4 + } else { + where = fmt.Sprintf(` AND d.tld = $%d AND ( + LOWER(d.host) LIKE $%d OR + LOWER(f.url) LIKE $%d OR + LOWER(f.title) LIKE $%d OR + LOWER(f.description) LIKE $%d + )`, argNum, argNum+1, argNum+1, argNum+1, argNum+1) + args = []interface{}{tldFilter, pattern} + argNum += 2 + } + } else { + if sq.DomainHost != "" && sq.DomainTLD != "" { + where = fmt.Sprintf(` AND ( + LOWER(d.host) LIKE $%d OR + LOWER(f.url) LIKE $%d OR + LOWER(f.title) LIKE $%d OR + LOWER(f.description) LIKE $%d OR + (LOWER(d.host) = $%d AND LOWER(d.tld) = $%d) + )`, argNum, argNum, argNum, argNum, argNum+1, argNum+2) + args = []interface{}{pattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)} + argNum += 3 + } else { + where = fmt.Sprintf(` AND ( + LOWER(d.host) LIKE $%d OR + LOWER(f.url) LIKE $%d OR + LOWER(f.title) LIKE $%d OR + LOWER(f.description) LIKE $%d + )`, argNum, argNum, argNum, 
argNum) + args = []interface{}{pattern} + argNum++ + } + } + } + + return where, args, argNum +} + func (c *Crawler) handleAPIAllDomains(w http.ResponseWriter, r *http.Request) { offset := 0 limit := 100 @@ -51,6 +283,9 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) { hasFeeds := r.URL.Query().Get("has_feeds") == "true" search := r.URL.Query().Get("search") tldFilter := r.URL.Query().Get("tld") + feedMode := r.URL.Query().Get("feedMode") // include or exclude + feedStatuses := r.URL.Query().Get("feedStatuses") // comma-separated + feedTypes := r.URL.Query().Get("feedTypes") // comma-separated limit := 100 offset := 0 if l := r.URL.Query().Get("limit"); l != "" { @@ -63,14 +298,175 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) { fmt.Sscanf(o, "%d", &offset) } + // Parse comma-separated values + var statusList, typeList []string + if feedStatuses != "" { + statusList = strings.Split(feedStatuses, ",") + } + if feedTypes != "" { + typeList = strings.Split(feedTypes, ",") + } + + // Parse search prefix for type-specific searching + var searchQuery SearchQuery + if search != "" { + searchQuery = parseSearchPrefix(search) + // Only extract TLD for domain searches (d:npr.org -> exact match for npr.org) + // All other searches use the literal pattern + if searchQuery.Type == "domain" { + hostPart, detectedTLD := parseSearchTerm(searchQuery.Pattern) + if detectedTLD != "" { + searchQuery.Pattern = hostPart + searchQuery.ExactMatch = true // d:npr.org matches exactly npr.org + if tldFilter == "" { + tldFilter = detectedTLD + } + } + } + } + // First get domains var rows pgx.Rows var err error - if hasFeeds { + + // If feed filter is specified, query domains that have matching feeds + if len(statusList) > 0 || len(typeList) > 0 || feedMode != "" { + // Build dynamic query to get domains with matching feeds + query := ` + SELECT DISTINCT d.host, d.tld, d.status, d.last_error, d.feeds_found + FROM domains d + INNER 
JOIN feeds f ON f.source_host = (d.host || '.' || d.tld) + WHERE 1=1` + args := []interface{}{} + argNum := 1 + + if tldFilter != "" { + query += fmt.Sprintf(" AND d.tld = $%d", argNum) + args = append(args, tldFilter) + argNum++ + } + if status != "" { + query += fmt.Sprintf(" AND d.status = $%d", argNum) + args = append(args, status) + argNum++ + } + + // Handle status filters (publish_status for pass/skip/hold/dead) + if len(statusList) > 0 { + if feedMode == "exclude" { + query += fmt.Sprintf(" AND (f.publish_status IS NULL OR f.publish_status NOT IN (SELECT unnest($%d::text[])))", argNum) + } else { + query += fmt.Sprintf(" AND f.publish_status IN (SELECT unnest($%d::text[]))", argNum) + } + args = append(args, statusList) + argNum++ + } + + // Handle type filters (including special "empty" type) + if len(typeList) > 0 { + hasEmpty := false + var regularTypes []string + for _, t := range typeList { + if t == "empty" { + hasEmpty = true + } else { + regularTypes = append(regularTypes, t) + } + } + + if feedMode == "exclude" { + // Exclude mode + if len(regularTypes) > 0 && hasEmpty { + query += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[]))) AND f.item_count > 0", argNum) + args = append(args, regularTypes) + argNum++ + } else if len(regularTypes) > 0 { + query += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[])))", argNum) + args = append(args, regularTypes) + argNum++ + } else if hasEmpty { + query += " AND f.item_count > 0" + } + } else { + // Include mode + if len(regularTypes) > 0 && hasEmpty { + query += fmt.Sprintf(" AND (f.type IN (SELECT unnest($%d::text[])) OR f.item_count IS NULL OR f.item_count = 0)", argNum) + args = append(args, regularTypes) + argNum++ + } else if len(regularTypes) > 0 { + query += fmt.Sprintf(" AND f.type IN (SELECT unnest($%d::text[]))", argNum) + args = append(args, regularTypes) + argNum++ + } else if hasEmpty { + query += " AND (f.item_count IS NULL OR 
f.item_count = 0)" + } + } + } + + if search != "" && searchQuery.Pattern != "" { + searchPattern := "%" + strings.ToLower(searchQuery.Pattern) + "%" + switch searchQuery.Type { + case "domain": + if searchQuery.ExactMatch { + // d:npr.org -> exact match for host "npr" (tld already filtered above) + query += fmt.Sprintf(" AND LOWER(d.host) = $%d", argNum) + args = append(args, strings.ToLower(searchQuery.Pattern)) + } else { + // d:npr -> pattern match + query += fmt.Sprintf(" AND LOWER(d.host) LIKE $%d", argNum) + args = append(args, searchPattern) + } + argNum++ + case "url": + query += fmt.Sprintf(" AND LOWER(f.url) LIKE $%d", argNum) + args = append(args, searchPattern) + argNum++ + case "title": + query += fmt.Sprintf(" AND LOWER(f.title) LIKE $%d", argNum) + args = append(args, searchPattern) + argNum++ + case "description": + query += fmt.Sprintf(" AND LOWER(f.description) LIKE $%d", argNum) + args = append(args, searchPattern) + argNum++ + case "item": + query += fmt.Sprintf(" AND EXISTS (SELECT 1 FROM items i WHERE i.feed_url = f.url AND LOWER(i.title) LIKE $%d)", argNum) + args = append(args, searchPattern) + argNum++ + default: + // "all" - search domains and feeds (NOT items - use i: prefix for item search) + // Also include exact domain match if pattern looks like a domain + if searchQuery.DomainHost != "" && searchQuery.DomainTLD != "" { + query += fmt.Sprintf(` AND ( + LOWER(d.host) LIKE $%d OR + LOWER(f.url) LIKE $%d OR + LOWER(f.title) LIKE $%d OR + LOWER(f.description) LIKE $%d OR + (LOWER(d.host) = $%d AND LOWER(d.tld) = $%d) + )`, argNum, argNum, argNum, argNum, argNum+1, argNum+2) + args = append(args, searchPattern, strings.ToLower(searchQuery.DomainHost), strings.ToLower(searchQuery.DomainTLD)) + argNum += 3 + } else { + query += fmt.Sprintf(` AND ( + LOWER(d.host) LIKE $%d OR + LOWER(f.url) LIKE $%d OR + LOWER(f.title) LIKE $%d OR + LOWER(f.description) LIKE $%d + )`, argNum, argNum, argNum, argNum) + args = append(args, searchPattern) + 
argNum++ + } + } + } + query += fmt.Sprintf(" ORDER BY d.host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1) + args = append(args, limit, offset) + + rows, err = c.db.Query(query, args...) + } else if hasFeeds { // Only domains with feeds searchPattern := "%" + strings.ToLower(search) + "%" - if tldFilter != "" { - // Filter by specific TLD + if tldFilter != "" && status != "" { + // Filter by specific TLD and status rows, err = c.db.Query(` SELECT d.host, d.tld, d.status, d.last_error, f.feed_count FROM domains d @@ -79,25 +475,38 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) { FROM feeds WHERE item_count > 0 GROUP BY source_host - ) f ON d.host = f.source_host - WHERE d.status != 'skip' AND d.tld = $1 + ) f ON (d.host || '.' || d.tld) = f.source_host + WHERE d.tld = $1 AND d.status = $2 ORDER BY d.host ASC - LIMIT $2 OFFSET $3 - `, tldFilter, limit, offset) - } else if search != "" { - // Search in domain host or feed title/url + LIMIT $3 OFFSET $4 + `, tldFilter, status, limit, offset) + } else if tldFilter != "" { + // Filter by specific TLD only (exclude 'skip' by default) rows, err = c.db.Query(` - SELECT DISTINCT d.host, d.tld, d.status, d.last_error, f.feed_count + SELECT d.host, d.tld, d.status, d.last_error, f.feed_count FROM domains d INNER JOIN ( SELECT source_host, COUNT(*) as feed_count FROM feeds WHERE item_count > 0 GROUP BY source_host - ) f ON d.host = f.source_host - LEFT JOIN feeds fe ON d.host = fe.source_host - WHERE d.status != 'skip' - AND (LOWER(d.host) LIKE $1 OR LOWER(fe.title) LIKE $1 OR LOWER(fe.url) LIKE $1) + ) f ON (d.host || '.' 
|| d.tld) = f.source_host + WHERE d.status != 'skip' AND d.tld = $1 + ORDER BY d.host ASC + LIMIT $2 OFFSET $3 + `, tldFilter, limit, offset) + } else if search != "" { + // Search in domain host only (uses trigram index) + rows, err = c.db.Query(` + SELECT d.host, d.tld, d.status, d.last_error, f.feed_count + FROM domains d + INNER JOIN ( + SELECT source_host, COUNT(*) as feed_count + FROM feeds + WHERE item_count > 0 + GROUP BY source_host + ) f ON (d.host || '.' || d.tld) = f.source_host + WHERE d.status != 'skip' AND LOWER(d.host) LIKE $1 ORDER BY d.tld ASC, d.host ASC LIMIT $2 OFFSET $3 `, searchPattern, limit, offset) @@ -110,7 +519,7 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) { FROM feeds WHERE item_count > 0 GROUP BY source_host - ) f ON d.host = f.source_host + ) f ON (d.host || '.' || d.tld) = f.source_host WHERE d.status = $1 ORDER BY d.tld ASC, d.host ASC LIMIT $2 OFFSET $3 @@ -125,37 +534,85 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) { FROM feeds WHERE item_count > 0 GROUP BY source_host - ) f ON d.host = f.source_host + ) f ON (d.host || '.' 
|| d.tld) = f.source_host WHERE d.status != 'skip' ORDER BY d.tld ASC, d.host ASC LIMIT $1 OFFSET $2 `, limit, offset) } + } else if tldFilter != "" && search != "" && status != "" { + // Filter by TLD, status, and search + if searchQuery.ExactMatch { + rows, err = c.db.Query(` + SELECT host, tld, status, last_error, feeds_found + FROM domains + WHERE tld = $1 AND status = $2 AND LOWER(host) = $3 + ORDER BY host ASC + LIMIT $4 OFFSET $5 + `, tldFilter, status, strings.ToLower(searchQuery.Pattern), limit, offset) + } else { + searchPattern := "%" + strings.ToLower(searchQuery.Pattern) + "%" + rows, err = c.db.Query(` + SELECT host, tld, status, last_error, feeds_found + FROM domains + WHERE tld = $1 AND status = $2 AND LOWER(host) LIKE $3 + ORDER BY host ASC + LIMIT $4 OFFSET $5 + `, tldFilter, status, searchPattern, limit, offset) + } + } else if tldFilter != "" && search != "" { + // Filter by TLD and search + if searchQuery.ExactMatch { + rows, err = c.db.Query(` + SELECT host, tld, status, last_error, feeds_found + FROM domains + WHERE tld = $1 AND LOWER(host) = $2 + ORDER BY host ASC + LIMIT $3 OFFSET $4 + `, tldFilter, strings.ToLower(searchQuery.Pattern), limit, offset) + } else { + searchPattern := "%" + strings.ToLower(searchQuery.Pattern) + "%" + rows, err = c.db.Query(` + SELECT host, tld, status, last_error, feeds_found + FROM domains + WHERE tld = $1 AND LOWER(host) LIKE $2 + ORDER BY host ASC + LIMIT $3 OFFSET $4 + `, tldFilter, searchPattern, limit, offset) + } + } else if tldFilter != "" && status != "" { + // Filter by TLD and status + rows, err = c.db.Query(` + SELECT host, tld, status, last_error, feeds_found + FROM domains + WHERE tld = $1 AND status = $2 + ORDER BY host ASC + LIMIT $3 OFFSET $4 + `, tldFilter, status, limit, offset) + } else if tldFilter != "" { + // Filter by TLD only (show all statuses) + rows, err = c.db.Query(` + SELECT host, tld, status, last_error, feeds_found + FROM domains + WHERE tld = $1 + ORDER BY host ASC + LIMIT $2 
OFFSET $3 + `, tldFilter, limit, offset) } else if status != "" { rows, err = c.db.Query(` - SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count - FROM domains d - LEFT JOIN ( - SELECT source_host, COUNT(*) as feed_count - FROM feeds - GROUP BY source_host - ) f ON d.host = f.source_host - WHERE d.status = $1 - ORDER BY d.tld ASC, d.host ASC + SELECT host, tld, status, last_error, feeds_found + FROM domains + WHERE status = $1 + ORDER BY tld ASC, host ASC LIMIT $2 OFFSET $3 `, status, limit, offset) } else { // Default: exclude 'skip' status domains rows, err = c.db.Query(` - SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count - FROM domains d - LEFT JOIN ( - SELECT source_host, COUNT(*) as feed_count - FROM feeds - GROUP BY source_host - ) f ON d.host = f.source_host - WHERE d.status != 'skip' - ORDER BY d.tld ASC, d.host ASC + SELECT host, tld, status, last_error, feeds_found + FROM domains + WHERE status != 'skip' + ORDER BY tld ASC, host ASC LIMIT $1 OFFSET $2 `, limit, offset) } @@ -195,18 +652,78 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) { d.TLD = StringValue(tld) d.LastError = StringValue(lastError) domains = append(domains, d) - hosts = append(hosts, d.Host) + // Build full domain for feed lookup (source_host = host.tld) + fullDomain := d.Host + if d.TLD != "" { + fullDomain = d.Host + "." 
+ d.TLD + } + hosts = append(hosts, fullDomain) } // Now get feeds for these domains (with actual item count from items table) + // Apply the same feed filters used for domain selection if len(hosts) > 0 { - feedRows, err := c.db.Query(` + feedQuery := ` SELECT f.source_host, f.url, f.title, f.type, f.status, f.publish_status, f.language, (SELECT COUNT(*) FROM items WHERE feed_url = f.url) as item_count FROM feeds f - WHERE f.source_host = ANY($1) - ORDER BY f.source_host, f.url - `, hosts) + WHERE f.source_host = ANY($1)` + feedArgs := []interface{}{hosts} + feedArgNum := 2 + + // Apply feed status filters (publish_status for pass/skip/hold/dead) + if len(statusList) > 0 { + if feedMode == "exclude" { + feedQuery += fmt.Sprintf(" AND (f.publish_status IS NULL OR f.publish_status NOT IN (SELECT unnest($%d::text[])))", feedArgNum) + } else { + feedQuery += fmt.Sprintf(" AND f.publish_status IN (SELECT unnest($%d::text[]))", feedArgNum) + } + feedArgs = append(feedArgs, statusList) + feedArgNum++ + } + + // Apply feed type filters (including special "empty" type) + if len(typeList) > 0 { + hasEmpty := false + var regularTypes []string + for _, t := range typeList { + if t == "empty" { + hasEmpty = true + } else { + regularTypes = append(regularTypes, t) + } + } + + if feedMode == "exclude" { + if len(regularTypes) > 0 && hasEmpty { + feedQuery += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[]))) AND f.item_count > 0", feedArgNum) + feedArgs = append(feedArgs, regularTypes) + feedArgNum++ + } else if len(regularTypes) > 0 { + feedQuery += fmt.Sprintf(" AND (f.type IS NULL OR f.type NOT IN (SELECT unnest($%d::text[])))", feedArgNum) + feedArgs = append(feedArgs, regularTypes) + feedArgNum++ + } else if hasEmpty { + feedQuery += " AND f.item_count > 0" + } + } else { + if len(regularTypes) > 0 && hasEmpty { + feedQuery += fmt.Sprintf(" AND (f.type IN (SELECT unnest($%d::text[])) OR f.item_count IS NULL OR f.item_count = 0)", feedArgNum) + 
feedArgs = append(feedArgs, regularTypes) + feedArgNum++ + } else if len(regularTypes) > 0 { + feedQuery += fmt.Sprintf(" AND f.type IN (SELECT unnest($%d::text[]))", feedArgNum) + feedArgs = append(feedArgs, regularTypes) + feedArgNum++ + } else if hasEmpty { + feedQuery += " AND (f.item_count IS NULL OR f.item_count = 0)" + } + } + } + + feedQuery += " ORDER BY f.source_host, f.url" + + feedRows, err := c.db.Query(feedQuery, feedArgs...) if err == nil { defer feedRows.Close() feedsByHost := make(map[string][]FeedInfo) @@ -228,9 +745,13 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) { } feedsByHost[host] = append(feedsByHost[host], f) } - // Attach feeds to domains + // Attach feeds to domains (feedsByHost is keyed by full domain) for i := range domains { - if feeds, ok := feedsByHost[domains[i].Host]; ok { + fullHost := domains[i].Host + if domains[i].TLD != "" { + fullHost = domains[i].Host + "." + domains[i].TLD + } + if feeds, ok := feedsByHost[fullHost]; ok { domains[i].Feeds = feeds } } @@ -261,15 +782,10 @@ func (c *Crawler) handleAPIDomainsByStatus(w http.ResponseWriter, r *http.Reques } rows, err := c.db.Query(` - SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count - FROM domains d - LEFT JOIN ( - SELECT source_host, COUNT(*) as feed_count - FROM feeds - GROUP BY source_host - ) f ON d.host = f.source_host - WHERE d.status = $1 - ORDER BY d.tld ASC, d.host ASC + SELECT host, tld, status, last_error, feeds_found + FROM domains + WHERE status = $1 + ORDER BY tld ASC, host ASC LIMIT $2 OFFSET $3 `, status, limit, offset) if err != nil { @@ -399,16 +915,18 @@ func (c *Crawler) handleAPISetDomainStatus(w http.ResponseWriter, r *http.Reques // When setting to pass, clear any last_error var err error + strippedHost := stripTLD(host) + tld := getTLD(host) if status == "pass" { _, err = c.db.Exec(` UPDATE domains SET status = $1, last_error = NULL - WHERE host = $2 - `, status, host) + WHERE host = 
$2 AND tld = $3 + `, status, strippedHost, tld) } else { _, err = c.db.Exec(` UPDATE domains SET status = $1 - WHERE host = $2 - `, status, host) + WHERE host = $2 AND tld = $3 + `, status, strippedHost, tld) } if err != nil { @@ -431,9 +949,9 @@ func (c *Crawler) handleAPIRevisitDomain(w http.ResponseWriter, r *http.Request) } _, err := c.db.Exec(` - UPDATE domains SET status = 'pass', last_checked_at = NULL, last_crawled_at = NULL, last_error = NULL - WHERE host = $1 - `, host) + UPDATE domains SET status = 'pass', crawled_at = '0001-01-01 00:00:00', last_error = NULL + WHERE host = $1 AND tld = $2 + `, stripTLD(host), getTLD(host)) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return @@ -455,10 +973,10 @@ func (c *Crawler) handleAPIPriorityCrawl(w http.ResponseWriter, r *http.Request) // Add domain if it doesn't exist, or reset to pass for crawling _, err := c.db.Exec(` - INSERT INTO domains (host, status, discovered_at, tld) - VALUES ($1, 'pass', NOW(), $2) - ON CONFLICT(host) DO UPDATE SET status = 'pass', last_checked_at = NULL, last_crawled_at = NULL, last_error = NULL - `, host, getTLD(host)) + INSERT INTO domains (host, status, tld) + VALUES ($1, 'pass', $2) + ON CONFLICT(host, tld) DO UPDATE SET status = 'pass', crawled_at = '0001-01-01 00:00:00', last_error = NULL + `, stripTLD(host), getTLD(host)) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return @@ -466,7 +984,7 @@ func (c *Crawler) handleAPIPriorityCrawl(w http.ResponseWriter, r *http.Request) // Crawl synchronously fmt.Printf("Priority crawl: %s\n", host) - feedsFound, crawlErr := c.crawlHost(host) + feedsFound, crawlErr := c.feedCrawl(host) errStr := "" if crawlErr != nil { @@ -474,7 +992,7 @@ func (c *Crawler) handleAPIPriorityCrawl(w http.ResponseWriter, r *http.Request) } // Mark as crawled - c.markDomainCrawled(host, feedsFound, errStr) + c.markDomainCrawled(stripTLD(host), getTLD(host), feedsFound, errStr) // Get the feeds we found 
feeds, _ := c.GetFeedsByHost(host) @@ -563,31 +1081,26 @@ func (c *Crawler) filterDomains(w http.ResponseWriter, tld, status, sort string, var args []interface{} argNum := 1 query := ` - SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count - FROM domains d - LEFT JOIN ( - SELECT source_host, COUNT(*) as feed_count - FROM feeds - GROUP BY source_host - ) f ON d.host = f.source_host + SELECT host, tld, status, last_error, feeds_found + FROM domains WHERE 1=1` if tld != "" { - query += fmt.Sprintf(" AND d.tld = $%d", argNum) + query += fmt.Sprintf(" AND tld = $%d", argNum) args = append(args, tld) argNum++ } if status != "" { - query += fmt.Sprintf(" AND d.status = $%d", argNum) + query += fmt.Sprintf(" AND status = $%d", argNum) args = append(args, status) argNum++ } // Sort by feed count descending or alphabetically if sort == "feeds" { - query += fmt.Sprintf(" ORDER BY feed_count DESC, d.host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1) + query += fmt.Sprintf(" ORDER BY feeds_found DESC, host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1) } else { - query += fmt.Sprintf(" ORDER BY d.tld ASC, d.host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1) + query += fmt.Sprintf(" ORDER BY tld ASC, host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1) } args = append(args, limit, offset) @@ -645,15 +1158,10 @@ func (c *Crawler) handleAPITLDDomains(w http.ResponseWriter, r *http.Request) { } rows, err := c.db.Query(` - SELECT d.host, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count - FROM domains d - LEFT JOIN ( - SELECT source_host, COUNT(*) as feed_count - FROM feeds - GROUP BY source_host - ) f ON d.host = f.source_host - WHERE d.tld = $1 - ORDER BY d.tld ASC, d.host ASC + SELECT host, status, last_error, feeds_found + FROM domains + WHERE tld = $1 + ORDER BY host ASC LIMIT $2 OFFSET $3 `, tld, limit, offset) if err != nil { @@ -685,29 +1193,181 @@ func (c *Crawler) handleAPITLDDomains(w http.ResponseWriter, r *http.Request) { } func 
(c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) { - hasFeeds := r.URL.Query().Get("has_feeds") == "true" + status := r.URL.Query().Get("status") // domain status: pass, skip, hold, dead + feedMode := r.URL.Query().Get("feedMode") // include or exclude + feedStatuses := r.URL.Query().Get("feedStatuses") // comma-separated: pass,skip,hold,dead + feedTypes := r.URL.Query().Get("feedTypes") // comma-separated: rss,atom,json,unknown,empty + search := r.URL.Query().Get("search") // search query + + // Parse comma-separated values + var statusList, typeList []string + if feedStatuses != "" { + statusList = strings.Split(feedStatuses, ",") + } + if feedTypes != "" { + typeList = strings.Split(feedTypes, ",") + } var rows pgx.Rows var err error - if hasFeeds { - // Only TLDs that have domains with feeds + // If feed filter is specified, query from feeds table instead + if len(statusList) > 0 || len(typeList) > 0 || feedMode == "exclude" { + // Build query to get TLDs from feeds + query := `SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM feeds WHERE tld IS NOT NULL` + args := []interface{}{} + argNum := 1 + + // Handle status filters (publish_status for pass/skip/hold/dead) + if len(statusList) > 0 { + if feedMode == "exclude" { + query += fmt.Sprintf(" AND (publish_status IS NULL OR publish_status NOT IN (SELECT unnest($%d::text[])))", argNum) + } else { + query += fmt.Sprintf(" AND publish_status IN (SELECT unnest($%d::text[]))", argNum) + } + args = append(args, statusList) + argNum++ + } + + // Handle type filters (including special "empty" type) + if len(typeList) > 0 { + hasEmpty := false + var regularTypes []string + for _, t := range typeList { + if t == "empty" { + hasEmpty = true + } else { + regularTypes = append(regularTypes, t) + } + } + + if feedMode == "exclude" { + // Exclude mode: exclude these types + if len(regularTypes) > 0 && hasEmpty { + query += fmt.Sprintf(" AND type NOT IN (SELECT unnest($%d::text[])) AND item_count > 
0", argNum) + args = append(args, regularTypes) + argNum++ + } else if len(regularTypes) > 0 { + query += fmt.Sprintf(" AND (type IS NULL OR type NOT IN (SELECT unnest($%d::text[])))", argNum) + args = append(args, regularTypes) + argNum++ + } else if hasEmpty { + query += " AND item_count > 0" + } + } else { + // Include mode: include these types + if len(regularTypes) > 0 && hasEmpty { + query += fmt.Sprintf(" AND (type IN (SELECT unnest($%d::text[])) OR item_count IS NULL OR item_count = 0)", argNum) + args = append(args, regularTypes) + argNum++ + } else if len(regularTypes) > 0 { + query += fmt.Sprintf(" AND type IN (SELECT unnest($%d::text[]))", argNum) + args = append(args, regularTypes) + argNum++ + } else if hasEmpty { + query += " AND (item_count IS NULL OR item_count = 0)" + } + } + } + + if search != "" { + sq := parseSearchPrefix(search) + searchPattern := "%" + strings.ToLower(sq.Pattern) + "%" + + // Only extract TLD for domain searches (d:npr.org -> exact match for npr.org) + var tldFilter string + var exactMatch bool + hostSearchPattern := searchPattern + if sq.Type == "domain" { + hostPattern, detectedTLD := parseSearchTerm(sq.Pattern) + if detectedTLD != "" { + tldFilter = detectedTLD + exactMatch = true + hostSearchPattern = "%" + strings.ToLower(hostPattern) + "%" + } + } + + switch sq.Type { + case "domain": + // Search domain names + if exactMatch && tldFilter != "" { + // d:npr.org -> exact match (source_host = 'npr.org') + query += fmt.Sprintf(" AND LOWER(source_host) = $%d", argNum) + args = append(args, strings.ToLower(sq.Pattern)) + } else if tldFilter != "" { + query += fmt.Sprintf(" AND tld = $%d AND LOWER(source_host) LIKE $%d", argNum, argNum+1) + args = append(args, tldFilter, hostSearchPattern) + } else { + query += fmt.Sprintf(" AND LOWER(source_host) LIKE $%d", argNum) + args = append(args, hostSearchPattern) + } + case "url": + query += fmt.Sprintf(" AND LOWER(url) LIKE $%d", argNum) + args = append(args, searchPattern) + case 
"title": + query += fmt.Sprintf(" AND LOWER(title) LIKE $%d", argNum) + args = append(args, searchPattern) + case "description": + query += fmt.Sprintf(" AND LOWER(description) LIKE $%d", argNum) + args = append(args, searchPattern) + case "item": + query += fmt.Sprintf(" AND EXISTS (SELECT 1 FROM items i WHERE i.feed_url = feeds.url AND LOWER(i.title) LIKE $%d)", argNum) + args = append(args, searchPattern) + default: + // "all" - search domains and feeds (NOT items - use i: prefix for item search) + // Also include exact domain match if pattern looks like a domain + if sq.DomainHost != "" && sq.DomainTLD != "" { + fullDomain := strings.ToLower(sq.DomainHost + "." + sq.DomainTLD) + query += fmt.Sprintf(` AND ( + LOWER(source_host) LIKE $%d OR + LOWER(url) LIKE $%d OR + LOWER(title) LIKE $%d OR + LOWER(description) LIKE $%d OR + LOWER(source_host) = $%d + )`, argNum, argNum, argNum, argNum, argNum+1) + args = append(args, searchPattern, fullDomain) + } else { + query += fmt.Sprintf(` AND ( + LOWER(source_host) LIKE $%d OR + LOWER(url) LIKE $%d OR + LOWER(title) LIKE $%d OR + LOWER(description) LIKE $%d + )`, argNum, argNum, argNum, argNum) + args = append(args, searchPattern) + } + } + } + query += " GROUP BY tld ORDER BY tld ASC" + rows, err = c.db.Query(query, args...) + } else if search != "" { + // Parse search prefix for type-specific searching + sq := parseSearchPrefix(search) + + // Use the helper to build the TLD search query + query, args := buildTLDSearchQuery(sq) + rows, err = c.db.Query(query, args...) 
+ } else if status != "" { + // TLDs filtered by domain status rows, err = c.db.Query(` - SELECT DISTINCT d.tld, COUNT(DISTINCT d.host) as domain_count - FROM domains d - INNER JOIN feeds f ON d.host = f.source_host - WHERE d.tld IS NOT NULL AND d.tld != '' - GROUP BY d.tld - ORDER BY d.tld ASC - `) - } else { - // All TLDs - rows, err = c.db.Query(` - SELECT tld, COUNT(*) as domain_count + SELECT tld::text as tld, COUNT(*) as domain_count FROM domains - WHERE tld IS NOT NULL AND tld != '' + WHERE tld IS NOT NULL AND status = $1 GROUP BY tld + HAVING COUNT(*) > 0 ORDER BY tld ASC + `, status) + } else { + // All TLDs from enum with domain counts + rows, err = c.db.Query(` + SELECT e.enumlabel as tld, COALESCE(d.cnt, 0) as domain_count + FROM pg_enum e + LEFT JOIN ( + SELECT tld::text as tld, COUNT(*) as cnt + FROM domains + GROUP BY tld + ) d ON e.enumlabel = d.tld + WHERE e.enumtypid = 'tld_enum'::regtype + ORDER BY e.enumlabel ASC `) } if err != nil { @@ -740,26 +1400,366 @@ func (c *Crawler) handleAPITLDStats(w http.ResponseWriter, r *http.Request) { http.Error(w, "tld parameter required", http.StatusBadRequest) return } + search := r.URL.Query().Get("search") - var domainCount, feedCount int - err := c.db.QueryRow(`SELECT COUNT(*) FROM domains WHERE tld = $1`, tld).Scan(&domainCount) + stats := map[string]interface{}{ + "tld": tld, + } + + // Build WHERE clause based on whether search is provided + var domainWhere, feedWhere string + var domainArgs, feedArgs []interface{} + + if search != "" { + // Parse search prefix for type-specific searching + sq := parseSearchPrefix(search) + searchPattern := "%" + strings.ToLower(sq.Pattern) + "%" + + // For domain searches, check for exact match + if sq.Type == "domain" { + hostPart, detectedTLD := parseSearchTerm(sq.Pattern) + if detectedTLD != "" { + // d:npr.org -> exact match for host "npr" in specified TLD + domainWhere = "tld = $1 AND lower(host) = $2" + domainArgs = []interface{}{tld, strings.ToLower(hostPart)} + 
feedWhere = "tld = $1 AND lower(source_host) = $2" + feedArgs = []interface{}{tld, strings.ToLower(sq.Pattern)} + } else { + // d:npr -> pattern match in specified TLD + domainWhere = "tld = $1 AND lower(host) LIKE $2" + domainArgs = []interface{}{tld, searchPattern} + feedWhere = "tld = $1 AND lower(source_host) LIKE $2" + feedArgs = []interface{}{tld, searchPattern} + } + } else { + // Other search types - pattern match + domainWhere = "tld = $1 AND lower(host) LIKE $2" + domainArgs = []interface{}{tld, searchPattern} + feedWhere = "tld = $1 AND lower(source_host) LIKE $2" + feedArgs = []interface{}{tld, searchPattern} + } + stats["search"] = search + } else { + // Filter by TLD only + domainWhere = "tld = $1" + domainArgs = []interface{}{tld} + feedWhere = "tld = $1" + feedArgs = []interface{}{tld} + } + + // Domain stats by status + var totalDomains, passDomains, skipDomains, holdDomains, deadDomains int + err := c.db.QueryRow(`SELECT COUNT(*) FROM domains WHERE `+domainWhere, domainArgs...).Scan(&totalDomains) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } + stats["total_domains"] = totalDomains - err = c.db.QueryRow(`SELECT COUNT(*) FROM feeds WHERE tld = $1`, tld).Scan(&feedCount) + rows, err := c.db.Query(`SELECT status, COUNT(*) FROM domains WHERE `+domainWhere+` GROUP BY status`, domainArgs...) 
if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } + for rows.Next() { + var status string + var count int + if err := rows.Scan(&status, &count); err != nil { + continue + } + switch status { + case "pass": + passDomains = count + case "skip": + skipDomains = count + case "hold": + holdDomains = count + case "dead": + deadDomains = count + } + } + rows.Close() + stats["pass_domains"] = passDomains + stats["skip_domains"] = skipDomains + stats["hold_domains"] = holdDomains + stats["dead_domains"] = deadDomains + + // Feed stats + var totalFeeds, passFeeds, skipFeeds, holdFeeds, deadFeeds, emptyFeeds int + var rssFeeds, atomFeeds, jsonFeeds, unknownFeeds int + + err = c.db.QueryRow(`SELECT COUNT(*) FROM feeds WHERE `+feedWhere, feedArgs...).Scan(&totalFeeds) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + stats["total_feeds"] = totalFeeds + + // Feed status counts + statusRows, err := c.db.Query(`SELECT COALESCE(status, 'hold'), COUNT(*) FROM feeds WHERE `+feedWhere+` GROUP BY status`, feedArgs...) 
// handleAPISearchStats serves aggregate statistics for a global (non-TLD-scoped)
// search query: counts of matching domains broken down by status, and counts of
// matching feeds broken down by status, emptiness, and feed type, returned as a
// flat JSON object.
//
// Query params:
//   - search (required): optionally prefixed query; see parseSearchPrefix
//     ("d:" domain, "f:" url, "t:" title, "s:" description, "i:" item;
//     no prefix / "a:" searches domains and feed url/title/description).
//
// The WHERE templates below are fixed strings; only the user-supplied pattern
// is passed as a bind parameter, so the query text itself is not injectable.
func (c *Crawler) handleAPISearchStats(w http.ResponseWriter, r *http.Request) {
	search := r.URL.Query().Get("search")
	if search == "" {
		http.Error(w, "search parameter required", http.StatusBadRequest)
		return
	}

	// Parse search prefix for type-specific searching
	sq := parseSearchPrefix(search)

	searchPattern := "%" + strings.ToLower(sq.Pattern) + "%"

	// Only extract TLD for domain searches (d:npr.org -> exact match for npr.org)
	var tldFilter, hostPart string
	var exactMatch bool
	if sq.Type == "domain" {
		hostPart, tldFilter = parseSearchTerm(sq.Pattern)
		if tldFilter != "" {
			// Host part alone drives the LIKE pattern; the TLD becomes an
			// equality filter. exactMatch is true whenever tldFilter != "".
			searchPattern = "%" + strings.ToLower(hostPart) + "%"
			exactMatch = true
		}
	}

	stats := map[string]interface{}{}

	// Build WHERE clause based on search type
	var domainWhere, feedWhere string
	var domainArgs, feedArgs []interface{}

	switch sq.Type {
	case "domain":
		if exactMatch && tldFilter != "" {
			// d:npr.org -> exact match
			// domains table splits host/tld; feeds stores the full source_host.
			domainWhere = "tld = $1 AND LOWER(host) = $2"
			domainArgs = []interface{}{tldFilter, strings.ToLower(hostPart)}
			feedWhere = "LOWER(source_host) = $1"
			feedArgs = []interface{}{strings.ToLower(sq.Pattern)}
		} else if tldFilter != "" {
			// NOTE(review): unreachable — exactMatch is set to true above
			// whenever tldFilter != "", so this branch can never execute.
			domainWhere = "tld = $1 AND LOWER(host) LIKE $2"
			domainArgs = []interface{}{tldFilter, searchPattern}
			feedWhere = "tld = $1 AND LOWER(source_host) LIKE $2"
			feedArgs = []interface{}{tldFilter, searchPattern}
		} else {
			// d:npr -> pattern match across all TLDs.
			domainWhere = "LOWER(host) LIKE $1"
			domainArgs = []interface{}{searchPattern}
			feedWhere = "LOWER(source_host) LIKE $1"
			feedArgs = []interface{}{searchPattern}
		}
	case "url":
		// Domains qualify when any of their feeds has a matching URL.
		domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.url) LIKE $1)"
		domainArgs = []interface{}{searchPattern}
		feedWhere = "LOWER(url) LIKE $1"
		feedArgs = []interface{}{searchPattern}
	case "title":
		domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.title) LIKE $1)"
		domainArgs = []interface{}{searchPattern}
		feedWhere = "LOWER(title) LIKE $1"
		feedArgs = []interface{}{searchPattern}
	case "description":
		domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.description) LIKE $1)"
		domainArgs = []interface{}{searchPattern}
		feedWhere = "LOWER(description) LIKE $1"
		feedArgs = []interface{}{searchPattern}
	case "item":
		domainWhere = "EXISTS (SELECT 1 FROM feeds f INNER JOIN items i ON i.feed_url = f.url WHERE f.source_host = (host || '.' || tld) AND LOWER(i.title) LIKE $1)"
		domainArgs = []interface{}{searchPattern}
		feedWhere = "EXISTS (SELECT 1 FROM items i WHERE i.feed_url = url AND LOWER(i.title) LIKE $1)"
		feedArgs = []interface{}{searchPattern}
	default:
		// "all" - search domains and feeds (NOT items - use i: prefix for item search)
		// Also include exact domain match if pattern looks like a domain
		if sq.DomainHost != "" && sq.DomainTLD != "" {
			domainWhere = `(
				LOWER(host) LIKE $1 OR
				(LOWER(host) = $2 AND LOWER(tld) = $3) OR
				EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND (
					LOWER(f.url) LIKE $1 OR LOWER(f.title) LIKE $1 OR LOWER(f.description) LIKE $1
				))
			)`
			domainArgs = []interface{}{searchPattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)}
			fullDomain := strings.ToLower(sq.DomainHost + "." + sq.DomainTLD)
			feedWhere = `(
				LOWER(source_host) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1 OR LOWER(source_host) = $2
			)`
			feedArgs = []interface{}{searchPattern, fullDomain}
		} else {
			domainWhere = `(
				LOWER(host) LIKE $1 OR
				EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND (
					LOWER(f.url) LIKE $1 OR LOWER(f.title) LIKE $1 OR LOWER(f.description) LIKE $1
				))
			)`
			domainArgs = []interface{}{searchPattern}
			feedWhere = `(
				LOWER(source_host) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1
			)`
			feedArgs = []interface{}{searchPattern}
		}
	}

	// Count matching domains by status
	var totalDomains, passDomains, skipDomains, holdDomains, deadDomains int
	rows, err := c.db.Query(`SELECT status, COUNT(*) FROM domains WHERE `+domainWhere+` GROUP BY status`, domainArgs...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	for rows.Next() {
		var status string
		var count int
		if err := rows.Scan(&status, &count); err != nil {
			// NOTE(review): scan errors are silently dropped from the totals.
			continue
		}
		totalDomains += count
		switch status {
		case "pass":
			passDomains = count
		case "skip":
			skipDomains = count
		case "hold":
			holdDomains = count
		case "dead":
			deadDomains = count
		}
	}
	// NOTE(review): rows.Err() is never checked here (nor for the loops below),
	// so a mid-iteration failure yields partial counts with a 200 response.
	rows.Close()
	stats["total_domains"] = totalDomains
	stats["pass_domains"] = passDomains
	stats["skip_domains"] = skipDomains
	stats["hold_domains"] = holdDomains
	stats["dead_domains"] = deadDomains

	// Count matching feeds by status
	var totalFeeds, passFeeds, skipFeeds, holdFeeds, deadFeeds, emptyFeeds int
	var rssFeeds, atomFeeds, jsonFeeds, unknownFeeds int

	// NULL feed status is counted as 'hold'.
	statusRows, err := c.db.Query(`SELECT COALESCE(status, 'hold'), COUNT(*) FROM feeds WHERE `+feedWhere+` GROUP BY status`, feedArgs...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	for statusRows.Next() {
		var status string
		var count int
		if err := statusRows.Scan(&status, &count); err != nil {
			continue
		}
		totalFeeds += count
		switch status {
		case "pass":
			passFeeds = count
		case "skip":
			skipFeeds = count
		case "hold":
			holdFeeds = count
		case "dead":
			deadFeeds = count
		}
	}
	statusRows.Close()
	stats["total_feeds"] = totalFeeds
	stats["pass_feeds"] = passFeeds
	stats["skip_feeds"] = skipFeeds
	stats["hold_feeds"] = holdFeeds
	stats["dead_feeds"] = deadFeeds

	// Count empty feeds
	// NOTE(review): the QueryRow/Scan error is ignored; emptyFeeds stays 0 on
	// failure. Presumably best-effort — confirm this is intentional.
	c.db.QueryRow(`SELECT COUNT(*) FROM feeds WHERE (`+feedWhere+`) AND (item_count IS NULL OR item_count = 0)`, feedArgs...).Scan(&emptyFeeds)
	stats["empty_feeds"] = emptyFeeds

	// Feed type counts; NULL and any unrecognized types are folded into "unknown".
	typeRows, err := c.db.Query(`SELECT COALESCE(type, 'unknown'), COUNT(*) FROM feeds WHERE `+feedWhere+` GROUP BY type`, feedArgs...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	for typeRows.Next() {
		var feedType string
		var count int
		if err := typeRows.Scan(&feedType, &count); err != nil {
			continue
		}
		switch feedType {
		case "rss":
			rssFeeds = count
		case "atom":
			atomFeeds = count
		case "json":
			jsonFeeds = count
		default:
			// += (not =) because several distinct raw types can map here.
			unknownFeeds += count
		}
	}
	typeRows.Close()
	stats["rss_feeds"] = rssFeeds
	stats["atom_feeds"] = atomFeeds
	stats["json_feeds"] = jsonFeeds
	stats["unknown_feeds"] = unknownFeeds

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(stats)
}
stripTLD(host), getTLD(host)) if err != nil { result.Error = fmt.Sprintf("failed to update domain status: %v", err) return result @@ -883,7 +1883,7 @@ func (c *Crawler) handleAPIDropDomain(w http.ResponseWriter, r *http.Request) { // Verify domain is currently skipped var status string - err := c.db.QueryRow(`SELECT status FROM domains WHERE host = $1`, host).Scan(&status) + err := c.db.QueryRow(`SELECT status FROM domains WHERE host = $1 AND tld = $2`, stripTLD(host), getTLD(host)).Scan(&status) if err != nil { http.Error(w, "domain not found", http.StatusNotFound) return @@ -952,7 +1952,7 @@ func (c *Crawler) dropDomain(host string) DomainActionResult { result.FeedsAffected = feedsDeleted // Update domain status to drop - _, err = c.db.Exec(`UPDATE domains SET status = 'drop' WHERE host = $1`, host) + _, err = c.db.Exec(`UPDATE domains SET status = 'drop' WHERE host = $1 AND tld = $2`, stripTLD(host), getTLD(host)) if err != nil { result.Error = fmt.Sprintf("failed to update domain status: %v", err) return result @@ -972,7 +1972,7 @@ func (c *Crawler) handleAPIUndenyDomain(w http.ResponseWriter, r *http.Request) // Verify domain is currently skipped var status string - err := c.db.QueryRow(`SELECT status FROM domains WHERE host = $1`, host).Scan(&status) + err := c.db.QueryRow(`SELECT status FROM domains WHERE host = $1 AND tld = $2`, stripTLD(host), getTLD(host)).Scan(&status) if err != nil { http.Error(w, "domain not found", http.StatusNotFound) return @@ -1025,8 +2025,8 @@ func (c *Crawler) restoreDomain(host string) DomainActionResult { // Update domain status back to pass _, err = c.db.Exec(` UPDATE domains SET status = 'pass', last_error = NULL - WHERE host = $1 - `, host) + WHERE host = $1 AND tld = $2 + `, stripTLD(host), getTLD(host)) if err != nil { result.Error = fmt.Sprintf("failed to update domain status: %v", err) return result diff --git a/util.go b/util.go index 32f153b..0dbe3a1 100644 --- a/util.go +++ b/util.go @@ -51,6 +51,25 @@ func 
// stripTLD removes the final dot-separated label from a hostname.
// e.g., "example.com" -> "example", "sub.example.com" -> "sub.example".
// A name without a dot is returned unchanged.
// NOTE(review): assumes single-label TLDs; "example.co.uk" becomes
// "example.co" — confirm this matches how domains.host/tld are stored.
func stripTLD(host string) string {
	idx := strings.LastIndex(host, ".")
	if idx > 0 {
		return host[:idx]
	}
	return host
}

// fullHost reconstructs the full hostname from host and tld.
// e.g., ("example", "com") -> "example.com". An empty tld yields host as-is.
func fullHost(host, tld string) string {
	if tld == "" {
		return host
	}
	return host + "." + tld
}

// SearchQuery represents a parsed search with optional type prefix.
type SearchQuery struct {
	Type    string // "all", "domain", "url", "title", "description", "item"
	Pattern string // the search pattern (without prefix)
	// ExactMatch: for domain searches, true if a TLD was specified
	// (d:npr.org matches exactly). NOTE(review): not populated by
	// parseSearchPrefix — callers currently recompute exactness from
	// parseSearchTerm; confirm before relying on this field.
	ExactMatch bool
	// For "all" type searches that look like domains, these are populated
	// for additional exact matching.
	DomainHost string // e.g., "npr" from "npr.org"
	DomainTLD  string // e.g., "org" from "npr.org"
}

// parseSearchPrefix parses search prefixes like "a:", "d:", "f:", "t:", "s:", "i:"
// (case-insensitive) and returns a SearchQuery with Type and Pattern.
// Types: "all" (default or a: prefix), "domain" (d:), "url" (f:),
// "title" (t:), "description" (s:), "item" (i:).
// Unprefixed queries that look like a bare domain additionally get
// DomainHost/DomainTLD populated for exact-domain matching.
func parseSearchPrefix(query string) SearchQuery {
	query = strings.TrimSpace(query)
	if query == "" {
		return SearchQuery{Type: "all", Pattern: ""}
	}

	// Check for a single-letter prefix followed by ':' (case-insensitive).
	lower := strings.ToLower(query)
	if len(query) >= 2 && query[1] == ':' {
		var typ string
		switch lower[0] {
		case 'a':
			typ = "all"
		case 'd':
			typ = "domain"
		case 'f':
			typ = "url"
		case 't':
			typ = "title"
		case 's':
			typ = "description"
		case 'i':
			typ = "item"
		}
		if typ != "" {
			// Pattern keeps the caller's original casing.
			return SearchQuery{Type: typ, Pattern: strings.TrimSpace(query[2:])}
		}
	}

	// For "all" type, check if pattern looks like a domain and extract host/tld
	result := SearchQuery{Type: "all", Pattern: query}
	if looksLikeDomain(query) {
		host, tld := parseSearchTerm(query)
		if tld != "" {
			result.DomainHost = host
			result.DomainTLD = tld
		}
	}
	return result
}

// looksLikeDomain reports whether a query looks like a bare domain name
// (e.g. "npr.org"): no spaces, and a dot that is neither leading nor
// trailing, followed by a plausible TLD.
//
// Fix: TLD validation is now delegated to parseSearchTerm (2-24 letters)
// so the two helpers cannot disagree. Previously this function capped the
// TLD at 6 letters, so unprefixed searches like "example.website" never
// received DomainHost/DomainTLD even though parseSearchTerm accepts them.
func looksLikeDomain(query string) bool {
	if query == "" || strings.Contains(query, " ") {
		return false
	}
	// Must have a dot that is neither the first nor the last character.
	lastDot := strings.LastIndex(query, ".")
	if lastDot <= 0 || lastDot == len(query)-1 {
		return false
	}
	// Shared TLD validation: non-empty tldFilter means the suffix is 2-24 letters.
	_, tld := parseSearchTerm(query)
	return tld != ""
}

// parseSearchTerm analyzes a search query and extracts host pattern and optional TLD filter.
// If the search ends with what looks like a TLD (e.g., "example.com"), it splits them.
// Returns (hostPattern, tldFilter) where tldFilter may be empty; tldFilter is
// always lowercased, hostPattern keeps the caller's casing.
func parseSearchTerm(search string) (hostPattern, tldFilter string) {
	search = strings.TrimSpace(search)
	if search == "" {
		return "", ""
	}

	// Check if search contains a dot
	lastDot := strings.LastIndex(search, ".")
	if lastDot == -1 || lastDot == len(search)-1 {
		// No dot or ends with dot - treat as host-only search
		return search, ""
	}

	// Extract potential TLD (part after last dot)
	potentialTLD := strings.ToLower(search[lastDot+1:])
	hostPart := search[:lastDot]

	// Validate TLD: must be 2-24 lowercase letters (covers all IANA TLDs)
	if len(potentialTLD) < 2 || len(potentialTLD) > 24 {
		return search, ""
	}
	for _, c := range potentialTLD {
		if c < 'a' || c > 'z' {
			// Contains non-letter, not a TLD
			return search, ""
		}
	}

	// Looks like a valid TLD pattern
	return hostPart, potentialTLD
}