diff --git a/Dockerfile b/Dockerfile
index 68d7fb2..4abc59c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -22,7 +22,7 @@ FROM ubuntu:latest
 WORKDIR /app
 
 # Install runtime dependencies
-RUN apt-get update && apt-get install -y ca-certificates tzdata && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && apt-get install -y ca-certificates tzdata curl wget && rm -rf /var/lib/apt/lists/*
 
 # Copy binary from builder
 COPY --from=builder /app/1440.news .
diff --git a/api_domains.go b/api_domains.go
index 1a9152a..da1c0f8 100644
--- a/api_domains.go
+++ b/api_domains.go
@@ -42,7 +42,7 @@ func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
 	case "url":
 		// Search feed URL paths (after domain)
 		return `
-			SELECT tld, COUNT(DISTINCT source_host) as domain_count
+			SELECT domain_tld as tld, COUNT(DISTINCT domain_host || '.' || domain_tld) as domain_count
 			FROM feeds
-			WHERE tld IS NOT NULL AND LOWER(url) LIKE $1
+			WHERE domain_tld IS NOT NULL AND LOWER(url) LIKE $1
 			GROUP BY tld
@@ -52,7 +52,7 @@ func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
 	case "title":
 		// Search feed titles
 		return `
-			SELECT tld, COUNT(DISTINCT source_host) as domain_count
+			SELECT domain_tld as tld, COUNT(DISTINCT domain_host || '.' || domain_tld) as domain_count
 			FROM feeds
-			WHERE tld IS NOT NULL AND LOWER(title) LIKE $1
+			WHERE domain_tld IS NOT NULL AND LOWER(title) LIKE $1
 			GROUP BY tld
@@ -62,7 +62,7 @@ func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
 	case "description":
 		// Search feed descriptions
 		return `
-			SELECT tld, COUNT(DISTINCT source_host) as domain_count
+			SELECT domain_tld as tld, COUNT(DISTINCT domain_host || '.' || domain_tld) as domain_count
 			FROM feeds
-			WHERE tld IS NOT NULL AND LOWER(description) LIKE $1
+			WHERE domain_tld IS NOT NULL AND LOWER(description) LIKE $1
 			GROUP BY tld
@@ -72,7 +72,7 @@ func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
 	case "item":
 		// Search item titles
 		return `
-			SELECT f.tld, COUNT(DISTINCT f.source_host) as domain_count
+			SELECT f.domain_tld as tld, COUNT(DISTINCT f.domain_host || '.' || f.domain_tld) as domain_count
 			FROM feeds f
 			INNER JOIN items i ON i.feed_url = f.url
-			WHERE f.tld IS NOT NULL AND LOWER(i.title) LIKE $1
+			WHERE f.domain_tld IS NOT NULL AND LOWER(i.title) LIKE $1
@@ -85,42 +85,42 @@ func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
 	// Also include exact domain match if pattern looks like a domain
 	if sq.DomainHost != "" && sq.DomainTLD != "" {
 		return `
 			SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM (
 				-- Domains matching host pattern
 				SELECT tld::text as tld, host || '.' || tld as source_host FROM domains WHERE LOWER(host) LIKE $1
 				UNION
 				-- Exact domain match
 				SELECT tld::text as tld,
 					host || '.' || tld as source_host
 				FROM domains WHERE LOWER(host) = $2 AND tld::text = $3
 				UNION
 				-- Feeds matching URL
-				SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(url) LIKE $1
+				SELECT domain_tld::text as tld, domain_host || '.' || domain_tld as source_host FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(url) LIKE $1
 				UNION
 				-- Feeds matching title
-				SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(title) LIKE $1
+				SELECT domain_tld::text as tld, domain_host || '.' || domain_tld as source_host FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(title) LIKE $1
 				UNION
 				-- Feeds matching description
-				SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(description) LIKE $1
+				SELECT domain_tld::text as tld, domain_host || '.' || domain_tld as source_host FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(description) LIKE $1
 			) combined
 			GROUP BY tld
 			ORDER BY tld ASC
 		`, []interface{}{pattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)}
 	}
 
 	return `
 		SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM (
 			-- Domains matching host
 			SELECT tld::text as tld, host || '.' || tld as source_host FROM domains WHERE LOWER(host) LIKE $1
 			UNION
 			-- Feeds matching URL
-			SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(url) LIKE $1
+			SELECT domain_tld::text as tld, domain_host || '.' || domain_tld as source_host FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(url) LIKE $1
 			UNION
 			-- Feeds matching title
-			SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(title) LIKE $1
+			SELECT domain_tld::text as tld, domain_host || '.' || domain_tld as source_host FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(title) LIKE $1
 			UNION
 			-- Feeds matching description
-			SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(description) LIKE $1
+			SELECT domain_tld::text as tld, domain_host || '.' || domain_tld as source_host FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(description) LIKE $1
 		) combined
 		GROUP BY tld
 		ORDER BY tld ASC
@@ -335,7 +335,7 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
 	query := `
 		SELECT DISTINCT d.host, d.tld, d.status, d.last_error, d.feeds_found
 		FROM domains d
-		INNER JOIN feeds f ON f.source_host = (d.host || '.' || d.tld)
+		INNER JOIN feeds f ON f.domain_host = d.host AND f.domain_tld = d.tld
 		WHERE 1=1`
 	args := []interface{}{}
 	argNum := 1
@@ -471,11 +471,11 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
 		SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
 		FROM domains d
 		INNER JOIN (
-			SELECT source_host, COUNT(*) as feed_count
+			SELECT domain_host, domain_tld, COUNT(*) as feed_count
 			FROM feeds WHERE item_count > 0
-			GROUP BY source_host
-		) f ON (d.host || '.' || d.tld) = f.source_host
+			GROUP BY domain_host, domain_tld
+		) f ON d.host = f.domain_host AND d.tld = f.domain_tld
 		WHERE d.tld = $1 AND d.status = $2
 		ORDER BY d.host ASC
 		LIMIT $3 OFFSET $4
@@ -486,11 +486,11 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
 		SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
 		FROM domains d
 		INNER JOIN (
-			SELECT source_host, COUNT(*) as feed_count
+			SELECT domain_host, domain_tld, COUNT(*) as feed_count
 			FROM feeds WHERE item_count > 0
-			GROUP BY source_host
-		) f ON (d.host || '.' || d.tld) = f.source_host
+			GROUP BY domain_host, domain_tld
+		) f ON d.host = f.domain_host AND d.tld = f.domain_tld
 		WHERE d.status != 'skip' AND d.tld = $1
 		ORDER BY d.host ASC
 		LIMIT $2 OFFSET $3
@@ -501,11 +501,11 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
 		SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
 		FROM domains d
 		INNER JOIN (
-			SELECT source_host, COUNT(*) as feed_count
+			SELECT domain_host, domain_tld, COUNT(*) as feed_count
 			FROM feeds WHERE item_count > 0
-			GROUP BY source_host
-		) f ON (d.host || '.' || d.tld) = f.source_host
+			GROUP BY domain_host, domain_tld
+		) f ON d.host = f.domain_host AND d.tld = f.domain_tld
 		WHERE d.status != 'skip' AND LOWER(d.host) LIKE $1
 		ORDER BY d.tld ASC, d.host ASC
 		LIMIT $2 OFFSET $3
@@ -515,11 +515,11 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
 		SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
 		FROM domains d
 		INNER JOIN (
-			SELECT source_host, COUNT(*) as feed_count
+			SELECT domain_host, domain_tld, COUNT(*) as feed_count
 			FROM feeds WHERE item_count > 0
-			GROUP BY source_host
-		) f ON (d.host || '.' || d.tld) = f.source_host
+			GROUP BY domain_host, domain_tld
+		) f ON d.host = f.domain_host AND d.tld = f.domain_tld
 		WHERE d.status = $1
 		ORDER BY d.tld ASC, d.host ASC
 		LIMIT $2 OFFSET $3
@@ -530,11 +530,11 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
 		SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
 		FROM domains d
 		INNER JOIN (
-			SELECT source_host, COUNT(*) as feed_count
+			SELECT domain_host, domain_tld, COUNT(*) as feed_count
 			FROM feeds WHERE item_count > 0
-			GROUP BY source_host
-		) f ON (d.host || '.' || d.tld) = f.source_host
+			GROUP BY domain_host, domain_tld
+		) f ON d.host = f.domain_host AND d.tld = f.domain_tld
 		WHERE d.status != 'skip'
 		ORDER BY d.tld ASC, d.host ASC
 		LIMIT $1 OFFSET $2
@@ -683,10 +683,10 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
 	// Apply the same feed filters used for domain selection
 	if len(hosts) > 0 {
 		feedQuery := `
-			SELECT f.source_host, f.url, f.title, f.type, f.status, f.publish_status, f.language,
+			SELECT f.domain_host || '.' || f.domain_tld as source_host, f.url, f.title, f.type, f.status, f.publish_status, f.language,
 				(SELECT COUNT(*) FROM items WHERE feed_url = f.url) as item_count
 			FROM feeds f
-			WHERE f.source_host = ANY($1)`
+			WHERE f.domain_host || '.' || f.domain_tld = ANY($1)`
 		feedArgs := []interface{}{hosts}
 		feedArgNum := 2
@@ -740,7 +740,7 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
 			}
 		}
 
-		feedQuery += " ORDER BY f.source_host, f.url"
+		feedQuery += " ORDER BY f.domain_host, f.domain_tld, f.url"
 
 		feedRows, err := c.db.Query(feedQuery, feedArgs...)
 		if err == nil {
@@ -856,13 +856,17 @@ func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) {
 		fmt.Sscanf(o, "%d", &offset)
 	}
 
+	// Parse host into domain_host and domain_tld
+	domainHost := stripTLD(host)
+	domainTLD := getTLD(host)
+
 	rows, err := c.db.Query(`
 		SELECT url, title, type, status, last_error, item_count, publish_status, language
 		FROM feeds
-		WHERE source_host = $1
+		WHERE domain_host = $1 AND domain_tld = $2
 		ORDER BY url ASC
-		LIMIT $2 OFFSET $3
-	`, host, limit, offset)
+		LIMIT $3 OFFSET $4
+	`, domainHost, domainTLD, limit, offset)
 	if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 		return
@@ -1233,7 +1237,7 @@ func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
 	// If feed filter is specified, query from feeds table instead
 	if len(statusList) > 0 || len(typeList) > 0 || feedMode == "exclude" {
 		// Build query to get TLDs from feeds
-		query := `SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM feeds WHERE tld IS NOT NULL`
+		query := `SELECT domain_tld as tld, COUNT(DISTINCT domain_host || '.' || domain_tld) as domain_count FROM feeds WHERE domain_tld IS NOT NULL`
 		args := []interface{}{}
 		argNum := 1
@@ -1310,14 +1314,14 @@ func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
 			case "domain":
 				// Search domain names
 				if exactMatch && tldFilter != "" {
-					// d:npr.org -> exact match (source_host = 'npr.org')
-					query += fmt.Sprintf(" AND LOWER(source_host) = $%d", argNum)
+					// d:npr.org -> exact match
+					query += fmt.Sprintf(" AND LOWER(domain_host || '.' || domain_tld) = $%d", argNum)
 					args = append(args, strings.ToLower(sq.Pattern))
 				} else if tldFilter != "" {
-					query += fmt.Sprintf(" AND tld = $%d AND LOWER(source_host) LIKE $%d", argNum, argNum+1)
+					query += fmt.Sprintf(" AND domain_tld = $%d AND LOWER(domain_host || '.' || domain_tld) LIKE $%d", argNum, argNum+1)
 					args = append(args, tldFilter, hostSearchPattern)
 				} else {
-					query += fmt.Sprintf(" AND LOWER(source_host) LIKE $%d", argNum)
+					query += fmt.Sprintf(" AND LOWER(domain_host || '.' || domain_tld) LIKE $%d", argNum)
 					args = append(args, hostSearchPattern)
 				}
 			case "url":
@@ -1338,16 +1342,16 @@ func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
 				if sq.DomainHost != "" && sq.DomainTLD != "" {
 					fullDomain := strings.ToLower(sq.DomainHost + "." + sq.DomainTLD)
 					query += fmt.Sprintf(` AND (
-						LOWER(source_host) LIKE $%d OR
+						LOWER(domain_host || '.' || domain_tld) LIKE $%d OR
 						LOWER(url) LIKE $%d OR
 						LOWER(title) LIKE $%d OR
 						LOWER(description) LIKE $%d OR
-						LOWER(source_host) = $%d
+						LOWER(domain_host || '.' || domain_tld) = $%d
 					)`, argNum, argNum, argNum, argNum, argNum+1)
 					args = append(args, searchPattern, fullDomain)
 				} else {
 					query += fmt.Sprintf(` AND (
-						LOWER(source_host) LIKE $%d OR
+						LOWER(domain_host || '.' || domain_tld) LIKE $%d OR
 						LOWER(url) LIKE $%d OR
 						LOWER(title) LIKE $%d OR
 						LOWER(description) LIKE $%d
@@ -1356,7 +1360,7 @@ func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
 					)`, argNum, argNum, argNum, argNum)
 					args = append(args, searchPattern)
 				}
 			}
 		}
-		query += " GROUP BY tld ORDER BY tld ASC"
+		query += " GROUP BY domain_tld ORDER BY domain_tld ASC"
 		rows, err = c.db.Query(query, args...)
 	} else if search != "" {
 		// Parse search prefix for type-specific searching
@@ -1441,20 +1445,20 @@ func (c *Crawler) handleAPITLDStats(w http.ResponseWriter, r *http.Request) {
 				// d:npr.org -> exact match for host "npr" in specified TLD
 				domainWhere = "tld = $1 AND lower(host) = $2"
 				domainArgs = []interface{}{tld, strings.ToLower(hostPart)}
-				feedWhere = "tld = $1 AND lower(source_host) = $2"
+				feedWhere = "domain_tld = $1 AND lower(domain_host || '.' || domain_tld) = $2"
 				feedArgs = []interface{}{tld, strings.ToLower(sq.Pattern)}
 			} else {
 				// d:npr -> pattern match in specified TLD
 				domainWhere = "tld = $1 AND lower(host) LIKE $2"
 				domainArgs = []interface{}{tld, searchPattern}
-				feedWhere = "tld = $1 AND lower(source_host) LIKE $2"
+				feedWhere = "domain_tld = $1 AND lower(domain_host || '.' || domain_tld) LIKE $2"
 				feedArgs = []interface{}{tld, searchPattern}
 			}
 		} else {
 			// Other search types - pattern match
 			domainWhere = "tld = $1 AND lower(host) LIKE $2"
 			domainArgs = []interface{}{tld, searchPattern}
-			feedWhere = "tld = $1 AND lower(source_host) LIKE $2"
+			feedWhere = "domain_tld = $1 AND lower(domain_host || '.' || domain_tld) LIKE $2"
 			feedArgs = []interface{}{tld, searchPattern}
 		}
 		stats["search"] = search
@@ -1462,7 +1466,7 @@ func (c *Crawler) handleAPITLDStats(w http.ResponseWriter, r *http.Request) {
 		// Filter by TLD only
 		domainWhere = "tld = $1"
 		domainArgs = []interface{}{tld}
-		feedWhere = "tld = $1"
+		feedWhere = "domain_tld = $1"
 		feedArgs = []interface{}{tld}
 	}
 
@@ -1614,36 +1618,36 @@ func (c *Crawler) handleAPISearchStats(w http.ResponseWriter, r *http.Request) {
 			// d:npr.org -> exact match
 			domainWhere = "tld = $1 AND LOWER(host) = $2"
 			domainArgs = []interface{}{tldFilter, strings.ToLower(hostPart)}
-			feedWhere = "LOWER(source_host) = $1"
+			feedWhere = "LOWER(domain_host || '.' || domain_tld) = $1"
 			feedArgs = []interface{}{strings.ToLower(sq.Pattern)}
 		} else if tldFilter != "" {
 			domainWhere = "tld = $1 AND LOWER(host) LIKE $2"
 			domainArgs = []interface{}{tldFilter, searchPattern}
-			feedWhere = "tld = $1 AND LOWER(source_host) LIKE $2"
+			feedWhere = "domain_tld = $1 AND LOWER(domain_host || '.' || domain_tld) LIKE $2"
 			feedArgs = []interface{}{tldFilter, searchPattern}
 		} else {
 			domainWhere = "LOWER(host) LIKE $1"
 			domainArgs = []interface{}{searchPattern}
-			feedWhere = "LOWER(source_host) LIKE $1"
+			feedWhere = "LOWER(domain_host || '.' || domain_tld) LIKE $1"
 			feedArgs = []interface{}{searchPattern}
 		}
 	case "url":
-		domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.url) LIKE $1)"
+		domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.domain_host = host AND f.domain_tld = tld AND LOWER(f.url) LIKE $1)"
 		domainArgs = []interface{}{searchPattern}
 		feedWhere = "LOWER(url) LIKE $1"
 		feedArgs = []interface{}{searchPattern}
 	case "title":
-		domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.title) LIKE $1)"
+		domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.domain_host = host AND f.domain_tld = tld AND LOWER(f.title) LIKE $1)"
 		domainArgs = []interface{}{searchPattern}
 		feedWhere = "LOWER(title) LIKE $1"
 		feedArgs = []interface{}{searchPattern}
 	case "description":
-		domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.description) LIKE $1)"
+		domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.domain_host = host AND f.domain_tld = tld AND LOWER(f.description) LIKE $1)"
 		domainArgs = []interface{}{searchPattern}
 		feedWhere = "LOWER(description) LIKE $1"
 		feedArgs = []interface{}{searchPattern}
 	case "item":
-		domainWhere = "EXISTS (SELECT 1 FROM feeds f INNER JOIN items i ON i.feed_url = f.url WHERE f.source_host = (host || '.' || tld) AND LOWER(i.title) LIKE $1)"
+		domainWhere = "EXISTS (SELECT 1 FROM feeds f INNER JOIN items i ON i.feed_url = f.url WHERE f.domain_host = host AND f.domain_tld = tld AND LOWER(i.title) LIKE $1)"
 		domainArgs = []interface{}{searchPattern}
 		feedWhere = "EXISTS (SELECT 1 FROM items i WHERE i.feed_url = url AND LOWER(i.title) LIKE $1)"
 		feedArgs = []interface{}{searchPattern}
@@ -1654,26 +1658,26 @@ func (c *Crawler) handleAPISearchStats(w http.ResponseWriter, r *http.Request) {
 		if sq.DomainHost != "" && sq.DomainTLD != "" {
 			domainWhere = `(
 				LOWER(host) LIKE $1 OR
 				(LOWER(host) = $2 AND tld::text = $3) OR
-				EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND (
+				EXISTS (SELECT 1 FROM feeds f WHERE f.domain_host = host AND f.domain_tld = tld AND (
 					LOWER(f.url) LIKE $1 OR LOWER(f.title) LIKE $1 OR LOWER(f.description) LIKE $1
 				))
 			)`
 			domainArgs = []interface{}{searchPattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)}
 			fullDomain := strings.ToLower(sq.DomainHost + "." + sq.DomainTLD)
 			feedWhere = `(
-				LOWER(source_host) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1 OR LOWER(source_host) = $2
+				LOWER(domain_host || '.' || domain_tld) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1 OR LOWER(domain_host || '.' || domain_tld) = $2
 			)`
 			feedArgs = []interface{}{searchPattern, fullDomain}
 		} else {
 			domainWhere = `(
 				LOWER(host) LIKE $1 OR
-				EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND (
+				EXISTS (SELECT 1 FROM feeds f WHERE f.domain_host = host AND f.domain_tld = tld AND (
 					LOWER(f.url) LIKE $1 OR LOWER(f.title) LIKE $1 OR LOWER(f.description) LIKE $1
 				))
 			)`
 			domainArgs = []interface{}{searchPattern}
 			feedWhere = `(
-				LOWER(source_host) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1
+				LOWER(domain_host || '.' || domain_tld) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1
 			)`
 			feedArgs = []interface{}{searchPattern}
 		}
@@ -1834,11 +1838,13 @@ func getPDSCredentials() (pdsHost, pdsAdminPassword string) {
 
 // getDomainDIDs returns all unique publish_account DIDs for a domain's feeds
 func (c *Crawler) getDomainDIDs(host string) []string {
+	domainHost := stripTLD(host)
+	domainTLD := getTLD(host)
 	var dids []string
 	rows, err := c.db.Query(`
 		SELECT DISTINCT publish_account FROM feeds
-		WHERE source_host = $1 AND publish_account IS NOT NULL AND publish_account != ''
-	`, host)
+		WHERE domain_host = $1 AND domain_tld = $2 AND publish_account IS NOT NULL AND publish_account != ''
+	`, domainHost, domainTLD)
 	if err == nil {
 		defer rows.Close()
 		for rows.Next() {
@@ -1871,10 +1877,12 @@ func (c *Crawler) skipDomain(host string) DomainActionResult {
 	}
 
 	// Mark feeds as skipped (but don't delete)
+	domainHost := stripTLD(host)
+	domainTLD := getTLD(host)
 	feedsAffected, err := c.db.Exec(`
 		UPDATE feeds SET status = 'skip', publish_status = 'skip'
-		WHERE source_host = $1
-	`, host)
+		WHERE domain_host = $1 AND domain_tld = $2
+	`, domainHost, domainTLD)
 	if err != nil {
 		result.Error = fmt.Sprintf("failed to update feeds: %v", err)
 		return result
@@ -1942,8 +1950,10 @@ func (c *Crawler) dropDomain(host string) DomainActionResult {
 	}
 
 	// Get feed URLs for this domain (needed to delete items)
+	domainHost := stripTLD(host)
+	domainTLD := getTLD(host)
 	var feedURLs []string
-	feedRows, err := c.db.Query(`SELECT url FROM feeds WHERE source_host = $1`, host)
+	feedRows, err := c.db.Query(`SELECT url FROM feeds WHERE domain_host = $1 AND domain_tld = $2`, domainHost, domainTLD)
 	if err == nil {
 		defer feedRows.Close()
 		for feedRows.Next() {
@@ -1963,7 +1973,7 @@ func (c *Crawler) dropDomain(host string) DomainActionResult {
 	}
 
 	// Delete all feeds from this domain
-	feedsDeleted, err := c.db.Exec(`DELETE FROM feeds WHERE source_host = $1`, host)
+	feedsDeleted, err := c.db.Exec(`DELETE FROM feeds WHERE domain_host = $1 AND domain_tld = $2`, domainHost, domainTLD)
 	if err != nil {
 		result.Error = fmt.Sprintf("failed to delete feeds: %v", err)
 		return result
@@ -2031,10 +2041,12 @@ func (c *Crawler) restoreDomain(host string) DomainActionResult {
 	}
 
 	// Restore feeds to pass status
+	domainHost := stripTLD(host)
+	domainTLD := getTLD(host)
 	feedsAffected, err := c.db.Exec(`
 		UPDATE feeds SET status = 'pass', publish_status = 'pass'
-		WHERE source_host = $1
-	`, host)
+		WHERE domain_host = $1 AND domain_tld = $2
+	`, domainHost, domainTLD)
 	if err != nil {
 		result.Error = fmt.Sprintf("failed to update feeds: %v", err)
 		return result
diff --git a/api_feeds.go b/api_feeds.go
index 2467f1d..73b4bd4 100644
--- a/api_feeds.go
+++ b/api_feeds.go
@@ -154,7 +154,7 @@ func (c *Crawler) handleAPIFeedsByStatus(w http.ResponseWriter, r *http.Request) {
 	}
 
 	rows, err := c.db.Query(`
-		SELECT url, title, type, source_host, tld, status, last_error, item_count
+		SELECT url, title, type, domain_host || '.' || domain_tld as source_host, domain_tld as tld, status, last_error, item_count
 		FROM feeds
 		WHERE status = $1
 		ORDER BY url ASC
@@ -218,7 +218,7 @@ func (c *Crawler) handleAPIFeeds(w http.ResponseWriter, r *http.Request) {
 	var err error
 	if publishStatus != "" {
 		rows, err = c.db.Query(`
-			SELECT url, title, type, source_host, tld, status, last_error, item_count, publish_status, language
+			SELECT url, title, type, domain_host || '.' || domain_tld as source_host, domain_tld as tld, status, last_error, item_count, publish_status, language
 			FROM feeds
 			WHERE publish_status = $1
 			ORDER BY url ASC
@@ -226,7 +226,7 @@ func (c *Crawler) handleAPIFeeds(w http.ResponseWriter, r *http.Request) {
 		`, publishStatus, limit, offset)
 	} else {
 		rows, err = c.db.Query(`
-			SELECT url, title, type, source_host, tld, status, last_error, item_count, publish_status, language
+			SELECT url, title, type, domain_host || '.' || domain_tld as source_host, domain_tld as tld, status, last_error, item_count, publish_status, language
 			FROM feeds
 			ORDER BY url ASC
 			LIMIT $1 OFFSET $2
@@ -279,19 +279,22 @@ func (c *Crawler) filterFeeds(w http.ResponseWriter, tld, domain, status string,
 	var args []interface{}
 	argNum := 1
 	query := `
-		SELECT url, title, type, category, source_host, tld, status, last_error, item_count, language
+		SELECT url, title, type, category, domain_host || '.' || domain_tld as source_host, domain_tld as tld, status, last_error, item_count, language
 		FROM feeds
 		WHERE 1=1`
 
 	if tld != "" {
-		query += fmt.Sprintf(" AND tld = $%d", argNum)
+		query += fmt.Sprintf(" AND domain_tld = $%d", argNum)
 		args = append(args, tld)
 		argNum++
 	}
 	if domain != "" {
-		query += fmt.Sprintf(" AND source_host = $%d", argNum)
-		args = append(args, domain)
-		argNum++
+		// Parse domain into host and tld parts
+		domainHost := stripTLD(domain)
+		domainTLD := getTLD(domain)
+		query += fmt.Sprintf(" AND domain_host = $%d AND domain_tld = $%d", argNum, argNum+1)
+		args = append(args, domainHost, domainTLD)
+		argNum += 2
 	}
 	if status != "" {
 		query += fmt.Sprintf(" AND status = $%d", argNum)
diff --git a/api_publish.go b/api_publish.go
index 22606fa..176b8b4 100644
--- a/api_publish.go
+++ b/api_publish.go
@@ -150,7 +150,7 @@ func (c *Crawler) handleAPIPublishDenied(w http.ResponseWriter, r *http.Request) {
 		result = append(result, FeedDeniedInfo{
 			URL:        f.URL,
 			Title:      f.Title,
-			SourceHost: f.SourceHost,
+			SourceHost: fullHost(f.DomainHost, f.DomainTLD),
 		})
 	}
 
@@ -193,7 +193,7 @@ func (c *Crawler) handleAPIPublishCandidates(w http.ResponseWriter, r *http.Request) {
 			URL:           f.URL,
 			Title:         f.Title,
 			Category:      f.Category,
-			SourceHost:    f.SourceHost,
+			SourceHost:    fullHost(f.DomainHost, f.DomainTLD),
 			ItemCount:     f.ItemCount,
 			DerivedHandle: DeriveHandleFromFeed(f.URL),
 		})
@@ -346,9 +346,10 @@ func (c *Crawler) ensureFeedAccountExists(feedURL, account string) (bool, error) {
 	// Set up profile
 	feed, _ := c.getFeed(feedURL)
 	if feed != nil {
+		sourceHost := fullHost(feed.DomainHost, feed.DomainTLD)
 		displayName := feed.Title
 		if displayName == "" {
-			displayName = feed.SourceHost
+			displayName = sourceHost
 		}
 		description := feed.Description
 		if description == "" {
@@ -366,7 +367,7 @@ func (c *Crawler) ensureFeedAccountExists(feedURL, account string) (bool, error) {
 		// Try to fetch favicon
 		var avatar *BlobRef
-		faviconData, mimeType, err := FetchFaviconBytes(feed.SourceHost)
+		faviconData, mimeType, err := FetchFaviconBytes(sourceHost)
 		if err == nil && len(faviconData) > 0 {
 			avatar, _ = publisher.UploadBlob(session, faviconData, mimeType)
 		}
@@ -819,15 +820,16 @@ func (c *Crawler) handleAPIPublishFeedFull(w http.ResponseWriter, r *http.Request) {
 	fmt.Printf("Created account: %s (%s)\n", session.Handle, session.DID)
 
 	// Set up profile with feed title and favicon
+	sourceHost := fullHost(feed.DomainHost, feed.DomainTLD)
 	displayName := feed.Title
 	if displayName == "" {
-		displayName = feed.SourceHost
+		displayName = sourceHost
 	}
 	description := feed.Description
 
 	// Try to fetch favicon for avatar
 	var avatar *BlobRef
-	faviconData, mimeType, err := FetchFaviconBytes(feed.SourceHost)
+	faviconData, mimeType, err := FetchFaviconBytes(sourceHost)
 	if err == nil && len(faviconData) > 0 {
 		avatar, err = publisher.UploadBlob(session, faviconData, mimeType)
 		if err != nil {
diff --git a/api_search.go b/api_search.go
index c457ada..4a9b003 100644
--- a/api_search.go
+++ b/api_search.go
@@ -138,17 +138,17 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
 		return url, sf, true
 	}
 
-	// Search feeds by source_host (LIKE search for domain matching)
+	// Search feeds by domain_host (LIKE search for domain matching)
 	// Use LOWER() to leverage trigram index
 	lowerPattern := "%" + strings.ToLower(query) + "%"
 	hostRows, err := c.db.Query(`
 		SELECT url, type, category, title, description, language, site_url,
 			discovered_at, last_checked_at, next_check_at, last_build_date,
 			status, last_error, last_error_at,
-			source_url, source_host, tld,
+			source_url, domain_host || '.' || domain_tld as source_host, domain_tld as tld,
 			item_count, oldest_item_date, newest_item_date, no_update
 		FROM feeds
-		WHERE LOWER(source_host) LIKE $1 OR LOWER(url) LIKE $1
+		WHERE LOWER(domain_host || '.' || domain_tld) LIKE $1 OR LOWER(url) LIKE $1
 		LIMIT $2
 	`, lowerPattern, limit)
 	if err == nil {
@@ -168,7 +168,7 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
 		SELECT url, type, category, title, description, language, site_url,
 			discovered_at, last_checked_at, next_check_at, last_build_date,
 			status, last_error, last_error_at,
-			source_url, source_host, tld,
+			source_url, domain_host || '.' || domain_tld as source_host, domain_tld as tld,
 			item_count, oldest_item_date, newest_item_date, no_update
 		FROM feeds
 		WHERE search_vector @@ to_tsquery('english', $1)
@@ -243,7 +243,7 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
 		SELECT type, category, title, description, language, site_url,
 			discovered_at, last_checked_at, next_check_at, last_build_date,
 			status, last_error, last_error_at,
-			source_url, source_host, tld,
+			source_url, domain_host || '.' || domain_tld as source_host, domain_tld as tld,
 			item_count, oldest_item_date, newest_item_date, no_update
 		FROM feeds WHERE url = $1
 	`, feedUrl).Scan(&fType, &fCategory, &fTitle, &fDesc, &fLang, &fSiteUrl,
diff --git a/crawler.go b/crawler.go
index 4d0fd4d..0bee146 100644
--- a/crawler.go
+++ b/crawler.go
@@ -2,6 +2,7 @@ package main
 
 import (
 	"context"
+	"crypto/tls"
 	"encoding/json"
 	"fmt"
 	"io"
@@ -42,16 +43,35 @@ func NewCrawler(connString string) (*Crawler, error) {
 		return nil, fmt.Errorf("failed to open database: %v", err)
 	}
 
+	// Custom transport with longer timeouts (HTTP/2 disabled for compatibility)
+	transport := &http.Transport{
+		TLSClientConfig: &tls.Config{
+			MinVersion: tls.VersionTLS12,
+			NextProtos: []string{"http/1.1"}, // Force HTTP/1.1 for compatibility
+		},
+		DialContext: (&net.Dialer{
+			Timeout:   30 * time.Second,
+			KeepAlive: 30 * time.Second,
+		}).DialContext,
+		ForceAttemptHTTP2:     false,
+		MaxIdleConns:          100,
+		IdleConnTimeout:       90 * time.Second,
+		TLSHandshakeTimeout:   30 * time.Second,
+		ExpectContinueTimeout: 1 * time.Second,
+		ResponseHeaderTimeout: 60 * time.Second,
+	}
+
 	return &Crawler{
 		MaxDepth:        10,
 		MaxPagesPerHost: 10,
-		Timeout:         10 * time.Second,
-		UserAgent:       "FeedCrawler/1.0",
+		Timeout:         60 * time.Second,
+		UserAgent:       "Mozilla/5.0 (compatible; FeedCrawler/1.0; +https://1440.news)",
 		startTime:       time.Now(),
 		db:              db,
 		shutdownCh:      make(chan struct{}),
 		client: &http.Client{
-			Timeout: 10 * time.Second,
+			Timeout:   60 * time.Second,
+			Transport: transport,
 			CheckRedirect: func(req *http.Request, via []*http.Request) error {
 				if len(via) >= 10 {
 					return fmt.Errorf("stopped after 10 redirects")
@@ -347,7 +367,7 @@ type FeedInfo struct {
 func (c *Crawler) getFeedInfo(feedURL string) *FeedInfo {
 	var title, description, siteURL, sourceHost *string
 	err := c.db.QueryRow(`
-		SELECT title, description, site_url, source_host FROM feeds WHERE url = $1
+		SELECT title, description, site_url, domain_host || '.' || domain_tld as source_host FROM feeds WHERE url = $1
 	`, feedURL).Scan(&title, &description, &siteURL, &sourceHost)
 	if err != nil {
 		return nil
@@ -363,7 +383,7 @@ func (c *Crawler) getFeedInfo(feedURL string) *FeedInfo {
 // RefreshAllProfiles updates profiles for all existing accounts with feed URLs
 func (c *Crawler) RefreshAllProfiles(publisher *Publisher, feedPassword string) {
 	rows, err := c.db.Query(`
-		SELECT url, title, description, site_url, source_host, publish_account
+		SELECT url, title, description, site_url, domain_host || '.' || domain_tld as source_host, publish_account
 		FROM feeds
 		WHERE publish_account IS NOT NULL AND publish_account <> ''
 	`)
diff --git a/dashboard.go b/dashboard.go
index 1265623..5ff21fa 100644
--- a/dashboard.go
+++ b/dashboard.go
@@ -92,9 +92,9 @@ func (c *Crawler) UpdateStats() {
 
 func (c *Crawler) fetchAllDomainsFromDB() []DomainStat {
 	rows, err := c.db.Query(`
-		SELECT tld, source_host, COUNT(*) as cnt FROM feeds
-		GROUP BY tld, source_host
-		ORDER BY tld, source_host
+		SELECT domain_tld as tld, domain_host || '.' || domain_tld as source_host, COUNT(*) as cnt FROM feeds
+		GROUP BY domain_tld, domain_host
+		ORDER BY domain_tld, domain_host
 	`)
 	if err != nil {
 		fmt.Printf("fetchAllDomainsFromDB error: %v\n", err)
diff --git a/db.go b/db.go
index e19f499..f8738ad 100644
--- a/db.go
+++ b/db.go
@@ -36,14 +36,17 @@ CREATE INDEX IF NOT EXISTS idx_domains_host_trgm ON domains USING GIN(host gin_trgm_ops);
 
 CREATE TABLE IF NOT EXISTS feeds (
 	url TEXT PRIMARY KEY,
+	domain_host TEXT NOT NULL,
+	domain_tld tld_enum NOT NULL,
 	type TEXT,
 	category TEXT DEFAULT 'main',
 	title TEXT,
 	description TEXT,
 	language TEXT,
 	site_url TEXT,
+	source_url TEXT,
 
-	discovered_at TIMESTAMP NOT NULL,
+	discovered_at TIMESTAMP NOT NULL DEFAULT NOW(),
 	last_checked_at TIMESTAMP, -- feed_check: when last checked for new items
 	next_check_at TIMESTAMP,   -- feed_check: when to next check
 	last_build_date TIMESTAMP,
@@ -51,134 +54,67 @@ CREATE TABLE IF NOT EXISTS feeds (
 	etag TEXT,
 	last_modified TEXT,
 
-	status TEXT DEFAULT 'pass' CHECK(status IN ('hold', 'pass', 'skip')),
+	status TEXT NOT NULL DEFAULT 'pass',
 	last_error TEXT,
 	last_error_at TIMESTAMP,
 
-	source_url TEXT,
-	source_host TEXT,
-	tld TEXT,
-
-	item_count INTEGER,
+	item_count INTEGER NOT NULL DEFAULT 0,
 	oldest_item_date TIMESTAMP,
 	newest_item_date TIMESTAMP,
-	no_update INTEGER DEFAULT 0,
+	no_update INTEGER NOT NULL DEFAULT 0,
 
 	-- Publishing to PDS
-	publish_status TEXT DEFAULT 'hold' CHECK(publish_status IN ('hold', 'pass', 'skip')),
+	publish_status TEXT NOT NULL DEFAULT 'hold',
 	publish_account TEXT,
 
-	-- Full-text search vector
-	search_vector tsvector GENERATED ALWAYS AS (
-		setweight(to_tsvector('english', coalesce(title, '')), 'A') ||
-		setweight(to_tsvector('english', coalesce(description, '')), 'B') ||
-		setweight(to_tsvector('english', coalesce(url, '')), 'C')
-	) STORED
+	FOREIGN KEY (domain_host, domain_tld) REFERENCES domains(host, tld)
 );
 
-CREATE INDEX IF NOT EXISTS idx_feeds_source_host ON feeds(source_host);
-CREATE INDEX IF NOT EXISTS idx_feeds_publish_status ON feeds(publish_status);
-CREATE INDEX IF NOT EXISTS idx_feeds_source_host_url ON feeds(source_host, url);
-CREATE INDEX IF NOT EXISTS idx_feeds_tld ON feeds(tld);
-CREATE INDEX IF NOT EXISTS idx_feeds_tld_source_host ON feeds(tld, source_host);
-CREATE INDEX IF NOT EXISTS idx_feeds_source_host_trgm ON feeds USING GIN(source_host gin_trgm_ops);
-CREATE INDEX IF NOT EXISTS idx_feeds_type ON feeds(type);
-CREATE INDEX IF NOT EXISTS idx_feeds_category ON feeds(category);
-CREATE INDEX IF NOT EXISTS idx_feeds_status ON feeds(status);
-CREATE INDEX IF NOT EXISTS idx_feeds_discovered_at ON feeds(discovered_at);
-CREATE INDEX IF NOT EXISTS idx_feeds_title ON feeds(title);
-CREATE INDEX IF NOT EXISTS idx_feeds_search ON feeds USING GIN(search_vector);
--- idx_feeds_to_check created in migrations after column rename
+-- Indexes will be added as needed based on query patterns
 
 CREATE TABLE IF NOT EXISTS items (
-	id BIGSERIAL PRIMARY KEY,
-	feed_url TEXT NOT NULL,
-	guid TEXT,
+	guid TEXT NOT NULL,
+	feed_url TEXT NOT NULL REFERENCES feeds(url) ON DELETE CASCADE,
 	title TEXT,
 	link TEXT,
 	description TEXT,
 	content TEXT,
 	author TEXT,
 	pub_date TIMESTAMP,
-	discovered_at TIMESTAMP NOT NULL,
+	discovered_at TIMESTAMP NOT NULL DEFAULT NOW(),
 	updated_at TIMESTAMP,
 
 	-- Media attachments
 	enclosure_url TEXT,
 	enclosure_type TEXT,
 	enclosure_length BIGINT,
-	image_urls TEXT, -- JSON array of image URLs
-	tags TEXT,       -- JSON array of category/tag strings
+	image_urls JSONB,
+	tags JSONB,
 
 	-- Publishing to PDS
 	published_at TIMESTAMP,
 	published_uri TEXT,
 
-	-- Full-text search vector
-	search_vector tsvector GENERATED ALWAYS AS (
-		setweight(to_tsvector('english', coalesce(title, '')), 'A') ||
-		setweight(to_tsvector('english', coalesce(description, '')), 'B') ||
-		setweight(to_tsvector('english', coalesce(content, '')), 'C') ||
-		setweight(to_tsvector('english', coalesce(author, '')), 'D')
-	) STORED,
-
-	UNIQUE(feed_url, guid)
+	PRIMARY KEY (guid, feed_url)
 );
 
-CREATE INDEX IF NOT EXISTS idx_items_feed_url ON items(feed_url);
-CREATE INDEX IF NOT EXISTS idx_items_pub_date ON items(pub_date DESC);
-CREATE INDEX IF NOT EXISTS idx_items_link ON items(link);
-CREATE INDEX IF NOT EXISTS idx_items_feed_url_pub_date ON items(feed_url, pub_date DESC);
-CREATE INDEX IF NOT EXISTS idx_items_unpublished ON items(feed_url, published_at) WHERE published_at IS NULL;
-CREATE INDEX IF NOT EXISTS idx_items_search ON items USING GIN(search_vector);
+-- Indexes will be added as needed based on query patterns
 
--- URL Shortener tables
-CREATE TABLE IF NOT EXISTS short_urls (
-	code TEXT PRIMARY KEY,
-	original_url TEXT NOT NULL,
-	item_id BIGINT REFERENCES items(id),
-	feed_url TEXT,
-	created_at TIMESTAMP NOT NULL DEFAULT (NOW() AT TIME ZONE 'UTC'),
-	click_count INTEGER DEFAULT 0
-);
-
-CREATE INDEX IF NOT EXISTS idx_short_urls_original ON short_urls(original_url);
-CREATE INDEX IF NOT EXISTS idx_short_urls_item_id ON short_urls(item_id);
-CREATE INDEX IF NOT EXISTS idx_short_urls_feed_url ON short_urls(feed_url);
-
-CREATE TABLE IF NOT EXISTS clicks (
-	id BIGSERIAL PRIMARY KEY,
-	short_code TEXT NOT NULL REFERENCES short_urls(code),
-	clicked_at TIMESTAMP NOT NULL DEFAULT (NOW() AT TIME ZONE 'UTC'),
-	referrer TEXT,
-	user_agent TEXT,
-	ip_hash TEXT,
-	country TEXT
-);
-
-CREATE INDEX IF NOT EXISTS idx_clicks_short_code ON clicks(short_code);
-CREATE INDEX IF NOT EXISTS idx_clicks_clicked_at ON clicks(clicked_at DESC);
-
--- OAuth sessions (persisted for login persistence across deploys)
-CREATE TABLE IF NOT EXISTS oauth_sessions (
+-- OAuth sessions
+CREATE TABLE IF NOT EXISTS sessions (
 	id TEXT PRIMARY KEY,
 	did TEXT NOT NULL,
 	handle TEXT NOT NULL,
-	created_at TIMESTAMP NOT NULL,
-	expires_at TIMESTAMP NOT NULL,
-	access_token TEXT,
+	access_token TEXT NOT NULL,
 	refresh_token TEXT,
-	token_expiry TIMESTAMP,
-	dpop_private_jwk TEXT,
-	dpop_authserver_nonce TEXT,
-	dpop_pds_nonce TEXT,
-	pds_url TEXT,
-	authserver_iss TEXT
+	token_type TEXT NOT NULL DEFAULT 'DPoP',
+	expires_at TIMESTAMP NOT NULL,
+	created_at TIMESTAMP NOT NULL DEFAULT NOW(),
+	dpop_nonce TEXT,
+	dpop_private_jwk TEXT
 );
 
-CREATE INDEX IF NOT EXISTS idx_oauth_sessions_expires_at ON oauth_sessions(expires_at);
-
 -- Trigger to normalize feed URLs on insert/update (strips https://, http://, www.)
 CREATE OR REPLACE FUNCTION normalize_feed_url()
 RETURNS TRIGGER AS $$
@@ -212,8 +148,8 @@ func OpenDatabase(connString string) (*DB, error) {
 	// Build from individual env vars
 	host := getEnvOrDefault("DB_HOST", "atproto-postgres")
 	port := getEnvOrDefault("DB_PORT", "5432")
-	user := getEnvOrDefault("DB_USER", "news_1440")
-	dbname := getEnvOrDefault("DB_NAME", "news_1440")
+	user := getEnvOrDefault("DB_USER", "dba_1440_news")
+	dbname := getEnvOrDefault("DB_NAME", "db_1440_news")
 
 	// Support Docker secrets (password file) or direct password
 	password := os.Getenv("DB_PASSWORD")
@@ -271,7 +207,7 @@ func OpenDatabase(connString string) (*DB, error) {
 	// Indexes must match LOWER() used in queries
 	pool.Exec(ctx, "CREATE EXTENSION IF NOT EXISTS pg_trgm")
 	pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_domains_host_trgm ON domains USING gin (LOWER(host) gin_trgm_ops)")
-	pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_feeds_source_host_trgm ON feeds USING gin (LOWER(source_host) gin_trgm_ops)")
+	pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_feeds_domain_host_trgm ON feeds USING gin (LOWER(domain_host) gin_trgm_ops)")
 
 	// Migration: rename feed columns for consistent terminology
 	// last_crawled_at -> last_checked_at (feed_check = checking feeds for new items)
diff --git a/feed.go b/feed.go
index 083f4fa..a1c879b 100644
--- a/feed.go
+++ b/feed.go
@@ -116,8 +116,8 @@ type Feed struct {
 	// Discovery source
 	SourceURL  string `json:"source_url,omitempty"`
-	SourceHost string `json:"source_host,omitempty"`
-	TLD        string `json:"tld,omitempty"`
+	DomainHost string `json:"domain_host,omitempty"`
+	DomainTLD  string `json:"domain_tld,omitempty"`
 
 	// Content stats
 	ItemCount int `json:"item_count,omitempty"` // Number of items in last feed_check
@@ -139,7 +139,7 @@ func (c *Crawler) saveFeed(feed *Feed) error {
 	// Auto-pass feeds from our own domain
 	publishStatus := feed.PublishStatus
 	if publishStatus == "" {
-		if strings.HasSuffix(feed.SourceHost, "1440.news") || feed.SourceHost == "1440.news" {
+		if host := fullHost(feed.DomainHost, feed.DomainTLD); strings.HasSuffix(host, "1440.news") || host == "1440.news" {
 			publishStatus = "pass"
 		} else if feed.Language == "" || (feed.Language != "en" && !strings.HasPrefix(feed.Language, "en-")) {
 			publishStatus = "skip"
@@ -156,7 +156,7 @@ func (c *Crawler) saveFeed(feed *Feed) error {
 		discovered_at, last_checked_at, next_check_at, last_build_date,
 		etag, last_modified,
 		status, last_error, last_error_at,
-		source_url, source_host, tld,
+		source_url, domain_host, domain_tld,
 		item_count, oldest_item_date, newest_item_date,
 		no_update,
 		publish_status, publish_account
@@ -188,7 +188,7 @@ func (c *Crawler) saveFeed(feed *Feed) error {
 		feed.DiscoveredAt, NullableTime(feed.LastCheckedAt), NullableTime(feed.NextCheckAt), NullableTime(feed.LastBuildDate),
 		NullableString(feed.ETag), NullableString(feed.LastModified),
 		feed.Status, NullableString(feed.LastError), NullableTime(feed.LastErrorAt),
-		NullableString(feed.SourceURL), NullableString(feed.SourceHost), NullableString(feed.TLD),
+		NullableString(feed.SourceURL), NullableString(feed.DomainHost), NullableString(feed.DomainTLD),
 		feed.ItemCount, NullableTime(feed.OldestItemDate), NullableTime(feed.NewestItemDate),
 		feed.NoUpdate,
 		publishStatus, NullableString(feed.PublishAccount),
@@ -201,7 +201,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
 	feed := &Feed{}
 	var category, title, description, language, siteURL *string
 	var lastCheckedAt, nextCheckAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time
-	var etag, lastModified, lastError, sourceURL, sourceHost, tld *string
+	var etag, lastModified, lastError, sourceURL, domainHost, domainTLD *string
 	var publishStatus, publishAccount *string
 	var itemCount, noUpdate *int
 
@@ -210,7 +210,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
 		discovered_at, last_checked_at, next_check_at, last_build_date,
 		etag, last_modified,
 		status, last_error, last_error_at,
-		source_url, source_host, tld,
+		source_url, domain_host, domain_tld,
 		item_count, oldest_item_date, newest_item_date,
 		no_update,
 		publish_status, publish_account
@@ -220,7 +220,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
 		&feed.DiscoveredAt, &lastCheckedAt, &nextCheckAt, &lastBuildDate,
 		&etag, &lastModified,
 		&feed.Status, &lastError, &lastErrorAt,
-		&sourceURL, &sourceHost, &tld,
+		&sourceURL, &domainHost, &domainTLD,
 		&itemCount, &oldestItemDate, &newestItemDate,
 		&noUpdate,
 		&publishStatus, &publishAccount,
@@ -251,8 +251,8 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
 	feed.LastError = StringValue(lastError)
 	feed.LastErrorAt = TimeValue(lastErrorAt)
 	feed.SourceURL = StringValue(sourceURL)
-	feed.SourceHost = StringValue(sourceHost)
-	feed.TLD = StringValue(tld)
+	feed.DomainHost = StringValue(domainHost)
+	feed.DomainTLD = StringValue(domainTLD)
 	if itemCount != nil {
 		feed.ItemCount = *itemCount
 	}
@@ -285,7 +285,7 @@ func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
 		discovered_at, last_checked_at, next_check_at, last_build_date,
 		etag, last_modified,
 		status, last_error, last_error_at,
-		source_url, source_host, tld,
+		source_url, domain_host, domain_tld,
 		item_count, oldest_item_date, newest_item_date,
 		no_update,
 		publish_status, publish_account
@@ -309,7 +309,7 @@ func (c *Crawler) GetFeedCount() (int, error) {
 // GetFeedCountByHost returns the number of feeds for a specific host
 func (c *Crawler) GetFeedCountByHost(host string) (int, error) {
 	var count int
-	err := c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE source_host = $1", host).Scan(&count)
+	err := c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE domain_host = $1 AND domain_tld = $2", stripTLD(host), getTLD(host)).Scan(&count)
 	return count, err
 }
 
@@ -320,7 +320,7 @@ func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
 		discovered_at, last_checked_at, next_check_at, last_build_date,
 		etag, last_modified,
 		status, last_error, last_error_at,
-		source_url, source_host, tld,
+		source_url, domain_host, domain_tld,
 		item_count, oldest_item_date, newest_item_date,
 		no_update,
 		publish_status, publish_account
@@ -344,11 +344,11 @@ func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
 		discovered_at, last_checked_at, next_check_at, last_build_date,
 		etag, last_modified,
 		status, last_error, last_error_at,
-		source_url, source_host, tld,
+		source_url, domain_host, domain_tld,
 		item_count, oldest_item_date, newest_item_date,
 		no_update,
 		publish_status, publish_account
-	FROM feeds WHERE source_host = $1
-	`, host)
+	FROM feeds WHERE domain_host = $1 AND domain_tld = $2
+	`, stripTLD(host), getTLD(host))
 	if err != nil {
 		return nil, err
@@ -366,7 +366,7 @@ func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) {
 		discovered_at, last_checked_at, next_check_at, last_build_date,
 		etag, last_modified,
 		status, last_error, last_error_at,
-		source_url, source_host, tld,
+		source_url, domain_host, domain_tld,
 		item_count, oldest_item_date, newest_item_date,
 		no_update,
 		publish_status, publish_account
@@ -390,7 +390,7 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
 	feed := &Feed{}
 	var feedType, category, title, description, language, siteURL *string
 	var lastCheckedAt, nextCheckAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time
-	var etag, lastModified, lastError, sourceURL, sourceHost, tld *string
+	var etag, lastModified, lastError, sourceURL, domainHost, domainTLD *string
 	var itemCount, noUpdate *int
 	var status *string
 	var publishStatus, publishAccount *string
@@ -400,7 +400,7 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
 		&feed.DiscoveredAt, &lastCheckedAt, &nextCheckAt, &lastBuildDate,
 		&etag, &lastModified,
 		&status, &lastError, &lastErrorAt,
-		&sourceURL, &sourceHost, &tld,
+		&sourceURL, &domainHost, &domainTLD,
 		&itemCount, &oldestItemDate, &newestItemDate,
 		&noUpdate,
 		&publishStatus, &publishAccount,
@@ -428,8 +428,8 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
 	feed.LastError = StringValue(lastError)
 	feed.LastErrorAt = TimeValue(lastErrorAt)
 	feed.SourceURL = StringValue(sourceURL)
-	feed.SourceHost = StringValue(sourceHost)
-	feed.TLD = StringValue(tld)
+	feed.DomainHost = StringValue(domainHost)
+	feed.DomainTLD = StringValue(domainTLD)
 	if itemCount != nil {
 		feed.ItemCount = *itemCount
 	}
@@ -474,7 +474,7 @@ func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) {
 		discovered_at, last_checked_at, next_check_at, last_build_date,
 		etag, last_modified,
 		status, last_error, last_error_at,
-		source_url, source_host, tld,
+		source_url, domain_host, domain_tld,
 		item_count, oldest_item_date, newest_item_date,
 		no_update,
 		publish_status, publish_account
@@ -496,7 +496,7 @@ func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
 		discovered_at, last_checked_at, next_check_at, last_build_date,
 		etag, last_modified,
 		status, last_error, last_error_at,
-		source_url, source_host, tld,
+		source_url, domain_host, domain_tld,
 		item_count, oldest_item_date, newest_item_date,
 		no_update,
 		publish_status, publish_account
diff --git a/feed_check.go b/feed_check.go
index 742430e..808b13b 100644
--- a/feed_check.go
+++ b/feed_check.go
@@ -33,8 +33,8 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
 		DiscoveredAt:  now,
 		LastCheckedAt: now,
 		Status:        "pass",
-		SourceHost:    sourceHost,
-		TLD:           getTLD(sourceHost),
+		DomainHost:    getDomainHost(sourceHost),
+		DomainTLD:     getTLD(sourceHost),
 		ETag:          headers.Get("ETag"),
 		LastModified:  headers.Get("Last-Modified"),
 	}
@@ -90,8 +90,8 @@ func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
 		DiscoveredAt: now,
 		Status:       "pass",
 		SourceURL:    normalizeURL(sourceURL),
-		SourceHost:   sourceHost,
-		TLD:          getTLD(sourceHost),
+		DomainHost:   getDomainHost(sourceHost),
+		DomainTLD:    getTLD(sourceHost),
 		NextCheckAt:  now, // Should be crawled immediately
 	}
diff --git a/templates.go b/templates.go
index dd512d1..f7b1efe 100644
--- a/templates.go
+++ b/templates.go
@@ -445,8 +445,8 @@ const dashboardHTML = `
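
Note: the patch calls four host/TLD helpers — stripTLD, getTLD, getDomainHost, and fullHost — none of which are defined in the files shown. A minimal sketch of the behavior the call sites appear to assume (the names come from the diff; the bodies are guesses, and the real implementations may handle multi-label suffixes such as co.uk differently):

```go
package main

import "strings"

// getTLD returns the last dot-separated label of a host ("npr.org" -> "org").
func getTLD(host string) string {
	if i := strings.LastIndex(host, "."); i >= 0 {
		return host[i+1:]
	}
	return ""
}

// stripTLD returns the host with its final label removed ("npr.org" -> "npr").
func stripTLD(host string) string {
	if i := strings.LastIndex(host, "."); i >= 0 {
		return host[:i]
	}
	return host
}

// getDomainHost is assumed to be equivalent to stripTLD; feed_check.go uses
// this name while api_domains.go and api_feeds.go call stripTLD.
func getDomainHost(host string) string { return stripTLD(host) }

// fullHost rebuilds the full source host from the two columns
// ("npr", "org" -> "npr.org").
func fullHost(domainHost, domainTLD string) string {
	if domainTLD == "" {
		return domainHost
	}
	return domainHost + "." + domainTLD
}
```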
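The schema change drops feeds.source_host and feeds.tld in favor of domain_host/domain_tld, but no backfill for existing rows appears in the diff. A hypothetical migration sketch in the style of db.go's other pool.Exec calls, assuming the legacy columns still exist when it runs and that every stored tld value is a valid tld_enum label (neither is confirmed by the patch; the pgx major version is also assumed):

```go
package main

import (
	"context"

	"github.com/jackc/pgx/v5/pgxpool" // version assumed; go.mod is not shown
)

// backfillDomainColumns populates the new domain_host/domain_tld columns from
// the legacy source_host/tld columns, mirroring what stripTLD/getTLD do in Go.
func backfillDomainColumns(ctx context.Context, pool *pgxpool.Pool) error {
	_, err := pool.Exec(ctx, `
		UPDATE feeds
		SET domain_host = left(source_host, greatest(length(source_host) - length(tld) - 1, 0)),
		    domain_tld  = tld::tld_enum
		WHERE domain_host IS NULL
		  AND source_host IS NOT NULL
		  AND tld IS NOT NULL`)
	return err
}
```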