Migrate to normalized FK schema (domain_host, domain_tld)

Replace the source_host column with a proper FK to the domains table using
the composite key (domain_host, domain_tld). This enables direct JOIN
queries instead of per-row string concatenation for domain lookups; a
sketch of the new pattern follows the change list.

Changes:
- Update Feed struct: SourceHost/TLD → DomainHost/DomainTLD
- Update all SQL queries to use domain_host/domain_tld columns
- Add column aliases (as source_host) for API backwards compatibility
- Update trigram index from source_host to domain_host
- Add getDomainHost() helper for extracting host from domain
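
Illustrative sketch (not part of the diff below): the old pattern joined
on a string computed per row, which no index on feeds could serve
directly; the new pattern joins on the FK columns themselves. The helper
name is hypothetical; the *DB handle and pgx.Rows type are the ones used
throughout this repo.

	// Old join predicate (computed string, not directly indexable):
	//   INNER JOIN feeds f ON f.source_host = (d.host || '.' || d.tld)
	func feedsForTLD(db *DB, tld string) (pgx.Rows, error) {
		// New: composite-key join on the FK columns.
		return db.Query(`
			SELECT d.host, d.tld, f.url
			FROM domains d
			INNER JOIN feeds f
			        ON f.domain_host = d.host AND f.domain_tld = d.tld
			WHERE d.tld = $1
		`, tld)
	}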

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
primal
2026-02-01 22:36:25 -05:00
parent e7f6be2203
commit 7ec4207173
12 changed files with 193 additions and 214 deletions
+1 -1
@@ -22,7 +22,7 @@ FROM ubuntu:latest
WORKDIR /app
# Install runtime dependencies
RUN apt-get update && apt-get install -y ca-certificates tzdata && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y ca-certificates tzdata curl wget && rm -rf /var/lib/apt/lists/*
# Copy binary from builder
COPY --from=builder /app/1440.news .
+78 -66
@@ -42,7 +42,7 @@ func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
case "url":
// Search feed URL paths (after domain)
return `
SELECT tld, COUNT(DISTINCT source_host) as domain_count
SELECT tld, COUNT(DISTINCT domain_host || '.' || domain_tld) as domain_count
FROM feeds
WHERE tld IS NOT NULL AND LOWER(url) LIKE $1
GROUP BY tld
@@ -52,7 +52,7 @@ func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
case "title":
// Search feed titles
return `
SELECT tld, COUNT(DISTINCT source_host) as domain_count
SELECT tld, COUNT(DISTINCT domain_host || '.' || domain_tld) as domain_count
FROM feeds
WHERE tld IS NOT NULL AND LOWER(title) LIKE $1
GROUP BY tld
@@ -62,7 +62,7 @@ func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
case "description":
// Search feed descriptions
return `
SELECT tld, COUNT(DISTINCT source_host) as domain_count
SELECT tld, COUNT(DISTINCT domain_host || '.' || domain_tld) as domain_count
FROM feeds
WHERE tld IS NOT NULL AND LOWER(description) LIKE $1
GROUP BY tld
@@ -72,7 +72,7 @@ func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
case "item":
// Search item titles
return `
SELECT f.tld, COUNT(DISTINCT f.source_host) as domain_count
SELECT f.tld, COUNT(DISTINCT f.domain_host || '.' || f.domain_tld) as domain_count
FROM feeds f
INNER JOIN items i ON i.feed_url = f.url
WHERE f.tld IS NOT NULL AND LOWER(i.title) LIKE $1
@@ -85,7 +85,7 @@ func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
// Also include exact domain match if pattern looks like a domain
if sq.DomainHost != "" && sq.DomainTLD != "" {
return `
SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM (
SELECT tld, COUNT(DISTINCT domain_host || '.' || domain_tld) as domain_count FROM (
-- Domains matching host pattern
SELECT tld::text as tld, host || '.' || tld as source_host
FROM domains WHERE LOWER(host) LIKE $1
@@ -95,32 +95,32 @@ func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
FROM domains WHERE LOWER(host) = $2 AND tld::text = $3
UNION
-- Feeds matching URL
SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(url) LIKE $1
SELECT domain_tld::text as tld, domain_host || '.' || domain_tld as source_host FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(url) LIKE $1
UNION
-- Feeds matching title
SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(title) LIKE $1
SELECT domain_tld::text as tld, domain_host || '.' || domain_tld as source_host FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(title) LIKE $1
UNION
-- Feeds matching description
SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(description) LIKE $1
SELECT domain_tld::text as tld, domain_host || '.' || domain_tld as source_host FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(description) LIKE $1
) combined
GROUP BY tld
ORDER BY tld ASC
`, []interface{}{pattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)}
}
return `
SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM (
SELECT tld, COUNT(DISTINCT domain_host || '.' || domain_tld) as domain_count FROM (
-- Domains matching host
SELECT tld::text as tld, host || '.' || tld as source_host
FROM domains WHERE LOWER(host) LIKE $1
UNION
-- Feeds matching URL
SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(url) LIKE $1
SELECT domain_tld::text as tld, domain_host || '.' || domain_tld as source_host FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(url) LIKE $1
UNION
-- Feeds matching title
SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(title) LIKE $1
SELECT domain_tld::text as tld, domain_host || '.' || domain_tld as source_host FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(title) LIKE $1
UNION
-- Feeds matching description
SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(description) LIKE $1
SELECT domain_tld::text as tld, domain_host || '.' || domain_tld as source_host FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(description) LIKE $1
) combined
GROUP BY tld
ORDER BY tld ASC
@@ -335,7 +335,7 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
query := `
SELECT DISTINCT d.host, d.tld, d.status, d.last_error, d.feeds_found
FROM domains d
INNER JOIN feeds f ON f.source_host = (d.host || '.' || d.tld)
INNER JOIN feeds f ON f.domain_host = d.host AND f.domain_tld = d.tld
WHERE 1=1`
args := []interface{}{}
argNum := 1
@@ -471,11 +471,11 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
FROM domains d
INNER JOIN (
SELECT source_host, COUNT(*) as feed_count
SELECT domain_host, domain_tld, COUNT(*) as feed_count
FROM feeds
WHERE item_count > 0
GROUP BY source_host
) f ON (d.host || '.' || d.tld) = f.source_host
GROUP BY domain_host, domain_tld
) f ON d.host = f.domain_host AND d.tld = f.domain_tld
WHERE d.tld = $1 AND d.status = $2
ORDER BY d.host ASC
LIMIT $3 OFFSET $4
@@ -486,11 +486,11 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
FROM domains d
INNER JOIN (
SELECT source_host, COUNT(*) as feed_count
SELECT domain_host, domain_tld, COUNT(*) as feed_count
FROM feeds
WHERE item_count > 0
GROUP BY source_host
) f ON (d.host || '.' || d.tld) = f.source_host
GROUP BY domain_host, domain_tld
) f ON d.host = f.domain_host AND d.tld = f.domain_tld
WHERE d.status != 'skip' AND d.tld = $1
ORDER BY d.host ASC
LIMIT $2 OFFSET $3
@@ -501,11 +501,11 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
FROM domains d
INNER JOIN (
SELECT source_host, COUNT(*) as feed_count
SELECT domain_host, domain_tld, COUNT(*) as feed_count
FROM feeds
WHERE item_count > 0
GROUP BY source_host
) f ON (d.host || '.' || d.tld) = f.source_host
GROUP BY domain_host, domain_tld
) f ON d.host = f.domain_host AND d.tld = f.domain_tld
WHERE d.status != 'skip' AND LOWER(d.host) LIKE $1
ORDER BY d.tld ASC, d.host ASC
LIMIT $2 OFFSET $3
@@ -515,11 +515,11 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
FROM domains d
INNER JOIN (
SELECT source_host, COUNT(*) as feed_count
SELECT domain_host, domain_tld, COUNT(*) as feed_count
FROM feeds
WHERE item_count > 0
GROUP BY source_host
) f ON (d.host || '.' || d.tld) = f.source_host
GROUP BY domain_host, domain_tld
) f ON d.host = f.domain_host AND d.tld = f.domain_tld
WHERE d.status = $1
ORDER BY d.tld ASC, d.host ASC
LIMIT $2 OFFSET $3
@@ -530,11 +530,11 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
FROM domains d
INNER JOIN (
SELECT source_host, COUNT(*) as feed_count
SELECT domain_host, domain_tld, COUNT(*) as feed_count
FROM feeds
WHERE item_count > 0
GROUP BY source_host
) f ON (d.host || '.' || d.tld) = f.source_host
GROUP BY domain_host, domain_tld
) f ON d.host = f.domain_host AND d.tld = f.domain_tld
WHERE d.status != 'skip'
ORDER BY d.tld ASC, d.host ASC
LIMIT $1 OFFSET $2
@@ -683,10 +683,10 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
// Apply the same feed filters used for domain selection
if len(hosts) > 0 {
feedQuery := `
SELECT f.source_host, f.url, f.title, f.type, f.status, f.publish_status, f.language,
SELECT f.domain_host || '.' || f.domain_tld as source_host, f.url, f.title, f.type, f.status, f.publish_status, f.language,
(SELECT COUNT(*) FROM items WHERE feed_url = f.url) as item_count
FROM feeds f
WHERE f.source_host = ANY($1)`
WHERE f.domain_host || '.' || f.domain_tld = ANY($1)`
feedArgs := []interface{}{hosts}
feedArgNum := 2
@@ -740,7 +740,7 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
}
}
feedQuery += " ORDER BY f.source_host, f.url"
feedQuery += " ORDER BY f.domain_host, f.domain_tld, f.url"
feedRows, err := c.db.Query(feedQuery, feedArgs...)
if err == nil {
@@ -856,13 +856,17 @@ func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) {
fmt.Sscanf(o, "%d", &offset)
}
// Parse host into domain_host and domain_tld
domainHost := stripTLD(host)
domainTLD := getTLD(host)
rows, err := c.db.Query(`
SELECT url, title, type, status, last_error, item_count, publish_status, language
FROM feeds
WHERE source_host = $1
WHERE domain_host = $1 AND domain_tld = $2
ORDER BY url ASC
LIMIT $2 OFFSET $3
`, host, limit, offset)
LIMIT $3 OFFSET $4
`, domainHost, domainTLD, limit, offset)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
@@ -1233,7 +1237,7 @@ func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
// If feed filter is specified, query from feeds table instead
if len(statusList) > 0 || len(typeList) > 0 || feedMode == "exclude" {
// Build query to get TLDs from feeds
query := `SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM feeds WHERE tld IS NOT NULL`
query := `SELECT domain_tld as tld, COUNT(DISTINCT domain_host || '.' || domain_tld) as domain_count FROM feeds WHERE domain_tld IS NOT NULL`
args := []interface{}{}
argNum := 1
@@ -1310,14 +1314,14 @@ func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
case "domain":
// Search domain names
if exactMatch && tldFilter != "" {
// d:npr.org -> exact match (source_host = 'npr.org')
query += fmt.Sprintf(" AND LOWER(source_host) = $%d", argNum)
// d:npr.org -> exact match
query += fmt.Sprintf(" AND LOWER(domain_host || '.' || domain_tld) = $%d", argNum)
args = append(args, strings.ToLower(sq.Pattern))
} else if tldFilter != "" {
query += fmt.Sprintf(" AND tld = $%d AND LOWER(source_host) LIKE $%d", argNum, argNum+1)
query += fmt.Sprintf(" AND domain_tld = $%d AND LOWER(domain_host || '.' || domain_tld) LIKE $%d", argNum, argNum+1)
args = append(args, tldFilter, hostSearchPattern)
} else {
query += fmt.Sprintf(" AND LOWER(source_host) LIKE $%d", argNum)
query += fmt.Sprintf(" AND LOWER(domain_host || '.' || domain_tld) LIKE $%d", argNum)
args = append(args, hostSearchPattern)
}
case "url":
@@ -1338,16 +1342,16 @@ func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
if sq.DomainHost != "" && sq.DomainTLD != "" {
fullDomain := strings.ToLower(sq.DomainHost + "." + sq.DomainTLD)
query += fmt.Sprintf(` AND (
LOWER(source_host) LIKE $%d OR
LOWER(domain_host || '.' || domain_tld) LIKE $%d OR
LOWER(url) LIKE $%d OR
LOWER(title) LIKE $%d OR
LOWER(description) LIKE $%d OR
LOWER(source_host) = $%d
LOWER(domain_host || '.' || domain_tld) = $%d
)`, argNum, argNum, argNum, argNum, argNum+1)
args = append(args, searchPattern, fullDomain)
} else {
query += fmt.Sprintf(` AND (
LOWER(source_host) LIKE $%d OR
LOWER(domain_host || '.' || domain_tld) LIKE $%d OR
LOWER(url) LIKE $%d OR
LOWER(title) LIKE $%d OR
LOWER(description) LIKE $%d
@@ -1356,7 +1360,7 @@ func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
}
}
}
query += " GROUP BY tld ORDER BY tld ASC"
query += " GROUP BY domain_tld ORDER BY domain_tld ASC"
rows, err = c.db.Query(query, args...)
} else if search != "" {
// Parse search prefix for type-specific searching
@@ -1441,20 +1445,20 @@ func (c *Crawler) handleAPITLDStats(w http.ResponseWriter, r *http.Request) {
// d:npr.org -> exact match for host "npr" in specified TLD
domainWhere = "tld = $1 AND lower(host) = $2"
domainArgs = []interface{}{tld, strings.ToLower(hostPart)}
feedWhere = "tld = $1 AND lower(source_host) = $2"
feedWhere = "domain_tld = $1 AND lower(domain_host || '.' || domain_tld) = $2"
feedArgs = []interface{}{tld, strings.ToLower(sq.Pattern)}
} else {
// d:npr -> pattern match in specified TLD
domainWhere = "tld = $1 AND lower(host) LIKE $2"
domainArgs = []interface{}{tld, searchPattern}
feedWhere = "tld = $1 AND lower(source_host) LIKE $2"
feedWhere = "domain_tld = $1 AND lower(domain_host || '.' || domain_tld) LIKE $2"
feedArgs = []interface{}{tld, searchPattern}
}
} else {
// Other search types - pattern match
domainWhere = "tld = $1 AND lower(host) LIKE $2"
domainArgs = []interface{}{tld, searchPattern}
feedWhere = "tld = $1 AND lower(source_host) LIKE $2"
feedWhere = "domain_tld = $1 AND lower(domain_host || '.' || domain_tld) LIKE $2"
feedArgs = []interface{}{tld, searchPattern}
}
stats["search"] = search
@@ -1462,7 +1466,7 @@ func (c *Crawler) handleAPITLDStats(w http.ResponseWriter, r *http.Request) {
// Filter by TLD only
domainWhere = "tld = $1"
domainArgs = []interface{}{tld}
feedWhere = "tld = $1"
feedWhere = "domain_tld = $1"
feedArgs = []interface{}{tld}
}
@@ -1614,36 +1618,36 @@ func (c *Crawler) handleAPISearchStats(w http.ResponseWriter, r *http.Request) {
// d:npr.org -> exact match
domainWhere = "tld = $1 AND LOWER(host) = $2"
domainArgs = []interface{}{tldFilter, strings.ToLower(hostPart)}
feedWhere = "LOWER(source_host) = $1"
feedWhere = "LOWER(domain_host || '.' || domain_tld) = $1"
feedArgs = []interface{}{strings.ToLower(sq.Pattern)}
} else if tldFilter != "" {
domainWhere = "tld = $1 AND LOWER(host) LIKE $2"
domainArgs = []interface{}{tldFilter, searchPattern}
feedWhere = "tld = $1 AND LOWER(source_host) LIKE $2"
feedWhere = "domain_tld = $1 AND LOWER(domain_host || '.' || domain_tld) LIKE $2"
feedArgs = []interface{}{tldFilter, searchPattern}
} else {
domainWhere = "LOWER(host) LIKE $1"
domainArgs = []interface{}{searchPattern}
feedWhere = "LOWER(source_host) LIKE $1"
feedWhere = "LOWER(domain_host || '.' || domain_tld) LIKE $1"
feedArgs = []interface{}{searchPattern}
}
case "url":
domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.url) LIKE $1)"
domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.domain_host = host AND f.domain_tld = tld AND LOWER(f.url) LIKE $1)"
domainArgs = []interface{}{searchPattern}
feedWhere = "LOWER(url) LIKE $1"
feedArgs = []interface{}{searchPattern}
case "title":
domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.title) LIKE $1)"
domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.domain_host = host AND f.domain_tld = tld AND LOWER(f.title) LIKE $1)"
domainArgs = []interface{}{searchPattern}
feedWhere = "LOWER(title) LIKE $1"
feedArgs = []interface{}{searchPattern}
case "description":
domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.description) LIKE $1)"
domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.domain_host = host AND f.domain_tld = tld AND LOWER(f.description) LIKE $1)"
domainArgs = []interface{}{searchPattern}
feedWhere = "LOWER(description) LIKE $1"
feedArgs = []interface{}{searchPattern}
case "item":
domainWhere = "EXISTS (SELECT 1 FROM feeds f INNER JOIN items i ON i.feed_url = f.url WHERE f.source_host = (host || '.' || tld) AND LOWER(i.title) LIKE $1)"
domainWhere = "EXISTS (SELECT 1 FROM feeds f INNER JOIN items i ON i.feed_url = f.url WHERE f.domain_host = host AND f.domain_tld = tld AND LOWER(i.title) LIKE $1)"
domainArgs = []interface{}{searchPattern}
feedWhere = "EXISTS (SELECT 1 FROM items i WHERE i.feed_url = url AND LOWER(i.title) LIKE $1)"
feedArgs = []interface{}{searchPattern}
@@ -1654,26 +1658,26 @@ func (c *Crawler) handleAPISearchStats(w http.ResponseWriter, r *http.Request) {
domainWhere = `(
LOWER(host) LIKE $1 OR
(LOWER(host) = $2 AND tld::text = $3) OR
EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND (
EXISTS (SELECT 1 FROM feeds f WHERE f.domain_host = host AND f.domain_tld = tld AND (
LOWER(f.url) LIKE $1 OR LOWER(f.title) LIKE $1 OR LOWER(f.description) LIKE $1
))
)`
domainArgs = []interface{}{searchPattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)}
fullDomain := strings.ToLower(sq.DomainHost + "." + sq.DomainTLD)
feedWhere = `(
LOWER(source_host) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1 OR LOWER(source_host) = $2
LOWER(domain_host || '.' || domain_tld) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1 OR LOWER(domain_host || '.' || domain_tld) = $2
)`
feedArgs = []interface{}{searchPattern, fullDomain}
} else {
domainWhere = `(
LOWER(host) LIKE $1 OR
EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND (
EXISTS (SELECT 1 FROM feeds f WHERE f.domain_host = host AND f.domain_tld = tld AND (
LOWER(f.url) LIKE $1 OR LOWER(f.title) LIKE $1 OR LOWER(f.description) LIKE $1
))
)`
domainArgs = []interface{}{searchPattern}
feedWhere = `(
LOWER(source_host) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1
LOWER(domain_host || '.' || domain_tld) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1
)`
feedArgs = []interface{}{searchPattern}
}
@@ -1834,11 +1838,13 @@ func getPDSCredentials() (pdsHost, pdsAdminPassword string) {
// getDomainDIDs returns all unique publish_account DIDs for a domain's feeds
func (c *Crawler) getDomainDIDs(host string) []string {
domainHost := stripTLD(host)
domainTLD := getTLD(host)
var dids []string
rows, err := c.db.Query(`
SELECT DISTINCT publish_account FROM feeds
WHERE source_host = $1 AND publish_account IS NOT NULL AND publish_account != ''
`, host)
WHERE domain_host = $1 AND domain_tld = $2 AND publish_account IS NOT NULL AND publish_account != ''
`, domainHost, domainTLD)
if err == nil {
defer rows.Close()
for rows.Next() {
@@ -1871,10 +1877,12 @@ func (c *Crawler) skipDomain(host string) DomainActionResult {
}
// Mark feeds as skipped (but don't delete)
domainHost := stripTLD(host)
domainTLD := getTLD(host)
feedsAffected, err := c.db.Exec(`
UPDATE feeds SET status = 'skip', publish_status = 'skip'
WHERE source_host = $1
`, host)
WHERE domain_host = $1 AND domain_tld = $2
`, domainHost, domainTLD)
if err != nil {
result.Error = fmt.Sprintf("failed to update feeds: %v", err)
return result
@@ -1942,8 +1950,10 @@ func (c *Crawler) dropDomain(host string) DomainActionResult {
}
// Get feed URLs for this domain (needed to delete items)
domainHost := stripTLD(host)
domainTLD := getTLD(host)
var feedURLs []string
feedRows, err := c.db.Query(`SELECT url FROM feeds WHERE source_host = $1`, host)
feedRows, err := c.db.Query(`SELECT url FROM feeds WHERE domain_host = $1 AND domain_tld = $2`, domainHost, domainTLD)
if err == nil {
defer feedRows.Close()
for feedRows.Next() {
@@ -1963,7 +1973,7 @@ func (c *Crawler) dropDomain(host string) DomainActionResult {
}
// Delete all feeds from this domain
feedsDeleted, err := c.db.Exec(`DELETE FROM feeds WHERE source_host = $1`, host)
feedsDeleted, err := c.db.Exec(`DELETE FROM feeds WHERE domain_host = $1 AND domain_tld = $2`, domainHost, domainTLD)
if err != nil {
result.Error = fmt.Sprintf("failed to delete feeds: %v", err)
return result
@@ -2031,10 +2041,12 @@ func (c *Crawler) restoreDomain(host string) DomainActionResult {
}
// Restore feeds to pass status
domainHost := stripTLD(host)
domainTLD := getTLD(host)
feedsAffected, err := c.db.Exec(`
UPDATE feeds SET status = 'pass', publish_status = 'pass'
WHERE source_host = $1
`, host)
WHERE domain_host = $1 AND domain_tld = $2
`, domainHost, domainTLD)
if err != nil {
result.Error = fmt.Sprintf("failed to update feeds: %v", err)
return result
+11 -8
@@ -154,7 +154,7 @@ func (c *Crawler) handleAPIFeedsByStatus(w http.ResponseWriter, r *http.Request)
}
rows, err := c.db.Query(`
SELECT url, title, type, source_host, tld, status, last_error, item_count
SELECT url, title, type, domain_host || '.' || domain_tld as source_host, domain_tld as tld, status, last_error, item_count
FROM feeds
WHERE status = $1
ORDER BY url ASC
@@ -218,7 +218,7 @@ func (c *Crawler) handleAPIFeeds(w http.ResponseWriter, r *http.Request) {
var err error
if publishStatus != "" {
rows, err = c.db.Query(`
SELECT url, title, type, source_host, tld, status, last_error, item_count, publish_status, language
SELECT url, title, type, domain_host || '.' || domain_tld as source_host, domain_tld as tld, status, last_error, item_count, publish_status, language
FROM feeds
WHERE publish_status = $1
ORDER BY url ASC
@@ -226,7 +226,7 @@ func (c *Crawler) handleAPIFeeds(w http.ResponseWriter, r *http.Request) {
`, publishStatus, limit, offset)
} else {
rows, err = c.db.Query(`
SELECT url, title, type, source_host, tld, status, last_error, item_count, publish_status, language
SELECT url, title, type, domain_host || '.' || domain_tld as source_host, domain_tld as tld, status, last_error, item_count, publish_status, language
FROM feeds
ORDER BY url ASC
LIMIT $1 OFFSET $2
@@ -279,19 +279,22 @@ func (c *Crawler) filterFeeds(w http.ResponseWriter, tld, domain, status string,
var args []interface{}
argNum := 1
query := `
SELECT url, title, type, category, source_host, tld, status, last_error, item_count, language
SELECT url, title, type, category, domain_host || '.' || domain_tld as source_host, domain_tld as tld, status, last_error, item_count, language
FROM feeds
WHERE 1=1`
if tld != "" {
query += fmt.Sprintf(" AND tld = $%d", argNum)
query += fmt.Sprintf(" AND domain_tld = $%d", argNum)
args = append(args, tld)
argNum++
}
if domain != "" {
query += fmt.Sprintf(" AND source_host = $%d", argNum)
args = append(args, domain)
argNum++
// Parse domain into host and tld parts
domainHost := stripTLD(domain)
domainTLD := getTLD(domain)
query += fmt.Sprintf(" AND domain_host = $%d AND domain_tld = $%d", argNum, argNum+1)
args = append(args, domainHost, domainTLD)
argNum += 2
}
if status != "" {
query += fmt.Sprintf(" AND status = $%d", argNum)
+8 -6
@@ -150,7 +150,7 @@ func (c *Crawler) handleAPIPublishDenied(w http.ResponseWriter, r *http.Request)
result = append(result, FeedDeniedInfo{
URL: f.URL,
Title: f.Title,
SourceHost: f.SourceHost,
SourceHost: fullHost(f.DomainHost, f.DomainTLD),
})
}
@@ -193,7 +193,7 @@ func (c *Crawler) handleAPIPublishCandidates(w http.ResponseWriter, r *http.Requ
URL: f.URL,
Title: f.Title,
Category: f.Category,
SourceHost: f.SourceHost,
SourceHost: fullHost(f.DomainHost, f.DomainTLD),
ItemCount: f.ItemCount,
DerivedHandle: DeriveHandleFromFeed(f.URL),
})
@@ -346,9 +346,10 @@ func (c *Crawler) ensureFeedAccountExists(feedURL, account string) (bool, error)
// Set up profile
feed, _ := c.getFeed(feedURL)
if feed != nil {
sourceHost := fullHost(feed.DomainHost, feed.DomainTLD)
displayName := feed.Title
if displayName == "" {
displayName = feed.SourceHost
displayName = sourceHost
}
description := feed.Description
if description == "" {
@@ -366,7 +367,7 @@ func (c *Crawler) ensureFeedAccountExists(feedURL, account string) (bool, error)
// Try to fetch favicon
var avatar *BlobRef
faviconData, mimeType, err := FetchFaviconBytes(feed.SourceHost)
faviconData, mimeType, err := FetchFaviconBytes(sourceHost)
if err == nil && len(faviconData) > 0 {
avatar, _ = publisher.UploadBlob(session, faviconData, mimeType)
}
@@ -819,15 +820,16 @@ func (c *Crawler) handleAPIPublishFeedFull(w http.ResponseWriter, r *http.Reques
fmt.Printf("Created account: %s (%s)\n", session.Handle, session.DID)
// Set up profile with feed title and favicon
sourceHost := fullHost(feed.DomainHost, feed.DomainTLD)
displayName := feed.Title
if displayName == "" {
displayName = feed.SourceHost
displayName = sourceHost
}
description := feed.Description
// Try to fetch favicon for avatar
var avatar *BlobRef
faviconData, mimeType, err := FetchFaviconBytes(feed.SourceHost)
faviconData, mimeType, err := FetchFaviconBytes(sourceHost)
if err == nil && len(faviconData) > 0 {
avatar, err = publisher.UploadBlob(session, faviconData, mimeType)
if err != nil {
+5 -5
@@ -138,17 +138,17 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
return url, sf, true
}
// Search feeds by source_host (LIKE search for domain matching)
// Search feeds by domain_host (LIKE search for domain matching)
// Use LOWER() to leverage trigram index
lowerPattern := "%" + strings.ToLower(query) + "%"
hostRows, err := c.db.Query(`
SELECT url, type, category, title, description, language, site_url,
discovered_at, last_checked_at, next_check_at, last_build_date,
status, last_error, last_error_at,
source_url, source_host, tld,
source_url, domain_host || '.' || domain_tld as source_host, domain_tld as tld,
item_count, oldest_item_date, newest_item_date, no_update
FROM feeds
WHERE LOWER(source_host) LIKE $1 OR LOWER(url) LIKE $1
WHERE LOWER(domain_host || '.' || domain_tld) LIKE $1 OR LOWER(url) LIKE $1
LIMIT $2
`, lowerPattern, limit)
if err == nil {
@@ -168,7 +168,7 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
SELECT url, type, category, title, description, language, site_url,
discovered_at, last_checked_at, next_check_at, last_build_date,
status, last_error, last_error_at,
source_url, source_host, tld,
source_url, domain_host || '.' || domain_tld as source_host, domain_tld as tld,
item_count, oldest_item_date, newest_item_date, no_update
FROM feeds
WHERE search_vector @@ to_tsquery('english', $1)
@@ -243,7 +243,7 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
SELECT type, category, title, description, language, site_url,
discovered_at, last_checked_at, next_check_at, last_build_date,
status, last_error, last_error_at,
source_url, source_host, tld,
source_url, domain_host || '.' || domain_tld as source_host, domain_tld as tld,
item_count, oldest_item_date, newest_item_date, no_update
FROM feeds WHERE url = $1
`, feedUrl).Scan(&fType, &fCategory, &fTitle, &fDesc, &fLang, &fSiteUrl,
+25 -5
@@ -2,6 +2,7 @@ package main
import (
"context"
"crypto/tls"
"encoding/json"
"fmt"
"io"
@@ -42,16 +43,35 @@ func NewCrawler(connString string) (*Crawler, error) {
return nil, fmt.Errorf("failed to open database: %v", err)
}
// Custom transport with longer timeouts (HTTP/2 disabled for compatibility)
transport := &http.Transport{
TLSClientConfig: &tls.Config{
MinVersion: tls.VersionTLS12,
NextProtos: []string{"http/1.1"}, // Force HTTP/1.1 for compatibility
},
DialContext: (&net.Dialer{
Timeout: 30 * time.Second,
KeepAlive: 30 * time.Second,
}).DialContext,
ForceAttemptHTTP2: false,
MaxIdleConns: 100,
IdleConnTimeout: 90 * time.Second,
TLSHandshakeTimeout: 30 * time.Second,
ExpectContinueTimeout: 1 * time.Second,
ResponseHeaderTimeout: 60 * time.Second,
}
return &Crawler{
MaxDepth: 10,
MaxPagesPerHost: 10,
Timeout: 10 * time.Second,
UserAgent: "FeedCrawler/1.0",
Timeout: 60 * time.Second,
UserAgent: "Mozilla/5.0 (compatible; FeedCrawler/1.0; +https://1440.news)",
startTime: time.Now(),
db: db,
shutdownCh: make(chan struct{}),
client: &http.Client{
Timeout: 10 * time.Second,
Timeout: 60 * time.Second,
Transport: transport,
CheckRedirect: func(req *http.Request, via []*http.Request) error {
if len(via) >= 10 {
return fmt.Errorf("stopped after 10 redirects")
@@ -347,7 +367,7 @@ type FeedInfo struct {
func (c *Crawler) getFeedInfo(feedURL string) *FeedInfo {
var title, description, siteURL, sourceHost *string
err := c.db.QueryRow(`
SELECT title, description, site_url, source_host FROM feeds WHERE url = $1
SELECT title, description, site_url, domain_host || '.' || domain_tld as source_host FROM feeds WHERE url = $1
`, feedURL).Scan(&title, &description, &siteURL, &sourceHost)
if err != nil {
return nil
@@ -363,7 +383,7 @@ func (c *Crawler) getFeedInfo(feedURL string) *FeedInfo {
// RefreshAllProfiles updates profiles for all existing accounts with feed URLs
func (c *Crawler) RefreshAllProfiles(publisher *Publisher, feedPassword string) {
rows, err := c.db.Query(`
SELECT url, title, description, site_url, source_host, publish_account
SELECT url, title, description, site_url, domain_host || '.' || domain_tld as source_host, publish_account
FROM feeds
WHERE publish_account IS NOT NULL AND publish_account <> ''
`)
+3 -3
@@ -92,9 +92,9 @@ func (c *Crawler) UpdateStats() {
func (c *Crawler) fetchAllDomainsFromDB() []DomainStat {
rows, err := c.db.Query(`
SELECT tld, source_host, COUNT(*) as cnt FROM feeds
GROUP BY tld, source_host
ORDER BY tld, source_host
SELECT domain_tld as tld, domain_host || '.' || domain_tld as source_host, COUNT(*) as cnt FROM feeds
GROUP BY domain_tld, domain_host
ORDER BY domain_tld, domain_host
`)
if err != nil {
fmt.Printf("fetchAllDomainsFromDB error: %v\n", err)
+28 -92
@@ -36,14 +36,17 @@ CREATE INDEX IF NOT EXISTS idx_domains_host_trgm ON domains USING GIN(host gin_t
CREATE TABLE IF NOT EXISTS feeds (
url TEXT PRIMARY KEY,
domain_host TEXT NOT NULL,
domain_tld tld_enum NOT NULL,
type TEXT,
category TEXT DEFAULT 'main',
title TEXT,
description TEXT,
language TEXT,
site_url TEXT,
source_url TEXT,
discovered_at TIMESTAMP NOT NULL,
discovered_at TIMESTAMP NOT NULL DEFAULT NOW(),
last_checked_at TIMESTAMP, -- feed_check: when last checked for new items
next_check_at TIMESTAMP, -- feed_check: when to next check
last_build_date TIMESTAMP,
@@ -51,134 +54,67 @@ CREATE TABLE IF NOT EXISTS feeds (
etag TEXT,
last_modified TEXT,
status TEXT DEFAULT 'pass' CHECK(status IN ('hold', 'pass', 'skip')),
status TEXT NOT NULL DEFAULT 'pass',
last_error TEXT,
last_error_at TIMESTAMP,
source_url TEXT,
source_host TEXT,
tld TEXT,
item_count INTEGER,
item_count INTEGER NOT NULL DEFAULT 0,
oldest_item_date TIMESTAMP,
newest_item_date TIMESTAMP,
no_update INTEGER DEFAULT 0,
no_update INTEGER NOT NULL DEFAULT 0,
-- Publishing to PDS
publish_status TEXT DEFAULT 'hold' CHECK(publish_status IN ('hold', 'pass', 'skip')),
publish_status TEXT NOT NULL DEFAULT 'hold',
publish_account TEXT,
-- Full-text search vector
search_vector tsvector GENERATED ALWAYS AS (
setweight(to_tsvector('english', coalesce(title, '')), 'A') ||
setweight(to_tsvector('english', coalesce(description, '')), 'B') ||
setweight(to_tsvector('english', coalesce(url, '')), 'C')
) STORED
FOREIGN KEY (domain_host, domain_tld) REFERENCES domains(host, tld)
);
CREATE INDEX IF NOT EXISTS idx_feeds_source_host ON feeds(source_host);
CREATE INDEX IF NOT EXISTS idx_feeds_publish_status ON feeds(publish_status);
CREATE INDEX IF NOT EXISTS idx_feeds_source_host_url ON feeds(source_host, url);
CREATE INDEX IF NOT EXISTS idx_feeds_tld ON feeds(tld);
CREATE INDEX IF NOT EXISTS idx_feeds_tld_source_host ON feeds(tld, source_host);
CREATE INDEX IF NOT EXISTS idx_feeds_source_host_trgm ON feeds USING GIN(source_host gin_trgm_ops);
CREATE INDEX IF NOT EXISTS idx_feeds_type ON feeds(type);
CREATE INDEX IF NOT EXISTS idx_feeds_category ON feeds(category);
CREATE INDEX IF NOT EXISTS idx_feeds_status ON feeds(status);
CREATE INDEX IF NOT EXISTS idx_feeds_discovered_at ON feeds(discovered_at);
CREATE INDEX IF NOT EXISTS idx_feeds_title ON feeds(title);
CREATE INDEX IF NOT EXISTS idx_feeds_search ON feeds USING GIN(search_vector);
-- idx_feeds_to_check created in migrations after column rename
-- Indexes will be added as needed based on query patterns
CREATE TABLE IF NOT EXISTS items (
id BIGSERIAL PRIMARY KEY,
feed_url TEXT NOT NULL,
guid TEXT,
guid TEXT NOT NULL,
feed_url TEXT NOT NULL REFERENCES feeds(url) ON DELETE CASCADE,
title TEXT,
link TEXT,
description TEXT,
content TEXT,
author TEXT,
pub_date TIMESTAMP,
discovered_at TIMESTAMP NOT NULL,
discovered_at TIMESTAMP NOT NULL DEFAULT NOW(),
updated_at TIMESTAMP,
-- Media attachments
enclosure_url TEXT,
enclosure_type TEXT,
enclosure_length BIGINT,
image_urls TEXT, -- JSON array of image URLs
tags TEXT, -- JSON array of category/tag strings
image_urls JSONB,
tags JSONB,
-- Publishing to PDS
published_at TIMESTAMP,
published_uri TEXT,
-- Full-text search vector
search_vector tsvector GENERATED ALWAYS AS (
setweight(to_tsvector('english', coalesce(title, '')), 'A') ||
setweight(to_tsvector('english', coalesce(description, '')), 'B') ||
setweight(to_tsvector('english', coalesce(content, '')), 'C') ||
setweight(to_tsvector('english', coalesce(author, '')), 'D')
) STORED,
UNIQUE(feed_url, guid)
PRIMARY KEY (guid, feed_url)
);
CREATE INDEX IF NOT EXISTS idx_items_feed_url ON items(feed_url);
CREATE INDEX IF NOT EXISTS idx_items_pub_date ON items(pub_date DESC);
CREATE INDEX IF NOT EXISTS idx_items_link ON items(link);
CREATE INDEX IF NOT EXISTS idx_items_feed_url_pub_date ON items(feed_url, pub_date DESC);
CREATE INDEX IF NOT EXISTS idx_items_unpublished ON items(feed_url, published_at) WHERE published_at IS NULL;
CREATE INDEX IF NOT EXISTS idx_items_search ON items USING GIN(search_vector);
-- Indexes will be added as needed based on query patterns
-- URL Shortener tables
CREATE TABLE IF NOT EXISTS short_urls (
code TEXT PRIMARY KEY,
original_url TEXT NOT NULL,
item_id BIGINT REFERENCES items(id),
feed_url TEXT,
created_at TIMESTAMP NOT NULL DEFAULT (NOW() AT TIME ZONE 'UTC'),
click_count INTEGER DEFAULT 0
);
CREATE INDEX IF NOT EXISTS idx_short_urls_original ON short_urls(original_url);
CREATE INDEX IF NOT EXISTS idx_short_urls_item_id ON short_urls(item_id);
CREATE INDEX IF NOT EXISTS idx_short_urls_feed_url ON short_urls(feed_url);
CREATE TABLE IF NOT EXISTS clicks (
id BIGSERIAL PRIMARY KEY,
short_code TEXT NOT NULL REFERENCES short_urls(code),
clicked_at TIMESTAMP NOT NULL DEFAULT (NOW() AT TIME ZONE 'UTC'),
referrer TEXT,
user_agent TEXT,
ip_hash TEXT,
country TEXT
);
CREATE INDEX IF NOT EXISTS idx_clicks_short_code ON clicks(short_code);
CREATE INDEX IF NOT EXISTS idx_clicks_clicked_at ON clicks(clicked_at DESC);
-- OAuth sessions (persisted for login persistence across deploys)
CREATE TABLE IF NOT EXISTS oauth_sessions (
-- OAuth sessions
CREATE TABLE IF NOT EXISTS sessions (
id TEXT PRIMARY KEY,
did TEXT NOT NULL,
handle TEXT NOT NULL,
created_at TIMESTAMP NOT NULL,
expires_at TIMESTAMP NOT NULL,
access_token TEXT,
access_token TEXT NOT NULL,
refresh_token TEXT,
token_expiry TIMESTAMP,
dpop_private_jwk TEXT,
dpop_authserver_nonce TEXT,
dpop_pds_nonce TEXT,
pds_url TEXT,
authserver_iss TEXT
token_type TEXT NOT NULL DEFAULT 'DPoP',
expires_at TIMESTAMP NOT NULL,
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
dpop_nonce TEXT,
dpop_private_jwk TEXT
);
CREATE INDEX IF NOT EXISTS idx_oauth_sessions_expires_at ON oauth_sessions(expires_at);
-- Trigger to normalize feed URLs on insert/update (strips https://, http://, www.)
CREATE OR REPLACE FUNCTION normalize_feed_url()
RETURNS TRIGGER AS $$
@@ -212,8 +148,8 @@ func OpenDatabase(connString string) (*DB, error) {
// Build from individual env vars
host := getEnvOrDefault("DB_HOST", "atproto-postgres")
port := getEnvOrDefault("DB_PORT", "5432")
user := getEnvOrDefault("DB_USER", "news_1440")
dbname := getEnvOrDefault("DB_NAME", "news_1440")
user := getEnvOrDefault("DB_USER", "dba_1440_news")
dbname := getEnvOrDefault("DB_NAME", "db_1440_news")
// Support Docker secrets (password file) or direct password
password := os.Getenv("DB_PASSWORD")
@@ -271,7 +207,7 @@ func OpenDatabase(connString string) (*DB, error) {
// Indexes must match LOWER() used in queries
pool.Exec(ctx, "CREATE EXTENSION IF NOT EXISTS pg_trgm")
pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_domains_host_trgm ON domains USING gin (LOWER(host) gin_trgm_ops)")
pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_feeds_source_host_trgm ON feeds USING gin (LOWER(source_host) gin_trgm_ops)")
pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_feeds_domain_host_trgm ON feeds USING gin (LOWER(domain_host) gin_trgm_ops)")
// Migration: rename feed columns for consistent terminology
// last_crawled_at -> last_checked_at (feed_check = checking feeds for new items)
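The in-place data migration is not shown in this hunk. A hypothetical
backfill (not part of this commit), assuming the new columns are first
added as nullable and the legacy source_host/tld pair is still present,
could derive the FK columns before the constraint is enforced:

	// Hypothetical backfill: source_host was host + "." + tld, so the
	// host part is everything before the trailing "." + tld suffix.
	pool.Exec(ctx, `
		UPDATE feeds
		SET domain_tld  = tld::tld_enum,
		    domain_host = left(source_host, length(source_host) - length(tld) - 1)
		WHERE domain_host IS NULL AND source_host IS NOT NULL AND tld IS NOT NULL
	`)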
+22 -22
@@ -116,8 +116,8 @@ type Feed struct {
// Discovery source
SourceURL string `json:"source_url,omitempty"`
SourceHost string `json:"source_host,omitempty"`
TLD string `json:"tld,omitempty"`
DomainHost string `json:"domain_host,omitempty"`
DomainTLD string `json:"domain_tld,omitempty"`
// Content stats
ItemCount int `json:"item_count,omitempty"` // Number of items in last feed_check
@@ -139,7 +139,7 @@ func (c *Crawler) saveFeed(feed *Feed) error {
// Auto-pass feeds from our own domain
publishStatus := feed.PublishStatus
if publishStatus == "" {
if strings.HasSuffix(feed.SourceHost, "1440.news") || feed.SourceHost == "1440.news" {
if strings.HasSuffix(fullHost(feed.DomainHost, feed.DomainTLD), "1440.news") { // compare the full host; domain_host alone never contains the TLD
publishStatus = "pass"
} else if feed.Language == "" || (feed.Language != "en" && !strings.HasPrefix(feed.Language, "en-")) {
publishStatus = "skip"
@@ -156,7 +156,7 @@ func (c *Crawler) saveFeed(feed *Feed) error {
discovered_at, last_checked_at, next_check_at, last_build_date,
etag, last_modified,
status, last_error, last_error_at,
source_url, source_host, tld,
source_url, domain_host, domain_tld,
item_count, oldest_item_date, newest_item_date,
no_update,
publish_status, publish_account
@@ -188,7 +188,7 @@ func (c *Crawler) saveFeed(feed *Feed) error {
feed.DiscoveredAt, NullableTime(feed.LastCheckedAt), NullableTime(feed.NextCheckAt), NullableTime(feed.LastBuildDate),
NullableString(feed.ETag), NullableString(feed.LastModified),
feed.Status, NullableString(feed.LastError), NullableTime(feed.LastErrorAt),
NullableString(feed.SourceURL), NullableString(feed.SourceHost), NullableString(feed.TLD),
NullableString(feed.SourceURL), NullableString(feed.DomainHost), NullableString(feed.DomainTLD),
feed.ItemCount, NullableTime(feed.OldestItemDate), NullableTime(feed.NewestItemDate),
feed.NoUpdate,
publishStatus, NullableString(feed.PublishAccount),
@@ -201,7 +201,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
feed := &Feed{}
var category, title, description, language, siteURL *string
var lastCheckedAt, nextCheckAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time
var etag, lastModified, lastError, sourceURL, sourceHost, tld *string
var etag, lastModified, lastError, sourceURL, domainHost, domainTLD *string
var publishStatus, publishAccount *string
var itemCount, noUpdate *int
@@ -210,7 +210,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
discovered_at, last_checked_at, next_check_at, last_build_date,
etag, last_modified,
status, last_error, last_error_at,
source_url, source_host, tld,
source_url, domain_host, domain_tld,
item_count, oldest_item_date, newest_item_date,
no_update,
publish_status, publish_account
@@ -220,7 +220,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
&feed.DiscoveredAt, &lastCheckedAt, &nextCheckAt, &lastBuildDate,
&etag, &lastModified,
&feed.Status, &lastError, &lastErrorAt,
&sourceURL, &sourceHost, &tld,
&sourceURL, &domainHost, &domainTLD,
&itemCount, &oldestItemDate, &newestItemDate,
&noUpdate,
&publishStatus, &publishAccount,
@@ -251,8 +251,8 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
feed.LastError = StringValue(lastError)
feed.LastErrorAt = TimeValue(lastErrorAt)
feed.SourceURL = StringValue(sourceURL)
feed.SourceHost = StringValue(sourceHost)
feed.TLD = StringValue(tld)
feed.DomainHost = StringValue(domainHost)
feed.DomainTLD = StringValue(domainTLD)
if itemCount != nil {
feed.ItemCount = *itemCount
}
@@ -285,7 +285,7 @@ func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
discovered_at, last_checked_at, next_check_at, last_build_date,
etag, last_modified,
status, last_error, last_error_at,
source_url, source_host, tld,
source_url, domain_host, domain_tld,
item_count, oldest_item_date, newest_item_date,
no_update,
publish_status, publish_account
@@ -309,7 +309,7 @@ func (c *Crawler) GetFeedCount() (int, error) {
// GetFeedCountByHost returns the number of feeds for a specific host
func (c *Crawler) GetFeedCountByHost(host string) (int, error) {
var count int
err := c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE source_host = $1", host).Scan(&count)
err := c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE domain_host = $1", host).Scan(&count)
return count, err
}
@@ -320,7 +320,7 @@ func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
discovered_at, last_checked_at, next_check_at, last_build_date,
etag, last_modified,
status, last_error, last_error_at,
source_url, source_host, tld,
source_url, domain_host, domain_tld,
item_count, oldest_item_date, newest_item_date,
no_update,
publish_status, publish_account
@@ -344,11 +344,11 @@ func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
discovered_at, last_checked_at, next_check_at, last_build_date,
etag, last_modified,
status, last_error, last_error_at,
source_url, source_host, tld,
source_url, domain_host, domain_tld,
item_count, oldest_item_date, newest_item_date,
no_update,
publish_status, publish_account
FROM feeds WHERE source_host = $1
FROM feeds WHERE domain_host = $1 AND domain_tld = $2
`, stripTLD(host), getTLD(host))
if err != nil {
return nil, err
@@ -366,7 +366,7 @@ func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) {
discovered_at, last_checked_at, next_check_at, last_build_date,
etag, last_modified,
status, last_error, last_error_at,
source_url, source_host, tld,
source_url, domain_host, domain_tld,
item_count, oldest_item_date, newest_item_date,
no_update,
publish_status, publish_account
@@ -390,7 +390,7 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
feed := &Feed{}
var feedType, category, title, description, language, siteURL *string
var lastCheckedAt, nextCheckAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time
var etag, lastModified, lastError, sourceURL, sourceHost, tld *string
var etag, lastModified, lastError, sourceURL, domainHost, domainTLD *string
var itemCount, noUpdate *int
var status *string
var publishStatus, publishAccount *string
@@ -400,7 +400,7 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
&feed.DiscoveredAt, &lastCheckedAt, &nextCheckAt, &lastBuildDate,
&etag, &lastModified,
&status, &lastError, &lastErrorAt,
&sourceURL, &sourceHost, &tld,
&sourceURL, &domainHost, &domainTLD,
&itemCount, &oldestItemDate, &newestItemDate,
&noUpdate,
&publishStatus, &publishAccount,
@@ -428,8 +428,8 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
feed.LastError = StringValue(lastError)
feed.LastErrorAt = TimeValue(lastErrorAt)
feed.SourceURL = StringValue(sourceURL)
feed.SourceHost = StringValue(sourceHost)
feed.TLD = StringValue(tld)
feed.DomainHost = StringValue(domainHost)
feed.DomainTLD = StringValue(domainTLD)
if itemCount != nil {
feed.ItemCount = *itemCount
}
@@ -474,7 +474,7 @@ func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) {
discovered_at, last_checked_at, next_check_at, last_build_date,
etag, last_modified,
status, last_error, last_error_at,
source_url, source_host, tld,
source_url, domain_host, domain_tld,
item_count, oldest_item_date, newest_item_date,
no_update,
publish_status, publish_account
@@ -496,7 +496,7 @@ func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
discovered_at, last_checked_at, next_check_at, last_build_date,
etag, last_modified,
status, last_error, last_error_at,
source_url, source_host, tld,
source_url, domain_host, domain_tld,
item_count, oldest_item_date, newest_item_date,
no_update,
publish_status, publish_account
+4 -4
@@ -33,8 +33,8 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
DiscoveredAt: now,
LastCheckedAt: now,
Status: "pass",
SourceHost: sourceHost,
TLD: getTLD(sourceHost),
DomainHost: getDomainHost(sourceHost),
DomainTLD: getTLD(sourceHost),
ETag: headers.Get("ETag"),
LastModified: headers.Get("Last-Modified"),
}
@@ -90,8 +90,8 @@ func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
DiscoveredAt: now,
Status: "pass",
SourceURL: normalizeURL(sourceURL),
SourceHost: sourceHost,
TLD: getTLD(sourceHost),
DomainHost: getDomainHost(sourceHost),
DomainTLD: getTLD(sourceHost),
NextCheckAt: now, // Should be crawled immediately
}
+2 -2
@@ -445,8 +445,8 @@ const dashboardHTML = `<!DOCTYPE html>
<title>1440.news Feed Crawler</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="/static/dashboard.css?v=1769990750">
<script src="/static/dashboard.js?v=1769990750"></script>
<link rel="stylesheet" href="/static/dashboard.css?v=1769995130">
<script src="/static/dashboard.js?v=1769995130"></script>
</head>
<body>
<div id="topSection">
+6
@@ -61,6 +61,12 @@ func stripTLD(host string) string {
return host
}
// getDomainHost extracts the host part from a full domain (without TLD)
// e.g., "npr.org" -> "npr", "bbc.co.uk" -> "bbc.co"
func getDomainHost(domain string) string {
return stripTLD(domain)
}
// fullHost reconstructs the full hostname from host and tld
// e.g., ("example", "com") -> "example.com"
func fullHost(host, tld string) string {
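The hunk ends at the fullHost signature; its body is not shown. A minimal
sketch consistent with the doc comment above (the empty-host guard is an
assumption, not from the diff):

	func fullHost(host, tld string) string {
		if host == "" {
			return tld // assumed guard for hosts that are a bare TLD
		}
		return host + "." + tld
	}

Together with getDomainHost and getTLD this round-trips a hostname:
fullHost(getDomainHost(h), getTLD(h)) == h for a well-formed h.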