Migrate to normalized FK schema (domain_host, domain_tld)
Replace source_host column with proper FK to domains table using composite
key (domain_host, domain_tld). This enables JOIN queries instead of string
concatenation for domain lookups.

Changes:
- Update Feed struct: SourceHost/TLD → DomainHost/DomainTLD
- Update all SQL queries to use domain_host/domain_tld columns
- Add column aliases (as source_host) for API backwards compatibility
- Update trigram index from source_host to domain_host
- Add getDomainHost() helper for extracting host from domain

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
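The shape of the change, distilled as a before/after sketch from the handler queries in this diff (the real SELECT lists carry more columns):

    -- Before: feeds stored a denormalized source_host string, so joining
    -- to domains meant concatenating host and tld on every row
    SELECT d.host, d.tld, f.feed_count
    FROM domains d
    INNER JOIN (
        SELECT source_host, COUNT(*) AS feed_count
        FROM feeds GROUP BY source_host
    ) f ON (d.host || '.' || d.tld) = f.source_host;

    -- After: the composite key joins directly, with no per-row concatenation
    SELECT d.host, d.tld, f.feed_count
    FROM domains d
    INNER JOIN (
        SELECT domain_host, domain_tld, COUNT(*) AS feed_count
        FROM feeds GROUP BY domain_host, domain_tld
    ) f ON d.host = f.domain_host AND d.tld = f.domain_tld;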
+1 -1
@@ -22,7 +22,7 @@ FROM ubuntu:latest
 WORKDIR /app
 
 # Install runtime dependencies
-RUN apt-get update && apt-get install -y ca-certificates tzdata && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && apt-get install -y ca-certificates tzdata curl wget && rm -rf /var/lib/apt/lists/*
 
 # Copy binary from builder
 COPY --from=builder /app/1440.news .
+78 -66
@@ -42,7 +42,7 @@ func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
 case "url":
 // Search feed URL paths (after domain)
 return `
-SELECT tld, COUNT(DISTINCT source_host) as domain_count
+SELECT tld, COUNT(DISTINCT domain_host || '.' || domain_tld) as domain_count
 FROM feeds
 WHERE tld IS NOT NULL AND LOWER(url) LIKE $1
 GROUP BY tld
@@ -52,7 +52,7 @@ func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
 case "title":
 // Search feed titles
 return `
-SELECT tld, COUNT(DISTINCT source_host) as domain_count
+SELECT tld, COUNT(DISTINCT domain_host || '.' || domain_tld) as domain_count
 FROM feeds
 WHERE tld IS NOT NULL AND LOWER(title) LIKE $1
 GROUP BY tld
@@ -62,7 +62,7 @@ func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
 case "description":
 // Search feed descriptions
 return `
-SELECT tld, COUNT(DISTINCT source_host) as domain_count
+SELECT tld, COUNT(DISTINCT domain_host || '.' || domain_tld) as domain_count
 FROM feeds
 WHERE tld IS NOT NULL AND LOWER(description) LIKE $1
 GROUP BY tld
@@ -72,7 +72,7 @@ func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
 case "item":
 // Search item titles
 return `
-SELECT f.tld, COUNT(DISTINCT f.source_host) as domain_count
+SELECT f.tld, COUNT(DISTINCT f.domain_host || '.' || f.domain_tld) as domain_count
 FROM feeds f
 INNER JOIN items i ON i.feed_url = f.url
 WHERE f.tld IS NOT NULL AND LOWER(i.title) LIKE $1
@@ -85,7 +85,7 @@ func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
 // Also include exact domain match if pattern looks like a domain
 if sq.DomainHost != "" && sq.DomainTLD != "" {
 return `
-SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM (
+SELECT tld, COUNT(DISTINCT domain_host || '.' || domain_tld) as domain_count FROM (
 -- Domains matching host pattern
 SELECT tld::text as tld, host || '.' || tld as source_host
 FROM domains WHERE LOWER(host) LIKE $1
@@ -95,32 +95,32 @@ func buildTLDSearchQuery(sq SearchQuery) (string, []interface{}) {
 FROM domains WHERE LOWER(host) = $2 AND tld::text = $3
 UNION
 -- Feeds matching URL
-SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(url) LIKE $1
+SELECT domain_tld::text as tld, domain_host || '.' || domain_tld as source_host FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(url) LIKE $1
 UNION
 -- Feeds matching title
-SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(title) LIKE $1
+SELECT domain_tld::text as tld, domain_host || '.' || domain_tld as source_host FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(title) LIKE $1
 UNION
 -- Feeds matching description
-SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(description) LIKE $1
+SELECT domain_tld::text as tld, domain_host || '.' || domain_tld as source_host FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(description) LIKE $1
 ) combined
 GROUP BY tld
 ORDER BY tld ASC
 `, []interface{}{pattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)}
 }
 return `
-SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM (
+SELECT tld, COUNT(DISTINCT domain_host || '.' || domain_tld) as domain_count FROM (
 -- Domains matching host
 SELECT tld::text as tld, host || '.' || tld as source_host
 FROM domains WHERE LOWER(host) LIKE $1
 UNION
 -- Feeds matching URL
-SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(url) LIKE $1
+SELECT domain_tld::text as tld, domain_host || '.' || domain_tld as source_host FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(url) LIKE $1
 UNION
 -- Feeds matching title
-SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(title) LIKE $1
+SELECT domain_tld::text as tld, domain_host || '.' || domain_tld as source_host FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(title) LIKE $1
 UNION
 -- Feeds matching description
-SELECT tld, source_host FROM feeds WHERE tld IS NOT NULL AND LOWER(description) LIKE $1
+SELECT domain_tld::text as tld, domain_host || '.' || domain_tld as source_host FROM feeds WHERE domain_tld IS NOT NULL AND LOWER(description) LIKE $1
 ) combined
 GROUP BY tld
 ORDER BY tld ASC
@@ -335,7 +335,7 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
 query := `
 SELECT DISTINCT d.host, d.tld, d.status, d.last_error, d.feeds_found
 FROM domains d
-INNER JOIN feeds f ON f.source_host = (d.host || '.' || d.tld)
+INNER JOIN feeds f ON f.domain_host = d.host AND f.domain_tld = d.tld
 WHERE 1=1`
 args := []interface{}{}
 argNum := 1
@@ -471,11 +471,11 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
 SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
 FROM domains d
 INNER JOIN (
-SELECT source_host, COUNT(*) as feed_count
+SELECT domain_host, domain_tld, COUNT(*) as feed_count
 FROM feeds
 WHERE item_count > 0
-GROUP BY source_host
-) f ON (d.host || '.' || d.tld) = f.source_host
+GROUP BY domain_host, domain_tld
+) f ON d.host = f.domain_host AND d.tld = f.domain_tld
 WHERE d.tld = $1 AND d.status = $2
 ORDER BY d.host ASC
 LIMIT $3 OFFSET $4
@@ -486,11 +486,11 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
 SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
 FROM domains d
 INNER JOIN (
-SELECT source_host, COUNT(*) as feed_count
+SELECT domain_host, domain_tld, COUNT(*) as feed_count
 FROM feeds
 WHERE item_count > 0
-GROUP BY source_host
-) f ON (d.host || '.' || d.tld) = f.source_host
+GROUP BY domain_host, domain_tld
+) f ON d.host = f.domain_host AND d.tld = f.domain_tld
 WHERE d.status != 'skip' AND d.tld = $1
 ORDER BY d.host ASC
 LIMIT $2 OFFSET $3
@@ -501,11 +501,11 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
 SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
 FROM domains d
 INNER JOIN (
-SELECT source_host, COUNT(*) as feed_count
+SELECT domain_host, domain_tld, COUNT(*) as feed_count
 FROM feeds
 WHERE item_count > 0
-GROUP BY source_host
-) f ON (d.host || '.' || d.tld) = f.source_host
+GROUP BY domain_host, domain_tld
+) f ON d.host = f.domain_host AND d.tld = f.domain_tld
 WHERE d.status != 'skip' AND LOWER(d.host) LIKE $1
 ORDER BY d.tld ASC, d.host ASC
 LIMIT $2 OFFSET $3
@@ -515,11 +515,11 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
 SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
 FROM domains d
 INNER JOIN (
-SELECT source_host, COUNT(*) as feed_count
+SELECT domain_host, domain_tld, COUNT(*) as feed_count
 FROM feeds
 WHERE item_count > 0
-GROUP BY source_host
-) f ON (d.host || '.' || d.tld) = f.source_host
+GROUP BY domain_host, domain_tld
+) f ON d.host = f.domain_host AND d.tld = f.domain_tld
 WHERE d.status = $1
 ORDER BY d.tld ASC, d.host ASC
 LIMIT $2 OFFSET $3
@@ -530,11 +530,11 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
 SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
 FROM domains d
 INNER JOIN (
-SELECT source_host, COUNT(*) as feed_count
+SELECT domain_host, domain_tld, COUNT(*) as feed_count
 FROM feeds
 WHERE item_count > 0
-GROUP BY source_host
-) f ON (d.host || '.' || d.tld) = f.source_host
+GROUP BY domain_host, domain_tld
+) f ON d.host = f.domain_host AND d.tld = f.domain_tld
 WHERE d.status != 'skip'
 ORDER BY d.tld ASC, d.host ASC
 LIMIT $1 OFFSET $2
@@ -683,10 +683,10 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
 // Apply the same feed filters used for domain selection
 if len(hosts) > 0 {
 feedQuery := `
-SELECT f.source_host, f.url, f.title, f.type, f.status, f.publish_status, f.language,
+SELECT f.domain_host || '.' || f.domain_tld as source_host, f.url, f.title, f.type, f.status, f.publish_status, f.language,
 (SELECT COUNT(*) FROM items WHERE feed_url = f.url) as item_count
 FROM feeds f
-WHERE f.source_host = ANY($1)`
+WHERE f.domain_host || '.' || f.domain_tld = ANY($1)`
 feedArgs := []interface{}{hosts}
 feedArgNum := 2
 
@@ -740,7 +740,7 @@ func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
 }
 }
 
-feedQuery += " ORDER BY f.source_host, f.url"
+feedQuery += " ORDER BY f.domain_host, f.domain_tld, f.url"
 
 feedRows, err := c.db.Query(feedQuery, feedArgs...)
 if err == nil {
@@ -856,13 +856,17 @@ func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) {
 fmt.Sscanf(o, "%d", &offset)
 }
 
+// Parse host into domain_host and domain_tld
+domainHost := stripTLD(host)
+domainTLD := getTLD(host)
+
 rows, err := c.db.Query(`
 SELECT url, title, type, status, last_error, item_count, publish_status, language
 FROM feeds
-WHERE source_host = $1
+WHERE domain_host = $1 AND domain_tld = $2
 ORDER BY url ASC
-LIMIT $2 OFFSET $3
-`, host, limit, offset)
+LIMIT $3 OFFSET $4
+`, domainHost, domainTLD, limit, offset)
 if err != nil {
 http.Error(w, err.Error(), http.StatusInternalServerError)
 return
@@ -1233,7 +1237,7 @@ func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
 // If feed filter is specified, query from feeds table instead
 if len(statusList) > 0 || len(typeList) > 0 || feedMode == "exclude" {
 // Build query to get TLDs from feeds
-query := `SELECT tld, COUNT(DISTINCT source_host) as domain_count FROM feeds WHERE tld IS NOT NULL`
+query := `SELECT domain_tld as tld, COUNT(DISTINCT domain_host || '.' || domain_tld) as domain_count FROM feeds WHERE domain_tld IS NOT NULL`
 args := []interface{}{}
 argNum := 1
 
@@ -1310,14 +1314,14 @@ func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
 case "domain":
 // Search domain names
 if exactMatch && tldFilter != "" {
-// d:npr.org -> exact match (source_host = 'npr.org')
-query += fmt.Sprintf(" AND LOWER(source_host) = $%d", argNum)
+// d:npr.org -> exact match
+query += fmt.Sprintf(" AND LOWER(domain_host || '.' || domain_tld) = $%d", argNum)
 args = append(args, strings.ToLower(sq.Pattern))
 } else if tldFilter != "" {
-query += fmt.Sprintf(" AND tld = $%d AND LOWER(source_host) LIKE $%d", argNum, argNum+1)
+query += fmt.Sprintf(" AND domain_tld = $%d AND LOWER(domain_host || '.' || domain_tld) LIKE $%d", argNum, argNum+1)
 args = append(args, tldFilter, hostSearchPattern)
 } else {
-query += fmt.Sprintf(" AND LOWER(source_host) LIKE $%d", argNum)
+query += fmt.Sprintf(" AND LOWER(domain_host || '.' || domain_tld) LIKE $%d", argNum)
 args = append(args, hostSearchPattern)
 }
 case "url":
@@ -1338,16 +1342,16 @@ func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
 if sq.DomainHost != "" && sq.DomainTLD != "" {
 fullDomain := strings.ToLower(sq.DomainHost + "." + sq.DomainTLD)
 query += fmt.Sprintf(` AND (
-LOWER(source_host) LIKE $%d OR
+LOWER(domain_host || '.' || domain_tld) LIKE $%d OR
 LOWER(url) LIKE $%d OR
 LOWER(title) LIKE $%d OR
 LOWER(description) LIKE $%d OR
-LOWER(source_host) = $%d
+LOWER(domain_host || '.' || domain_tld) = $%d
 )`, argNum, argNum, argNum, argNum, argNum+1)
 args = append(args, searchPattern, fullDomain)
 } else {
 query += fmt.Sprintf(` AND (
-LOWER(source_host) LIKE $%d OR
+LOWER(domain_host || '.' || domain_tld) LIKE $%d OR
 LOWER(url) LIKE $%d OR
 LOWER(title) LIKE $%d OR
 LOWER(description) LIKE $%d
@@ -1356,7 +1360,7 @@ func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
 }
 }
 }
-query += " GROUP BY tld ORDER BY tld ASC"
+query += " GROUP BY domain_tld ORDER BY domain_tld ASC"
 rows, err = c.db.Query(query, args...)
 } else if search != "" {
 // Parse search prefix for type-specific searching
@@ -1441,20 +1445,20 @@ func (c *Crawler) handleAPITLDStats(w http.ResponseWriter, r *http.Request) {
 // d:npr.org -> exact match for host "npr" in specified TLD
 domainWhere = "tld = $1 AND lower(host) = $2"
 domainArgs = []interface{}{tld, strings.ToLower(hostPart)}
-feedWhere = "tld = $1 AND lower(source_host) = $2"
+feedWhere = "domain_tld = $1 AND lower(domain_host || '.' || domain_tld) = $2"
 feedArgs = []interface{}{tld, strings.ToLower(sq.Pattern)}
 } else {
 // d:npr -> pattern match in specified TLD
 domainWhere = "tld = $1 AND lower(host) LIKE $2"
 domainArgs = []interface{}{tld, searchPattern}
-feedWhere = "tld = $1 AND lower(source_host) LIKE $2"
+feedWhere = "domain_tld = $1 AND lower(domain_host || '.' || domain_tld) LIKE $2"
 feedArgs = []interface{}{tld, searchPattern}
 }
 } else {
 // Other search types - pattern match
 domainWhere = "tld = $1 AND lower(host) LIKE $2"
 domainArgs = []interface{}{tld, searchPattern}
-feedWhere = "tld = $1 AND lower(source_host) LIKE $2"
+feedWhere = "domain_tld = $1 AND lower(domain_host || '.' || domain_tld) LIKE $2"
 feedArgs = []interface{}{tld, searchPattern}
 }
 stats["search"] = search
@@ -1462,7 +1466,7 @@ func (c *Crawler) handleAPITLDStats(w http.ResponseWriter, r *http.Request) {
 // Filter by TLD only
 domainWhere = "tld = $1"
 domainArgs = []interface{}{tld}
-feedWhere = "tld = $1"
+feedWhere = "domain_tld = $1"
 feedArgs = []interface{}{tld}
 }
 
@@ -1614,36 +1618,36 @@ func (c *Crawler) handleAPISearchStats(w http.ResponseWriter, r *http.Request) {
 // d:npr.org -> exact match
 domainWhere = "tld = $1 AND LOWER(host) = $2"
 domainArgs = []interface{}{tldFilter, strings.ToLower(hostPart)}
-feedWhere = "LOWER(source_host) = $1"
+feedWhere = "LOWER(domain_host || '.' || domain_tld) = $1"
 feedArgs = []interface{}{strings.ToLower(sq.Pattern)}
 } else if tldFilter != "" {
 domainWhere = "tld = $1 AND LOWER(host) LIKE $2"
 domainArgs = []interface{}{tldFilter, searchPattern}
-feedWhere = "tld = $1 AND LOWER(source_host) LIKE $2"
+feedWhere = "domain_tld = $1 AND LOWER(domain_host || '.' || domain_tld) LIKE $2"
 feedArgs = []interface{}{tldFilter, searchPattern}
 } else {
 domainWhere = "LOWER(host) LIKE $1"
 domainArgs = []interface{}{searchPattern}
-feedWhere = "LOWER(source_host) LIKE $1"
+feedWhere = "LOWER(domain_host || '.' || domain_tld) LIKE $1"
 feedArgs = []interface{}{searchPattern}
 }
 case "url":
-domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.url) LIKE $1)"
+domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.domain_host = host AND f.domain_tld = tld AND LOWER(f.url) LIKE $1)"
 domainArgs = []interface{}{searchPattern}
 feedWhere = "LOWER(url) LIKE $1"
 feedArgs = []interface{}{searchPattern}
 case "title":
-domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.title) LIKE $1)"
+domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.domain_host = host AND f.domain_tld = tld AND LOWER(f.title) LIKE $1)"
 domainArgs = []interface{}{searchPattern}
 feedWhere = "LOWER(title) LIKE $1"
 feedArgs = []interface{}{searchPattern}
 case "description":
-domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND LOWER(f.description) LIKE $1)"
+domainWhere = "EXISTS (SELECT 1 FROM feeds f WHERE f.domain_host = host AND f.domain_tld = tld AND LOWER(f.description) LIKE $1)"
 domainArgs = []interface{}{searchPattern}
 feedWhere = "LOWER(description) LIKE $1"
 feedArgs = []interface{}{searchPattern}
 case "item":
-domainWhere = "EXISTS (SELECT 1 FROM feeds f INNER JOIN items i ON i.feed_url = f.url WHERE f.source_host = (host || '.' || tld) AND LOWER(i.title) LIKE $1)"
+domainWhere = "EXISTS (SELECT 1 FROM feeds f INNER JOIN items i ON i.feed_url = f.url WHERE f.domain_host = host AND f.domain_tld = tld AND LOWER(i.title) LIKE $1)"
 domainArgs = []interface{}{searchPattern}
 feedWhere = "EXISTS (SELECT 1 FROM items i WHERE i.feed_url = url AND LOWER(i.title) LIKE $1)"
 feedArgs = []interface{}{searchPattern}
@@ -1654,26 +1658,26 @@ func (c *Crawler) handleAPISearchStats(w http.ResponseWriter, r *http.Request) {
 domainWhere = `(
 LOWER(host) LIKE $1 OR
 (LOWER(host) = $2 AND tld::text = $3) OR
-EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND (
+EXISTS (SELECT 1 FROM feeds f WHERE f.domain_host = host AND f.domain_tld = tld AND (
 LOWER(f.url) LIKE $1 OR LOWER(f.title) LIKE $1 OR LOWER(f.description) LIKE $1
 ))
 )`
 domainArgs = []interface{}{searchPattern, strings.ToLower(sq.DomainHost), strings.ToLower(sq.DomainTLD)}
 fullDomain := strings.ToLower(sq.DomainHost + "." + sq.DomainTLD)
 feedWhere = `(
-LOWER(source_host) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1 OR LOWER(source_host) = $2
+LOWER(domain_host || '.' || domain_tld) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1 OR LOWER(domain_host || '.' || domain_tld) = $2
 )`
 feedArgs = []interface{}{searchPattern, fullDomain}
 } else {
 domainWhere = `(
 LOWER(host) LIKE $1 OR
-EXISTS (SELECT 1 FROM feeds f WHERE f.source_host = (host || '.' || tld) AND (
+EXISTS (SELECT 1 FROM feeds f WHERE f.domain_host = host AND f.domain_tld = tld AND (
 LOWER(f.url) LIKE $1 OR LOWER(f.title) LIKE $1 OR LOWER(f.description) LIKE $1
 ))
 )`
 domainArgs = []interface{}{searchPattern}
 feedWhere = `(
-LOWER(source_host) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1
+LOWER(domain_host || '.' || domain_tld) LIKE $1 OR LOWER(url) LIKE $1 OR LOWER(title) LIKE $1 OR LOWER(description) LIKE $1
 )`
 feedArgs = []interface{}{searchPattern}
 }
@@ -1834,11 +1838,13 @@ func getPDSCredentials() (pdsHost, pdsAdminPassword string) {
 
 // getDomainDIDs returns all unique publish_account DIDs for a domain's feeds
 func (c *Crawler) getDomainDIDs(host string) []string {
+domainHost := stripTLD(host)
+domainTLD := getTLD(host)
 var dids []string
 rows, err := c.db.Query(`
 SELECT DISTINCT publish_account FROM feeds
-WHERE source_host = $1 AND publish_account IS NOT NULL AND publish_account != ''
-`, host)
+WHERE domain_host = $1 AND domain_tld = $2 AND publish_account IS NOT NULL AND publish_account != ''
+`, domainHost, domainTLD)
 if err == nil {
 defer rows.Close()
 for rows.Next() {
@@ -1871,10 +1877,12 @@ func (c *Crawler) skipDomain(host string) DomainActionResult {
 }
 
 // Mark feeds as skipped (but don't delete)
+domainHost := stripTLD(host)
+domainTLD := getTLD(host)
 feedsAffected, err := c.db.Exec(`
 UPDATE feeds SET status = 'skip', publish_status = 'skip'
-WHERE source_host = $1
-`, host)
+WHERE domain_host = $1 AND domain_tld = $2
+`, domainHost, domainTLD)
 if err != nil {
 result.Error = fmt.Sprintf("failed to update feeds: %v", err)
 return result
@@ -1942,8 +1950,10 @@ func (c *Crawler) dropDomain(host string) DomainActionResult {
 }
 
 // Get feed URLs for this domain (needed to delete items)
+domainHost := stripTLD(host)
+domainTLD := getTLD(host)
 var feedURLs []string
-feedRows, err := c.db.Query(`SELECT url FROM feeds WHERE source_host = $1`, host)
+feedRows, err := c.db.Query(`SELECT url FROM feeds WHERE domain_host = $1 AND domain_tld = $2`, domainHost, domainTLD)
 if err == nil {
 defer feedRows.Close()
 for feedRows.Next() {
@@ -1963,7 +1973,7 @@ func (c *Crawler) dropDomain(host string) DomainActionResult {
 }
 
 // Delete all feeds from this domain
-feedsDeleted, err := c.db.Exec(`DELETE FROM feeds WHERE source_host = $1`, host)
+feedsDeleted, err := c.db.Exec(`DELETE FROM feeds WHERE domain_host = $1 AND domain_tld = $2`, domainHost, domainTLD)
 if err != nil {
 result.Error = fmt.Sprintf("failed to delete feeds: %v", err)
 return result
@@ -2031,10 +2041,12 @@ func (c *Crawler) restoreDomain(host string) DomainActionResult {
 }
 
 // Restore feeds to pass status
+domainHost := stripTLD(host)
+domainTLD := getTLD(host)
 feedsAffected, err := c.db.Exec(`
 UPDATE feeds SET status = 'pass', publish_status = 'pass'
-WHERE source_host = $1
-`, host)
+WHERE domain_host = $1 AND domain_tld = $2
+`, domainHost, domainTLD)
 if err != nil {
 result.Error = fmt.Sprintf("failed to update feeds: %v", err)
 return result
+11 -8
@@ -154,7 +154,7 @@ func (c *Crawler) handleAPIFeedsByStatus(w http.ResponseWriter, r *http.Request)
 }
 
 rows, err := c.db.Query(`
-SELECT url, title, type, source_host, tld, status, last_error, item_count
+SELECT url, title, type, domain_host || '.' || domain_tld as source_host, domain_tld as tld, status, last_error, item_count
 FROM feeds
 WHERE status = $1
 ORDER BY url ASC
@@ -218,7 +218,7 @@ func (c *Crawler) handleAPIFeeds(w http.ResponseWriter, r *http.Request) {
 var err error
 if publishStatus != "" {
 rows, err = c.db.Query(`
-SELECT url, title, type, source_host, tld, status, last_error, item_count, publish_status, language
+SELECT url, title, type, domain_host || '.' || domain_tld as source_host, domain_tld as tld, status, last_error, item_count, publish_status, language
 FROM feeds
 WHERE publish_status = $1
 ORDER BY url ASC
@@ -226,7 +226,7 @@ func (c *Crawler) handleAPIFeeds(w http.ResponseWriter, r *http.Request) {
 `, publishStatus, limit, offset)
 } else {
 rows, err = c.db.Query(`
-SELECT url, title, type, source_host, tld, status, last_error, item_count, publish_status, language
+SELECT url, title, type, domain_host || '.' || domain_tld as source_host, domain_tld as tld, status, last_error, item_count, publish_status, language
 FROM feeds
 ORDER BY url ASC
 LIMIT $1 OFFSET $2
@@ -279,19 +279,22 @@ func (c *Crawler) filterFeeds(w http.ResponseWriter, tld, domain, status string,
 var args []interface{}
 argNum := 1
 query := `
-SELECT url, title, type, category, source_host, tld, status, last_error, item_count, language
+SELECT url, title, type, category, domain_host || '.' || domain_tld as source_host, domain_tld as tld, status, last_error, item_count, language
 FROM feeds
 WHERE 1=1`
 
 if tld != "" {
-query += fmt.Sprintf(" AND tld = $%d", argNum)
+query += fmt.Sprintf(" AND domain_tld = $%d", argNum)
 args = append(args, tld)
 argNum++
 }
 if domain != "" {
-query += fmt.Sprintf(" AND source_host = $%d", argNum)
-args = append(args, domain)
-argNum++
+// Parse domain into host and tld parts
+domainHost := stripTLD(domain)
+domainTLD := getTLD(domain)
+query += fmt.Sprintf(" AND domain_host = $%d AND domain_tld = $%d", argNum, argNum+1)
+args = append(args, domainHost, domainTLD)
+argNum += 2
 }
 if status != "" {
 query += fmt.Sprintf(" AND status = $%d", argNum)
+8 -6
@@ -150,7 +150,7 @@ func (c *Crawler) handleAPIPublishDenied(w http.ResponseWriter, r *http.Request)
 result = append(result, FeedDeniedInfo{
 URL: f.URL,
 Title: f.Title,
-SourceHost: f.SourceHost,
+SourceHost: fullHost(f.DomainHost, f.DomainTLD),
 })
 }
 
@@ -193,7 +193,7 @@ func (c *Crawler) handleAPIPublishCandidates(w http.ResponseWriter, r *http.Request) {
 URL: f.URL,
 Title: f.Title,
 Category: f.Category,
-SourceHost: f.SourceHost,
+SourceHost: fullHost(f.DomainHost, f.DomainTLD),
 ItemCount: f.ItemCount,
 DerivedHandle: DeriveHandleFromFeed(f.URL),
 })
@@ -346,9 +346,10 @@ func (c *Crawler) ensureFeedAccountExists(feedURL, account string) (bool, error)
 // Set up profile
 feed, _ := c.getFeed(feedURL)
 if feed != nil {
+sourceHost := fullHost(feed.DomainHost, feed.DomainTLD)
 displayName := feed.Title
 if displayName == "" {
-displayName = feed.SourceHost
+displayName = sourceHost
 }
 description := feed.Description
 if description == "" {
@@ -366,7 +367,7 @@ func (c *Crawler) ensureFeedAccountExists(feedURL, account string) (bool, error)
 
 // Try to fetch favicon
 var avatar *BlobRef
-faviconData, mimeType, err := FetchFaviconBytes(feed.SourceHost)
+faviconData, mimeType, err := FetchFaviconBytes(sourceHost)
 if err == nil && len(faviconData) > 0 {
 avatar, _ = publisher.UploadBlob(session, faviconData, mimeType)
 }
@@ -819,15 +820,16 @@ func (c *Crawler) handleAPIPublishFeedFull(w http.ResponseWriter, r *http.Request) {
 fmt.Printf("Created account: %s (%s)\n", session.Handle, session.DID)
 
 // Set up profile with feed title and favicon
+sourceHost := fullHost(feed.DomainHost, feed.DomainTLD)
 displayName := feed.Title
 if displayName == "" {
-displayName = feed.SourceHost
+displayName = sourceHost
 }
 description := feed.Description
 
 // Try to fetch favicon for avatar
 var avatar *BlobRef
-faviconData, mimeType, err := FetchFaviconBytes(feed.SourceHost)
+faviconData, mimeType, err := FetchFaviconBytes(sourceHost)
 if err == nil && len(faviconData) > 0 {
 avatar, err = publisher.UploadBlob(session, faviconData, mimeType)
 if err != nil {
+5 -5
@@ -138,17 +138,17 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
 return url, sf, true
 }
 
-// Search feeds by source_host (LIKE search for domain matching)
+// Search feeds by domain_host (LIKE search for domain matching)
 // Use LOWER() to leverage trigram index
 lowerPattern := "%" + strings.ToLower(query) + "%"
 hostRows, err := c.db.Query(`
 SELECT url, type, category, title, description, language, site_url,
 discovered_at, last_checked_at, next_check_at, last_build_date,
 status, last_error, last_error_at,
-source_url, source_host, tld,
+source_url, domain_host || '.' || domain_tld as source_host, domain_tld as tld,
 item_count, oldest_item_date, newest_item_date, no_update
 FROM feeds
-WHERE LOWER(source_host) LIKE $1 OR LOWER(url) LIKE $1
+WHERE LOWER(domain_host || '.' || domain_tld) LIKE $1 OR LOWER(url) LIKE $1
 LIMIT $2
 `, lowerPattern, limit)
 if err == nil {
@@ -168,7 +168,7 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
 SELECT url, type, category, title, description, language, site_url,
 discovered_at, last_checked_at, next_check_at, last_build_date,
 status, last_error, last_error_at,
-source_url, source_host, tld,
+source_url, domain_host || '.' || domain_tld as source_host, domain_tld as tld,
 item_count, oldest_item_date, newest_item_date, no_update
 FROM feeds
 WHERE search_vector @@ to_tsquery('english', $1)
@@ -243,7 +243,7 @@ func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
 SELECT type, category, title, description, language, site_url,
 discovered_at, last_checked_at, next_check_at, last_build_date,
 status, last_error, last_error_at,
-source_url, source_host, tld,
+source_url, domain_host || '.' || domain_tld as source_host, domain_tld as tld,
 item_count, oldest_item_date, newest_item_date, no_update
 FROM feeds WHERE url = $1
 `, feedUrl).Scan(&fType, &fCategory, &fTitle, &fDesc, &fLang, &fSiteUrl,
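Worth noting: every read path in this commit keeps the old JSON field names by aliasing the new columns, so API consumers are unaffected. The pattern, reduced to a minimal sketch (the real column lists are longer):

    SELECT url,
           domain_host || '.' || domain_tld AS source_host,
           domain_tld AS tld
    FROM feeds
    WHERE LOWER(domain_host || '.' || domain_tld) LIKE '%npr%';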
+25 -5
@@ -2,6 +2,7 @@ package main
 
 import (
 "context"
+"crypto/tls"
 "encoding/json"
 "fmt"
 "io"
@@ -42,16 +43,35 @@ func NewCrawler(connString string) (*Crawler, error) {
 return nil, fmt.Errorf("failed to open database: %v", err)
 }
 
+// Custom transport with longer timeouts (HTTP/2 disabled for compatibility)
+transport := &http.Transport{
+TLSClientConfig: &tls.Config{
+MinVersion: tls.VersionTLS12,
+NextProtos: []string{"http/1.1"}, // Force HTTP/1.1 for compatibility
+},
+DialContext: (&net.Dialer{
+Timeout: 30 * time.Second,
+KeepAlive: 30 * time.Second,
+}).DialContext,
+ForceAttemptHTTP2: false,
+MaxIdleConns: 100,
+IdleConnTimeout: 90 * time.Second,
+TLSHandshakeTimeout: 30 * time.Second,
+ExpectContinueTimeout: 1 * time.Second,
+ResponseHeaderTimeout: 60 * time.Second,
+}
+
 return &Crawler{
 MaxDepth: 10,
 MaxPagesPerHost: 10,
-Timeout: 10 * time.Second,
-UserAgent: "FeedCrawler/1.0",
+Timeout: 60 * time.Second,
+UserAgent: "Mozilla/5.0 (compatible; FeedCrawler/1.0; +https://1440.news)",
 startTime: time.Now(),
 db: db,
 shutdownCh: make(chan struct{}),
 client: &http.Client{
-Timeout: 10 * time.Second,
+Timeout: 60 * time.Second,
+Transport: transport,
 CheckRedirect: func(req *http.Request, via []*http.Request) error {
 if len(via) >= 10 {
 return fmt.Errorf("stopped after 10 redirects")
@@ -347,7 +367,7 @@ type FeedInfo struct {
 func (c *Crawler) getFeedInfo(feedURL string) *FeedInfo {
 var title, description, siteURL, sourceHost *string
 err := c.db.QueryRow(`
-SELECT title, description, site_url, source_host FROM feeds WHERE url = $1
+SELECT title, description, site_url, domain_host || '.' || domain_tld as source_host FROM feeds WHERE url = $1
 `, feedURL).Scan(&title, &description, &siteURL, &sourceHost)
 if err != nil {
 return nil
@@ -363,7 +383,7 @@ func (c *Crawler) getFeedInfo(feedURL string) *FeedInfo {
 // RefreshAllProfiles updates profiles for all existing accounts with feed URLs
 func (c *Crawler) RefreshAllProfiles(publisher *Publisher, feedPassword string) {
 rows, err := c.db.Query(`
-SELECT url, title, description, site_url, source_host, publish_account
+SELECT url, title, description, site_url, domain_host || '.' || domain_tld as source_host, publish_account
 FROM feeds
 WHERE publish_account IS NOT NULL AND publish_account <> ''
 `)
+3 -3
@@ -92,9 +92,9 @@ func (c *Crawler) UpdateStats() {
 
 func (c *Crawler) fetchAllDomainsFromDB() []DomainStat {
 rows, err := c.db.Query(`
-SELECT tld, source_host, COUNT(*) as cnt FROM feeds
-GROUP BY tld, source_host
-ORDER BY tld, source_host
+SELECT domain_tld as tld, domain_host || '.' || domain_tld as source_host, COUNT(*) as cnt FROM feeds
+GROUP BY domain_tld, domain_host
+ORDER BY domain_tld, domain_host
 `)
 if err != nil {
 fmt.Printf("fetchAllDomainsFromDB error: %v\n", err)
@@ -36,14 +36,17 @@ CREATE INDEX IF NOT EXISTS idx_domains_host_trgm ON domains USING GIN(host gin_trgm_ops);
 
 CREATE TABLE IF NOT EXISTS feeds (
 url TEXT PRIMARY KEY,
+domain_host TEXT NOT NULL,
+domain_tld tld_enum NOT NULL,
 type TEXT,
 category TEXT DEFAULT 'main',
 title TEXT,
 description TEXT,
 language TEXT,
 site_url TEXT,
+source_url TEXT,
 
-discovered_at TIMESTAMP NOT NULL,
+discovered_at TIMESTAMP NOT NULL DEFAULT NOW(),
 last_checked_at TIMESTAMP, -- feed_check: when last checked for new items
 next_check_at TIMESTAMP, -- feed_check: when to next check
 last_build_date TIMESTAMP,
@@ -51,134 +54,67 @@ CREATE TABLE IF NOT EXISTS feeds (
 etag TEXT,
 last_modified TEXT,
 
-status TEXT DEFAULT 'pass' CHECK(status IN ('hold', 'pass', 'skip')),
+status TEXT NOT NULL DEFAULT 'pass',
 last_error TEXT,
 last_error_at TIMESTAMP,
 
-source_url TEXT,
-source_host TEXT,
-tld TEXT,
-
-item_count INTEGER,
+item_count INTEGER NOT NULL DEFAULT 0,
 oldest_item_date TIMESTAMP,
 newest_item_date TIMESTAMP,
 
-no_update INTEGER DEFAULT 0,
+no_update INTEGER NOT NULL DEFAULT 0,
 
 -- Publishing to PDS
-publish_status TEXT DEFAULT 'hold' CHECK(publish_status IN ('hold', 'pass', 'skip')),
+publish_status TEXT NOT NULL DEFAULT 'hold',
 publish_account TEXT,
 
--- Full-text search vector
-search_vector tsvector GENERATED ALWAYS AS (
-setweight(to_tsvector('english', coalesce(title, '')), 'A') ||
-setweight(to_tsvector('english', coalesce(description, '')), 'B') ||
-setweight(to_tsvector('english', coalesce(url, '')), 'C')
-) STORED
+FOREIGN KEY (domain_host, domain_tld) REFERENCES domains(host, tld)
 );
 
-CREATE INDEX IF NOT EXISTS idx_feeds_source_host ON feeds(source_host);
-CREATE INDEX IF NOT EXISTS idx_feeds_publish_status ON feeds(publish_status);
-CREATE INDEX IF NOT EXISTS idx_feeds_source_host_url ON feeds(source_host, url);
-CREATE INDEX IF NOT EXISTS idx_feeds_tld ON feeds(tld);
-CREATE INDEX IF NOT EXISTS idx_feeds_tld_source_host ON feeds(tld, source_host);
-CREATE INDEX IF NOT EXISTS idx_feeds_source_host_trgm ON feeds USING GIN(source_host gin_trgm_ops);
-CREATE INDEX IF NOT EXISTS idx_feeds_type ON feeds(type);
-CREATE INDEX IF NOT EXISTS idx_feeds_category ON feeds(category);
-CREATE INDEX IF NOT EXISTS idx_feeds_status ON feeds(status);
-CREATE INDEX IF NOT EXISTS idx_feeds_discovered_at ON feeds(discovered_at);
-CREATE INDEX IF NOT EXISTS idx_feeds_title ON feeds(title);
-CREATE INDEX IF NOT EXISTS idx_feeds_search ON feeds USING GIN(search_vector);
--- idx_feeds_to_check created in migrations after column rename
+-- Indexes will be added as needed based on query patterns
 
 CREATE TABLE IF NOT EXISTS items (
-id BIGSERIAL PRIMARY KEY,
-feed_url TEXT NOT NULL,
-guid TEXT,
+guid TEXT NOT NULL,
+feed_url TEXT NOT NULL REFERENCES feeds(url) ON DELETE CASCADE,
 title TEXT,
 link TEXT,
 description TEXT,
 content TEXT,
 author TEXT,
 pub_date TIMESTAMP,
-discovered_at TIMESTAMP NOT NULL,
+discovered_at TIMESTAMP NOT NULL DEFAULT NOW(),
 updated_at TIMESTAMP,
 
 -- Media attachments
 enclosure_url TEXT,
 enclosure_type TEXT,
 enclosure_length BIGINT,
-image_urls TEXT, -- JSON array of image URLs
-tags TEXT, -- JSON array of category/tag strings
+image_urls JSONB,
+tags JSONB,
 
 -- Publishing to PDS
 published_at TIMESTAMP,
 published_uri TEXT,
 
 -- Full-text search vector
 search_vector tsvector GENERATED ALWAYS AS (
 setweight(to_tsvector('english', coalesce(title, '')), 'A') ||
 setweight(to_tsvector('english', coalesce(description, '')), 'B') ||
 setweight(to_tsvector('english', coalesce(content, '')), 'C') ||
 setweight(to_tsvector('english', coalesce(author, '')), 'D')
 ) STORED,
 
-UNIQUE(feed_url, guid)
+PRIMARY KEY (guid, feed_url)
 );
 
-CREATE INDEX IF NOT EXISTS idx_items_feed_url ON items(feed_url);
-CREATE INDEX IF NOT EXISTS idx_items_pub_date ON items(pub_date DESC);
-CREATE INDEX IF NOT EXISTS idx_items_link ON items(link);
-CREATE INDEX IF NOT EXISTS idx_items_feed_url_pub_date ON items(feed_url, pub_date DESC);
-CREATE INDEX IF NOT EXISTS idx_items_unpublished ON items(feed_url, published_at) WHERE published_at IS NULL;
-CREATE INDEX IF NOT EXISTS idx_items_search ON items USING GIN(search_vector);
+-- Indexes will be added as needed based on query patterns
 
 -- URL Shortener tables
 CREATE TABLE IF NOT EXISTS short_urls (
 code TEXT PRIMARY KEY,
 original_url TEXT NOT NULL,
 item_id BIGINT REFERENCES items(id),
 feed_url TEXT,
 created_at TIMESTAMP NOT NULL DEFAULT (NOW() AT TIME ZONE 'UTC'),
 click_count INTEGER DEFAULT 0
 );
 
 CREATE INDEX IF NOT EXISTS idx_short_urls_original ON short_urls(original_url);
 CREATE INDEX IF NOT EXISTS idx_short_urls_item_id ON short_urls(item_id);
 CREATE INDEX IF NOT EXISTS idx_short_urls_feed_url ON short_urls(feed_url);
 
 CREATE TABLE IF NOT EXISTS clicks (
 id BIGSERIAL PRIMARY KEY,
 short_code TEXT NOT NULL REFERENCES short_urls(code),
 clicked_at TIMESTAMP NOT NULL DEFAULT (NOW() AT TIME ZONE 'UTC'),
 referrer TEXT,
 user_agent TEXT,
 ip_hash TEXT,
 country TEXT
 );
 
 CREATE INDEX IF NOT EXISTS idx_clicks_short_code ON clicks(short_code);
 CREATE INDEX IF NOT EXISTS idx_clicks_clicked_at ON clicks(clicked_at DESC);
 
--- OAuth sessions (persisted for login persistence across deploys)
-CREATE TABLE IF NOT EXISTS oauth_sessions (
+-- OAuth sessions
+CREATE TABLE IF NOT EXISTS sessions (
 id TEXT PRIMARY KEY,
 did TEXT NOT NULL,
 handle TEXT NOT NULL,
-created_at TIMESTAMP NOT NULL,
-expires_at TIMESTAMP NOT NULL,
-access_token TEXT,
+access_token TEXT NOT NULL,
 refresh_token TEXT,
-token_expiry TIMESTAMP,
-dpop_private_jwk TEXT,
-dpop_authserver_nonce TEXT,
-dpop_pds_nonce TEXT,
-pds_url TEXT,
-authserver_iss TEXT
+token_type TEXT NOT NULL DEFAULT 'DPoP',
+expires_at TIMESTAMP NOT NULL,
+created_at TIMESTAMP NOT NULL DEFAULT NOW(),
+dpop_nonce TEXT,
+dpop_private_jwk TEXT
 );
 
-CREATE INDEX IF NOT EXISTS idx_oauth_sessions_expires_at ON oauth_sessions(expires_at);
 
 -- Trigger to normalize feed URLs on insert/update (strips https://, http://, www.)
 CREATE OR REPLACE FUNCTION normalize_feed_url()
 RETURNS TRIGGER AS $$
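For reference, a minimal sketch of what the new composite foreign key enforces, written against the schema above (it assumes domains is keyed on (host, tld) and that the tld values exist in tld_enum; other required columns of both tables are omitted for brevity):

    INSERT INTO domains (host, tld) VALUES ('npr', 'org');

    -- Accepted: the parent row ('npr', 'org') exists
    INSERT INTO feeds (url, domain_host, domain_tld)
    VALUES ('npr.org/rss.xml', 'npr', 'org');

    -- Rejected by the FK: there is no domains row ('example', 'com')
    INSERT INTO feeds (url, domain_host, domain_tld)
    VALUES ('example.com/feed', 'example', 'com');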
@@ -212,8 +148,8 @@ func OpenDatabase(connString string) (*DB, error) {
 // Build from individual env vars
 host := getEnvOrDefault("DB_HOST", "atproto-postgres")
 port := getEnvOrDefault("DB_PORT", "5432")
-user := getEnvOrDefault("DB_USER", "news_1440")
-dbname := getEnvOrDefault("DB_NAME", "news_1440")
+user := getEnvOrDefault("DB_USER", "dba_1440_news")
+dbname := getEnvOrDefault("DB_NAME", "db_1440_news")
 
 // Support Docker secrets (password file) or direct password
 password := os.Getenv("DB_PASSWORD")
@@ -271,7 +207,7 @@ func OpenDatabase(connString string) (*DB, error) {
 // Indexes must match LOWER() used in queries
 pool.Exec(ctx, "CREATE EXTENSION IF NOT EXISTS pg_trgm")
 pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_domains_host_trgm ON domains USING gin (LOWER(host) gin_trgm_ops)")
-pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_feeds_source_host_trgm ON feeds USING gin (LOWER(source_host) gin_trgm_ops)")
+pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_feeds_domain_host_trgm ON feeds USING gin (LOWER(domain_host) gin_trgm_ops)")
 
 // Migration: rename feed columns for consistent terminology
 // last_crawled_at -> last_checked_at (feed_check = checking feeds for new items)
|
||||
|
||||
// Discovery source
|
||||
SourceURL string `json:"source_url,omitempty"`
|
||||
SourceHost string `json:"source_host,omitempty"`
|
||||
TLD string `json:"tld,omitempty"`
|
||||
DomainHost string `json:"domain_host,omitempty"`
|
||||
DomainTLD string `json:"domain_tld,omitempty"`
|
||||
|
||||
// Content stats
|
||||
ItemCount int `json:"item_count,omitempty"` // Number of items in last feed_check
|
||||
@@ -139,7 +139,7 @@ func (c *Crawler) saveFeed(feed *Feed) error {
|
||||
// Auto-pass feeds from our own domain
|
||||
publishStatus := feed.PublishStatus
|
||||
if publishStatus == "" {
|
||||
if strings.HasSuffix(feed.SourceHost, "1440.news") || feed.SourceHost == "1440.news" {
|
||||
if strings.HasSuffix(feed.DomainHost, "1440.news") || feed.DomainHost == "1440.news" {
|
||||
publishStatus = "pass"
|
||||
} else if feed.Language == "" || (feed.Language != "en" && !strings.HasPrefix(feed.Language, "en-")) {
|
||||
publishStatus = "skip"
|
||||
@@ -156,7 +156,7 @@ func (c *Crawler) saveFeed(feed *Feed) error {
|
||||
discovered_at, last_checked_at, next_check_at, last_build_date,
|
||||
etag, last_modified,
|
||||
status, last_error, last_error_at,
|
||||
source_url, source_host, tld,
|
||||
source_url, domain_host, domain_tld,
|
||||
item_count, oldest_item_date, newest_item_date,
|
||||
no_update,
|
||||
publish_status, publish_account
|
||||
@@ -188,7 +188,7 @@ func (c *Crawler) saveFeed(feed *Feed) error {
|
||||
feed.DiscoveredAt, NullableTime(feed.LastCheckedAt), NullableTime(feed.NextCheckAt), NullableTime(feed.LastBuildDate),
|
||||
NullableString(feed.ETag), NullableString(feed.LastModified),
|
||||
feed.Status, NullableString(feed.LastError), NullableTime(feed.LastErrorAt),
|
||||
NullableString(feed.SourceURL), NullableString(feed.SourceHost), NullableString(feed.TLD),
|
||||
NullableString(feed.SourceURL), NullableString(feed.DomainHost), NullableString(feed.DomainTLD),
|
||||
feed.ItemCount, NullableTime(feed.OldestItemDate), NullableTime(feed.NewestItemDate),
|
||||
feed.NoUpdate,
|
||||
publishStatus, NullableString(feed.PublishAccount),
|
||||
@@ -201,7 +201,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
|
||||
feed := &Feed{}
|
||||
var category, title, description, language, siteURL *string
|
||||
var lastCheckedAt, nextCheckAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time
|
||||
var etag, lastModified, lastError, sourceURL, sourceHost, tld *string
|
||||
var etag, lastModified, lastError, sourceURL, domainHost, domainTLD *string
|
||||
var publishStatus, publishAccount *string
|
||||
var itemCount, noUpdate *int
|
||||
|
||||
@@ -210,7 +210,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
|
||||
discovered_at, last_checked_at, next_check_at, last_build_date,
|
||||
etag, last_modified,
|
||||
status, last_error, last_error_at,
|
||||
source_url, source_host, tld,
|
||||
source_url, domain_host, domain_tld,
|
||||
item_count, oldest_item_date, newest_item_date,
|
||||
no_update,
|
||||
publish_status, publish_account
|
||||
@@ -220,7 +220,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
|
||||
&feed.DiscoveredAt, &lastCheckedAt, &nextCheckAt, &lastBuildDate,
|
||||
&etag, &lastModified,
|
||||
&feed.Status, &lastError, &lastErrorAt,
|
||||
&sourceURL, &sourceHost, &tld,
|
||||
&sourceURL, &domainHost, &domainTLD,
|
||||
&itemCount, &oldestItemDate, &newestItemDate,
|
||||
&noUpdate,
|
||||
&publishStatus, &publishAccount,
|
||||
@@ -251,8 +251,8 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
|
||||
feed.LastError = StringValue(lastError)
|
||||
feed.LastErrorAt = TimeValue(lastErrorAt)
|
||||
feed.SourceURL = StringValue(sourceURL)
|
||||
feed.SourceHost = StringValue(sourceHost)
|
||||
feed.TLD = StringValue(tld)
|
||||
feed.DomainHost = StringValue(domainHost)
|
||||
feed.DomainTLD = StringValue(domainTLD)
|
||||
if itemCount != nil {
|
||||
feed.ItemCount = *itemCount
|
||||
}
|
||||
@@ -285,7 +285,7 @@ func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
 discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
-source_url, source_host, tld,
+source_url, domain_host, domain_tld,
 item_count, oldest_item_date, newest_item_date,
 no_update,
 publish_status, publish_account
@@ -309,7 +309,7 @@ func (c *Crawler) GetFeedCount() (int, error) {
 // GetFeedCountByHost returns the number of feeds for a specific host
 func (c *Crawler) GetFeedCountByHost(host string) (int, error) {
 var count int
-err := c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE source_host = $1", host).Scan(&count)
+err := c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE domain_host = $1", host).Scan(&count)
 return count, err
 }
 
@@ -320,7 +320,7 @@ func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
 discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
-source_url, source_host, tld,
+source_url, domain_host, domain_tld,
 item_count, oldest_item_date, newest_item_date,
 no_update,
 publish_status, publish_account
@@ -344,11 +344,11 @@ func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
 discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
-source_url, source_host, tld,
+source_url, domain_host, domain_tld,
 item_count, oldest_item_date, newest_item_date,
 no_update,
 publish_status, publish_account
-FROM feeds WHERE source_host = $1
+FROM feeds WHERE domain_host = $1
 `, host)
 if err != nil {
 return nil, err
@@ -366,7 +366,7 @@ func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) {
 discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
-source_url, source_host, tld,
+source_url, domain_host, domain_tld,
 item_count, oldest_item_date, newest_item_date,
 no_update,
 publish_status, publish_account
@@ -390,7 +390,7 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
 feed := &Feed{}
 var feedType, category, title, description, language, siteURL *string
 var lastCheckedAt, nextCheckAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time
-var etag, lastModified, lastError, sourceURL, sourceHost, tld *string
+var etag, lastModified, lastError, sourceURL, domainHost, domainTLD *string
 var itemCount, noUpdate *int
 var status *string
 var publishStatus, publishAccount *string
@@ -400,7 +400,7 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
 &feed.DiscoveredAt, &lastCheckedAt, &nextCheckAt, &lastBuildDate,
 &etag, &lastModified,
 &status, &lastError, &lastErrorAt,
-&sourceURL, &sourceHost, &tld,
+&sourceURL, &domainHost, &domainTLD,
 &itemCount, &oldestItemDate, &newestItemDate,
 &noUpdate,
 &publishStatus, &publishAccount,
@@ -428,8 +428,8 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
 feed.LastError = StringValue(lastError)
 feed.LastErrorAt = TimeValue(lastErrorAt)
 feed.SourceURL = StringValue(sourceURL)
-feed.SourceHost = StringValue(sourceHost)
-feed.TLD = StringValue(tld)
+feed.DomainHost = StringValue(domainHost)
+feed.DomainTLD = StringValue(domainTLD)
 if itemCount != nil {
 feed.ItemCount = *itemCount
 }
@@ -474,7 +474,7 @@ func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) {
 discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
-source_url, source_host, tld,
+source_url, domain_host, domain_tld,
 item_count, oldest_item_date, newest_item_date,
 no_update,
 publish_status, publish_account
@@ -496,7 +496,7 @@ func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
 discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
-source_url, source_host, tld,
+source_url, domain_host, domain_tld,
 item_count, oldest_item_date, newest_item_date,
 no_update,
 publish_status, publish_account
+4 -4
@@ -33,8 +33,8 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
 DiscoveredAt: now,
 LastCheckedAt: now,
 Status: "pass",
-SourceHost: sourceHost,
-TLD: getTLD(sourceHost),
+DomainHost: getDomainHost(sourceHost),
+DomainTLD: getTLD(sourceHost),
 ETag: headers.Get("ETag"),
 LastModified: headers.Get("Last-Modified"),
 }
@@ -90,8 +90,8 @@ func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
 DiscoveredAt: now,
 Status: "pass",
 SourceURL: normalizeURL(sourceURL),
-SourceHost: sourceHost,
-TLD: getTLD(sourceHost),
+DomainHost: getDomainHost(sourceHost),
+DomainTLD: getTLD(sourceHost),
 NextCheckAt: now, // Should be crawled immediately
 }
+2 -2
@@ -445,8 +445,8 @@ const dashboardHTML = `<!DOCTYPE html>
 <title>1440.news Feed Crawler</title>
 <meta charset="utf-8">
 <meta name="viewport" content="width=device-width, initial-scale=1">
-<link rel="stylesheet" href="/static/dashboard.css?v=1769990750">
-<script src="/static/dashboard.js?v=1769990750"></script>
+<link rel="stylesheet" href="/static/dashboard.css?v=1769995130">
+<script src="/static/dashboard.js?v=1769995130"></script>
 </head>
 <body>
 <div id="topSection">
@@ -61,6 +61,12 @@ func stripTLD(host string) string {
 return host
 }
 
+// getDomainHost extracts the host part from a full domain (without TLD)
+// e.g., "npr.org" -> "npr", "bbc.co.uk" -> "bbc.co"
+func getDomainHost(domain string) string {
+return stripTLD(domain)
+}
+
 // fullHost reconstructs the full hostname from host and tld
 // e.g., ("example", "com") -> "example.com"
 func fullHost(host, tld string) string {
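A quick usage sketch of the helper pair this migration leans on (getTLD and stripTLD are the existing helpers in this file; the expected values follow the comments above, which strip only the final label):

    host := "bbc.co.uk"
    domainHost := getDomainHost(host)       // "bbc.co"
    domainTLD := getTLD(host)               // "uk"
    full := fullHost(domainHost, domainTLD) // "bbc.co.uk" — a lossless round trip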