Refactor large Go files into focused modules

Split dashboard.go (3,528 lines) into:
- routes.go: HTTP route registration
- api_domains.go: Domain API handlers
- api_feeds.go: Feed API handlers
- api_publish.go: Publishing API handlers
- api_search.go: Search API handlers
- templates.go: HTML templates
- dashboard.go: Stats functions only (235 lines)

Split publisher.go (1,502 lines) into:
- pds_auth.go: Authentication and account management
- pds_records.go: Record operations (upload, update, delete)
- handle.go: Handle derivation from feed URLs
- image.go: Image processing and favicon fetching
- publisher.go: Core types and PublishItem (439 lines)

Split feed.go (1,137 lines) into:
- item.go: Item struct and DB operations
- feed_check.go: Feed checking and processing
- feed.go: Feed struct and DB operations (565 lines)

Also increases the domain import batch size from 1k to 100k.
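
For orientation, a rough sketch of how the extracted handlers could be registered (handler names are from this commit; the mux paths are illustrative assumptions, not the actual routes.go contents):

    mux := http.NewServeMux()
    mux.HandleFunc("/api/domains", c.handleAPIDomains) // api_domains.go
    mux.HandleFunc("/api/feeds", c.handleAPIFeeds)     // api_feeds.go
    mux.HandleFunc("/api/search", c.handleAPISearch)   // api_search.go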

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
primal
2026-01-29 22:25:02 -05:00
parent 3999e96f26
commit 1066f42189
17 changed files with 5106 additions and 4957 deletions
+764
@@ -0,0 +1,764 @@
package main
import (
"encoding/json"
"fmt"
"net/http"
"strings"
"github.com/jackc/pgx/v5"
)
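// handleAPIAllDomains serves paginated domain stats from the in-memory cache (limit capped at 100)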
func (c *Crawler) handleAPIAllDomains(w http.ResponseWriter, r *http.Request) {
offset := 0
limit := 100
if o := r.URL.Query().Get("offset"); o != "" {
fmt.Sscanf(o, "%d", &offset)
}
if l := r.URL.Query().Get("limit"); l != "" {
fmt.Sscanf(l, "%d", &limit)
if limit > 100 {
limit = 100
}
}
// Serve from cache (updated once per minute in background)
c.statsMu.RLock()
cached := c.cachedAllDomains
c.statsMu.RUnlock()
var domains []DomainStat
if cached != nil && offset < len(cached) {
end := offset + limit
if end > len(cached) {
end = len(cached)
}
domains = cached[offset:end]
}
if domains == nil {
domains = []DomainStat{}
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(domains)
}
// handleAPIDomains lists domains with optional status filter, including their feeds
func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
status := r.URL.Query().Get("status")
hasFeeds := r.URL.Query().Get("has_feeds") == "true"
limit := 100
offset := 0
if l := r.URL.Query().Get("limit"); l != "" {
fmt.Sscanf(l, "%d", &limit)
if limit > 500 {
limit = 500
}
}
if o := r.URL.Query().Get("offset"); o != "" {
fmt.Sscanf(o, "%d", &offset)
}
// First get domains
var rows pgx.Rows
var err error
if hasFeeds {
// Only domains with feeds
if status != "" {
rows, err = c.db.Query(`
SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
FROM domains d
INNER JOIN (
SELECT source_host, COUNT(*) as feed_count
FROM feeds
GROUP BY source_host
) f ON d.host = f.source_host
WHERE d.status = $1
ORDER BY d.tld ASC, d.host ASC
LIMIT $2 OFFSET $3
`, status, limit, offset)
} else {
rows, err = c.db.Query(`
SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
FROM domains d
INNER JOIN (
SELECT source_host, COUNT(*) as feed_count
FROM feeds
GROUP BY source_host
) f ON d.host = f.source_host
ORDER BY d.tld ASC, d.host ASC
LIMIT $1 OFFSET $2
`, limit, offset)
}
} else if status != "" {
rows, err = c.db.Query(`
SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
FROM domains d
LEFT JOIN (
SELECT source_host, COUNT(*) as feed_count
FROM feeds
GROUP BY source_host
) f ON d.host = f.source_host
WHERE d.status = $1
ORDER BY d.tld ASC, d.host ASC
LIMIT $2 OFFSET $3
`, status, limit, offset)
} else {
rows, err = c.db.Query(`
SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
FROM domains d
LEFT JOIN (
SELECT source_host, COUNT(*) as feed_count
FROM feeds
GROUP BY source_host
) f ON d.host = f.source_host
ORDER BY d.tld ASC, d.host ASC
LIMIT $1 OFFSET $2
`, limit, offset)
}
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
defer rows.Close()
type FeedInfo struct {
URL string `json:"url"`
Title string `json:"title,omitempty"`
Type string `json:"type,omitempty"`
Status string `json:"status,omitempty"`
PublishStatus string `json:"publish_status,omitempty"`
}
type DomainInfo struct {
Host string `json:"host"`
TLD string `json:"tld"`
Status string `json:"status"`
LastError string `json:"last_error,omitempty"`
FeedCount int `json:"feed_count"`
Feeds []FeedInfo `json:"feeds,omitempty"`
}
var domains []DomainInfo
var hosts []string
for rows.Next() {
var d DomainInfo
var tld, lastError *string
if err := rows.Scan(&d.Host, &tld, &d.Status, &lastError, &d.FeedCount); err != nil {
continue
}
d.TLD = StringValue(tld)
d.LastError = StringValue(lastError)
domains = append(domains, d)
hosts = append(hosts, d.Host)
}
// Now get feeds for these domains
if len(hosts) > 0 {
feedRows, err := c.db.Query(`
SELECT source_host, url, title, type, status, publish_status
FROM feeds
WHERE source_host = ANY($1)
ORDER BY source_host, url
`, hosts)
if err == nil {
defer feedRows.Close()
feedsByHost := make(map[string][]FeedInfo)
for feedRows.Next() {
var host string
var f FeedInfo
var title, feedType, status, publishStatus *string
if err := feedRows.Scan(&host, &f.URL, &title, &feedType, &status, &publishStatus); err != nil {
continue
}
f.Title = StringValue(title)
f.Type = StringValue(feedType)
f.Status = StringValue(status)
f.PublishStatus = StringValue(publishStatus)
feedsByHost[host] = append(feedsByHost[host], f)
}
// Attach feeds to domains
for i := range domains {
if feeds, ok := feedsByHost[domains[i].Host]; ok {
domains[i].Feeds = feeds
}
}
}
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(domains)
}
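// handleAPIDomainsByStatus lists domains with a given status, including their feed counts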
func (c *Crawler) handleAPIDomainsByStatus(w http.ResponseWriter, r *http.Request) {
status := r.URL.Query().Get("status")
if status == "" {
http.Error(w, "status parameter required", http.StatusBadRequest)
return
}
limit := 100
offset := 0
if l := r.URL.Query().Get("limit"); l != "" {
fmt.Sscanf(l, "%d", &limit)
if limit > 500 {
limit = 500
}
}
if o := r.URL.Query().Get("offset"); o != "" {
fmt.Sscanf(o, "%d", &offset)
}
rows, err := c.db.Query(`
SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
FROM domains d
LEFT JOIN (
SELECT source_host, COUNT(*) as feed_count
FROM feeds
GROUP BY source_host
) f ON d.host = f.source_host
WHERE d.status = $1
ORDER BY d.tld ASC, d.host ASC
LIMIT $2 OFFSET $3
`, status, limit, offset)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
defer rows.Close()
type DomainInfo struct {
Host string `json:"host"`
TLD string `json:"tld"`
Status string `json:"status"`
LastError string `json:"last_error,omitempty"`
FeedCount int `json:"feed_count"`
}
var domains []DomainInfo
for rows.Next() {
var d DomainInfo
var tld, lastError *string
if err := rows.Scan(&d.Host, &tld, &d.Status, &lastError, &d.FeedCount); err != nil {
continue
}
d.TLD = StringValue(tld)
d.LastError = StringValue(lastError)
domains = append(domains, d)
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(domains)
}
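// handleAPIDomainFeeds lists the feeds discovered on a single host, with pagination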
func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) {
host := r.URL.Query().Get("host")
if host == "" {
http.Error(w, "host parameter required", http.StatusBadRequest)
return
}
limit := 100
offset := 0
if l := r.URL.Query().Get("limit"); l != "" {
fmt.Sscanf(l, "%d", &limit)
if limit > 500 {
limit = 500
}
}
if o := r.URL.Query().Get("offset"); o != "" {
fmt.Sscanf(o, "%d", &offset)
}
rows, err := c.db.Query(`
SELECT url, title, type, status, error_count, last_error, item_count, publish_status, language
FROM feeds
WHERE source_host = $1
ORDER BY url ASC
LIMIT $2 OFFSET $3
`, host, limit, offset)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
defer rows.Close()
type FeedInfo struct {
URL string `json:"url"`
Title string `json:"title"`
Type string `json:"type"`
Status string `json:"status,omitempty"`
ErrorCount int `json:"error_count,omitempty"`
LastError string `json:"last_error,omitempty"`
ItemCount int `json:"item_count,omitempty"`
PublishStatus string `json:"publish_status,omitempty"`
Language string `json:"language,omitempty"`
}
var feeds []FeedInfo
for rows.Next() {
var f FeedInfo
var title, status, lastError, publishStatus, language *string
var errorCount, itemCount *int
if err := rows.Scan(&f.URL, &title, &f.Type, &status, &errorCount, &lastError, &itemCount, &publishStatus, &language); err != nil {
continue
}
f.Title = StringValue(title)
f.Status = StringValue(status)
f.LastError = StringValue(lastError)
f.PublishStatus = StringValue(publishStatus)
f.Language = StringValue(language)
if errorCount != nil {
f.ErrorCount = *errorCount
}
if itemCount != nil {
f.ItemCount = *itemCount
}
feeds = append(feeds, f)
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(feeds)
}
// handleAPISetDomainStatus sets the status for a domain
// status must be 'hold', 'pass', 'skip', or 'fail'
func (c *Crawler) handleAPISetDomainStatus(w http.ResponseWriter, r *http.Request) {
host := r.URL.Query().Get("host")
status := r.URL.Query().Get("status")
if host == "" {
http.Error(w, "host parameter required", http.StatusBadRequest)
return
}
if status != "hold" && status != "pass" && status != "skip" && status != "fail" {
http.Error(w, "status must be 'hold', 'pass', 'skip', or 'fail'", http.StatusBadRequest)
return
}
host = normalizeHost(host)
// When setting to pass, clear any last_error
var err error
if status == "pass" {
_, err = c.db.Exec(`
UPDATE domains SET status = $1, last_error = NULL
WHERE host = $2
`, status, host)
} else {
_, err = c.db.Exec(`
UPDATE domains SET status = $1
WHERE host = $2
`, status, host)
}
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(map[string]string{
"host": host,
"status": status,
})
}
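// handleAPIRevisitDomain resets a domain to 'pass' and clears its crawl state so it will be revisited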
func (c *Crawler) handleAPIRevisitDomain(w http.ResponseWriter, r *http.Request) {
host := r.URL.Query().Get("host")
if host == "" {
http.Error(w, "host parameter required", http.StatusBadRequest)
return
}
_, err := c.db.Exec(`
UPDATE domains SET status = 'pass', last_checked_at = NULL, last_crawled_at = NULL, last_error = NULL
WHERE host = $1
`, host)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(map[string]string{"status": "queued", "host": host})
}
// handleAPIPriorityCrawl immediately crawls a domain (adds it if not exists)
func (c *Crawler) handleAPIPriorityCrawl(w http.ResponseWriter, r *http.Request) {
host := r.URL.Query().Get("host")
if host == "" {
http.Error(w, "host parameter required", http.StatusBadRequest)
return
}
host = normalizeHost(host)
// Add domain if it doesn't exist, or reset to pass for crawling
_, err := c.db.Exec(`
INSERT INTO domains (host, status, discovered_at, tld)
VALUES ($1, 'pass', NOW(), $2)
ON CONFLICT(host) DO UPDATE SET status = 'pass', last_checked_at = NULL, last_crawled_at = NULL, last_error = NULL
`, host, getTLD(host))
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
// Crawl synchronously
fmt.Printf("Priority crawl: %s\n", host)
feedsFound, crawlErr := c.crawlHost(host)
errStr := ""
if crawlErr != nil {
errStr = crawlErr.Error()
}
// Mark as crawled
c.markDomainCrawled(host, feedsFound, errStr)
// Get the feeds we found
feeds, _ := c.GetFeedsByHost(host)
type FeedSummary struct {
URL string `json:"url"`
Title string `json:"title"`
Type string `json:"type"`
Category string `json:"category"`
Status string `json:"status"`
}
var feedSummaries []FeedSummary
for _, f := range feeds {
feedSummaries = append(feedSummaries, FeedSummary{
URL: f.URL,
Title: f.Title,
Type: f.Type,
Category: f.Category,
Status: f.Status,
})
}
result := map[string]interface{}{
"host": host,
"feeds_found": feedsFound,
"feeds": feedSummaries,
}
if crawlErr != nil {
result["error"] = crawlErr.Error()
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(result)
}
// handleAPIFilter handles flexible filtering with stackable parameters
func (c *Crawler) handleAPIFilter(w http.ResponseWriter, r *http.Request) {
tld := r.URL.Query().Get("tld")
domain := r.URL.Query().Get("domain")
feedStatus := r.URL.Query().Get("feedStatus")
domainStatus := r.URL.Query().Get("domainStatus")
languages := r.URL.Query().Get("languages") // comma-separated list
show := r.URL.Query().Get("show") // "feeds" or "domains"
sort := r.URL.Query().Get("sort") // "alpha" or "feeds"
limit := 100
offset := 0
if l := r.URL.Query().Get("limit"); l != "" {
fmt.Sscanf(l, "%d", &limit)
if limit > 500 {
limit = 500
}
}
if o := r.URL.Query().Get("offset"); o != "" {
fmt.Sscanf(o, "%d", &offset)
}
// Parse languages into slice
var langList []string
if languages != "" {
for _, lang := range strings.Split(languages, ",") {
lang = strings.TrimSpace(lang)
if lang != "" {
langList = append(langList, lang)
}
}
}
// Determine what to show based on filters
if show == "" {
if feedStatus != "" || domain != "" || len(langList) > 0 {
show = "feeds"
} else {
show = "domains"
}
}
if show == "feeds" {
c.filterFeeds(w, tld, domain, feedStatus, langList, limit, offset)
} else {
c.filterDomains(w, tld, domainStatus, sort, limit, offset)
}
}
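// filterDomains lists domains matching the optional tld and status filters, sorted alphabetically or by feed count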
func (c *Crawler) filterDomains(w http.ResponseWriter, tld, status, sort string, limit, offset int) {
var args []interface{}
argNum := 1
query := `
SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
FROM domains d
LEFT JOIN (
SELECT source_host, COUNT(*) as feed_count
FROM feeds
GROUP BY source_host
) f ON d.host = f.source_host
WHERE 1=1`
if tld != "" {
query += fmt.Sprintf(" AND d.tld = $%d", argNum)
args = append(args, tld)
argNum++
}
if status != "" {
query += fmt.Sprintf(" AND d.status = $%d", argNum)
args = append(args, status)
argNum++
}
// Sort by feed count descending or alphabetically
if sort == "feeds" {
query += fmt.Sprintf(" ORDER BY feed_count DESC, d.host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1)
} else {
query += fmt.Sprintf(" ORDER BY d.tld ASC, d.host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1)
}
args = append(args, limit, offset)
rows, err := c.db.Query(query, args...)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
defer rows.Close()
type DomainInfo struct {
Host string `json:"host"`
TLD string `json:"tld"`
Status string `json:"status"`
LastError string `json:"last_error,omitempty"`
FeedCount int `json:"feed_count"`
}
var domains []DomainInfo
for rows.Next() {
var d DomainInfo
var tldVal, lastError *string
if err := rows.Scan(&d.Host, &tldVal, &d.Status, &lastError, &d.FeedCount); err != nil {
continue
}
d.TLD = StringValue(tldVal)
d.LastError = StringValue(lastError)
domains = append(domains, d)
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(map[string]interface{}{
"type": "domains",
"data": domains,
})
}
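// handleAPITLDDomains lists domains under a given TLD with their feed counts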
func (c *Crawler) handleAPITLDDomains(w http.ResponseWriter, r *http.Request) {
tld := r.URL.Query().Get("tld")
if tld == "" {
http.Error(w, "tld parameter required", http.StatusBadRequest)
return
}
limit := 100
offset := 0
if l := r.URL.Query().Get("limit"); l != "" {
fmt.Sscanf(l, "%d", &limit)
if limit > 500 {
limit = 500
}
}
if o := r.URL.Query().Get("offset"); o != "" {
fmt.Sscanf(o, "%d", &offset)
}
rows, err := c.db.Query(`
SELECT d.host, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
FROM domains d
LEFT JOIN (
SELECT source_host, COUNT(*) as feed_count
FROM feeds
GROUP BY source_host
) f ON d.host = f.source_host
WHERE d.tld = $1
ORDER BY d.tld ASC, d.host ASC
LIMIT $2 OFFSET $3
`, tld, limit, offset)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
defer rows.Close()
type DomainInfo struct {
Host string `json:"host"`
Status string `json:"status"`
LastError string `json:"last_error,omitempty"`
FeedCount int `json:"feed_count"`
}
var domains []DomainInfo
for rows.Next() {
var d DomainInfo
var lastError *string
if err := rows.Scan(&d.Host, &d.Status, &lastError, &d.FeedCount); err != nil {
continue
}
d.LastError = StringValue(lastError)
domains = append(domains, d)
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(domains)
}
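// handleAPITLDs lists TLDs with domain counts, optionally limited to TLDs whose domains have feeds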
func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
hasFeeds := r.URL.Query().Get("has_feeds") == "true"
var rows pgx.Rows
var err error
if hasFeeds {
// Only TLDs that have domains with feeds
rows, err = c.db.Query(`
SELECT DISTINCT d.tld, COUNT(DISTINCT d.host) as domain_count
FROM domains d
INNER JOIN feeds f ON d.host = f.source_host
WHERE d.tld IS NOT NULL AND d.tld != ''
GROUP BY d.tld
ORDER BY d.tld ASC
`)
} else {
// All TLDs
rows, err = c.db.Query(`
SELECT tld, COUNT(*) as domain_count
FROM domains
WHERE tld IS NOT NULL AND tld != ''
GROUP BY tld
ORDER BY tld ASC
`)
}
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
defer rows.Close()
type TLDInfo struct {
TLD string `json:"tld"`
DomainCount int `json:"domain_count"`
}
var tlds []TLDInfo
for rows.Next() {
var t TLDInfo
if err := rows.Scan(&t.TLD, &t.DomainCount); err != nil {
continue
}
tlds = append(tlds, t)
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(tlds)
}
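// handleAPITLDStats returns domain and feed counts for a single TLD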
func (c *Crawler) handleAPITLDStats(w http.ResponseWriter, r *http.Request) {
tld := r.URL.Query().Get("tld")
if tld == "" {
http.Error(w, "tld parameter required", http.StatusBadRequest)
return
}
var domainCount, feedCount int
err := c.db.QueryRow(`SELECT COUNT(*) FROM domains WHERE tld = $1`, tld).Scan(&domainCount)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
err = c.db.QueryRow(`SELECT COUNT(*) FROM feeds WHERE tld = $1`, tld).Scan(&feedCount)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(map[string]interface{}{
"tld": tld,
"domain_count": domainCount,
"feed_count": feedCount,
})
}
// handleAPIDenyDomain skips a domain and all its feeds
func (c *Crawler) handleAPIDenyDomain(w http.ResponseWriter, r *http.Request) {
host := r.URL.Query().Get("host")
if host == "" {
http.Error(w, "host parameter required", http.StatusBadRequest)
return
}
// Update domain status to skip
_, err := c.db.Exec(`UPDATE domains SET status = 'skip' WHERE host = $1`, host)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
// Skip all feeds from this domain
feedsAffected, err := c.db.Exec(`UPDATE feeds SET publish_status = 'skip', status = 'dead' WHERE source_host = $1`, host)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(map[string]interface{}{
"success": true,
"host": host,
"feeds_skipped": feedsAffected,
})
}
// handleAPIUndenyDomain removes skip status from a domain
func (c *Crawler) handleAPIUndenyDomain(w http.ResponseWriter, r *http.Request) {
host := r.URL.Query().Get("host")
if host == "" {
http.Error(w, "host parameter required", http.StatusBadRequest)
return
}
// Update domain status back to pass
_, err := c.db.Exec(`UPDATE domains SET status = 'pass' WHERE host = $1 AND status = 'skip'`, host)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
// Restore feeds to hold status and active
feedsRestored, err := c.db.Exec(`UPDATE feeds SET publish_status = 'hold', status = 'active' WHERE source_host = $1 AND status = 'dead'`, host)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(map[string]interface{}{
"success": true,
"host": host,
"feeds_restored": feedsRestored,
})
}
+504
@@ -0,0 +1,504 @@
package main
import (
"encoding/json"
"fmt"
"net/http"
"strings"
"time"
"github.com/jackc/pgx/v5"
)
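// handleAPIFeedInfo returns full stored metadata for a single feed looked up by url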
func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) {
feedURL := r.URL.Query().Get("url")
if feedURL == "" {
http.Error(w, "url parameter required", http.StatusBadRequest)
return
}
type FeedDetails struct {
URL string `json:"url"`
Type string `json:"type,omitempty"`
Title string `json:"title,omitempty"`
Description string `json:"description,omitempty"`
Language string `json:"language,omitempty"`
SiteURL string `json:"siteUrl,omitempty"`
DiscoveredAt string `json:"discoveredAt,omitempty"`
LastCrawledAt string `json:"lastCrawledAt,omitempty"`
LastBuildDate string `json:"lastBuildDate,omitempty"`
TTLMinutes int `json:"ttlMinutes,omitempty"`
UpdatePeriod string `json:"updatePeriod,omitempty"`
UpdateFreq int `json:"updateFreq,omitempty"`
Status string `json:"status,omitempty"`
ErrorCount int `json:"errorCount,omitempty"`
LastError string `json:"lastError,omitempty"`
ItemCount int `json:"itemCount,omitempty"`
AvgPostFreqHrs float64 `json:"avgPostFreqHrs,omitempty"`
OldestItemDate string `json:"oldestItemDate,omitempty"`
NewestItemDate string `json:"newestItemDate,omitempty"`
PublishStatus string `json:"publishStatus,omitempty"`
PublishAccount string `json:"publishAccount,omitempty"`
}
var f FeedDetails
var title, description, language, siteUrl *string
var lastCrawledAt, lastBuildDate *time.Time
var updatePeriod, status, lastError *string
var oldestItemDate, newestItemDate *time.Time
var ttlMinutes, updateFreq, errorCount, itemCount *int
var avgPostFreqHrs *float64
var discoveredAt time.Time
var publishStatus, publishAccount *string
err := c.db.QueryRow(`
SELECT url, type, title, description, language, site_url,
discovered_at, last_crawled_at, last_build_date,
ttl_minutes, update_period, update_freq,
status, error_count, last_error,
item_count, avg_post_freq_hrs, oldest_item_date, newest_item_date,
publish_status, publish_account
FROM feeds WHERE url = $1
`, feedURL).Scan(
&f.URL, &f.Type, &title, &description, &language, &siteUrl,
&discoveredAt, &lastCrawledAt, &lastBuildDate,
&ttlMinutes, &updatePeriod, &updateFreq,
&status, &errorCount, &lastError,
&itemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
&publishStatus, &publishAccount,
)
if err == pgx.ErrNoRows {
http.Error(w, "feed not found", http.StatusNotFound)
return
}
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
f.Title = StringValue(title)
f.Description = StringValue(description)
f.Language = StringValue(language)
f.SiteURL = StringValue(siteUrl)
f.DiscoveredAt = discoveredAt.Format(time.RFC3339)
if lastCrawledAt != nil {
f.LastCrawledAt = lastCrawledAt.Format(time.RFC3339)
}
if lastBuildDate != nil {
f.LastBuildDate = lastBuildDate.Format(time.RFC3339)
}
if ttlMinutes != nil {
f.TTLMinutes = *ttlMinutes
}
f.UpdatePeriod = StringValue(updatePeriod)
if updateFreq != nil {
f.UpdateFreq = *updateFreq
}
f.Status = StringValue(status)
if errorCount != nil {
f.ErrorCount = *errorCount
}
f.LastError = StringValue(lastError)
if itemCount != nil {
f.ItemCount = *itemCount
}
if avgPostFreqHrs != nil {
f.AvgPostFreqHrs = *avgPostFreqHrs
}
if oldestItemDate != nil {
f.OldestItemDate = oldestItemDate.Format(time.RFC3339)
}
if newestItemDate != nil {
f.NewestItemDate = newestItemDate.Format(time.RFC3339)
}
f.PublishStatus = StringValue(publishStatus)
f.PublishAccount = StringValue(publishAccount)
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(f)
}
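// handleAPIFeedItems returns recent items for a feed (default 50, capped at 100)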
func (c *Crawler) handleAPIFeedItems(w http.ResponseWriter, r *http.Request) {
feedURL := r.URL.Query().Get("url")
if feedURL == "" {
http.Error(w, "url parameter required", http.StatusBadRequest)
return
}
limit := 50
if l := r.URL.Query().Get("limit"); l != "" {
fmt.Sscanf(l, "%d", &limit)
if limit > 100 {
limit = 100
}
}
items, err := c.GetItemsByFeed(feedURL, limit)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
if items == nil {
items = []*Item{}
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(items)
}
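// handleAPIFeedsByStatus lists feeds with a given crawl status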
func (c *Crawler) handleAPIFeedsByStatus(w http.ResponseWriter, r *http.Request) {
status := r.URL.Query().Get("status")
if status == "" {
http.Error(w, "status parameter required", http.StatusBadRequest)
return
}
limit := 100
offset := 0
if l := r.URL.Query().Get("limit"); l != "" {
fmt.Sscanf(l, "%d", &limit)
if limit > 500 {
limit = 500
}
}
if o := r.URL.Query().Get("offset"); o != "" {
fmt.Sscanf(o, "%d", &offset)
}
rows, err := c.db.Query(`
SELECT url, title, type, source_host, tld, status, error_count, last_error, item_count
FROM feeds
WHERE status = $1
ORDER BY url ASC
LIMIT $2 OFFSET $3
`, status, limit, offset)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
defer rows.Close()
type FeedInfo struct {
URL string `json:"url"`
Title string `json:"title,omitempty"`
Type string `json:"type"`
SourceHost string `json:"source_host"`
TLD string `json:"tld"`
Status string `json:"status"`
ErrorCount int `json:"error_count,omitempty"`
LastError string `json:"last_error,omitempty"`
ItemCount int `json:"item_count,omitempty"`
}
var feeds []FeedInfo
for rows.Next() {
var f FeedInfo
var title, sourceHost, tld, lastError *string
var errorCount, itemCount *int
if err := rows.Scan(&f.URL, &title, &f.Type, &sourceHost, &tld, &f.Status, &errorCount, &lastError, &itemCount); err != nil {
continue
}
f.Title = StringValue(title)
f.SourceHost = StringValue(sourceHost)
f.TLD = StringValue(tld)
f.LastError = StringValue(lastError)
if errorCount != nil {
f.ErrorCount = *errorCount
}
if itemCount != nil {
f.ItemCount = *itemCount
}
feeds = append(feeds, f)
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(feeds)
}
// handleAPIFeeds lists feeds with optional publish_status filter
func (c *Crawler) handleAPIFeeds(w http.ResponseWriter, r *http.Request) {
publishStatus := r.URL.Query().Get("publish_status")
limit := 100
offset := 0
if l := r.URL.Query().Get("limit"); l != "" {
fmt.Sscanf(l, "%d", &limit)
if limit > 500 {
limit = 500
}
}
if o := r.URL.Query().Get("offset"); o != "" {
fmt.Sscanf(o, "%d", &offset)
}
var rows pgx.Rows
var err error
if publishStatus != "" {
rows, err = c.db.Query(`
SELECT url, title, type, source_host, tld, status, error_count, last_error, item_count, publish_status, language
FROM feeds
WHERE publish_status = $1
ORDER BY url ASC
LIMIT $2 OFFSET $3
`, publishStatus, limit, offset)
} else {
rows, err = c.db.Query(`
SELECT url, title, type, source_host, tld, status, error_count, last_error, item_count, publish_status, language
FROM feeds
ORDER BY url ASC
LIMIT $1 OFFSET $2
`, limit, offset)
}
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
defer rows.Close()
type FeedInfo struct {
URL string `json:"url"`
Title string `json:"title,omitempty"`
Type string `json:"type"`
SourceHost string `json:"source_host"`
TLD string `json:"tld"`
Status string `json:"status"`
ErrorCount int `json:"error_count,omitempty"`
LastError string `json:"last_error,omitempty"`
ItemCount int `json:"item_count,omitempty"`
PublishStatus string `json:"publish_status,omitempty"`
Language string `json:"language,omitempty"`
}
var feeds []FeedInfo
for rows.Next() {
var f FeedInfo
var title, sourceHost, tld, lastError, publishStatus, language *string
var errorCount, itemCount *int
if err := rows.Scan(&f.URL, &title, &f.Type, &sourceHost, &tld, &f.Status, &errorCount, &lastError, &itemCount, &publishStatus, &language); err != nil {
continue
}
f.Title = StringValue(title)
f.SourceHost = StringValue(sourceHost)
f.TLD = StringValue(tld)
f.LastError = StringValue(lastError)
f.PublishStatus = StringValue(publishStatus)
f.Language = StringValue(language)
if errorCount != nil {
f.ErrorCount = *errorCount
}
if itemCount != nil {
f.ItemCount = *itemCount
}
feeds = append(feeds, f)
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(feeds)
}
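// filterFeeds lists feeds matching the optional tld, domain, status, and language filters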
func (c *Crawler) filterFeeds(w http.ResponseWriter, tld, domain, status string, languages []string, limit, offset int) {
var args []interface{}
argNum := 1
query := `
SELECT url, title, type, category, source_host, tld, status, error_count, last_error, item_count, language
FROM feeds
WHERE 1=1`
if tld != "" {
query += fmt.Sprintf(" AND tld = $%d", argNum)
args = append(args, tld)
argNum++
}
if domain != "" {
query += fmt.Sprintf(" AND source_host = $%d", argNum)
args = append(args, domain)
argNum++
}
if status != "" {
query += fmt.Sprintf(" AND status = $%d", argNum)
args = append(args, status)
argNum++
}
if len(languages) > 0 {
// Build IN clause for languages, handling 'unknown' as empty string
placeholders := make([]string, len(languages))
for i, lang := range languages {
placeholders[i] = fmt.Sprintf("$%d", argNum)
if lang == "unknown" {
args = append(args, "")
} else {
args = append(args, lang)
}
argNum++
}
query += fmt.Sprintf(" AND COALESCE(language, '') IN (%s)", strings.Join(placeholders, ","))
}
query += fmt.Sprintf(" ORDER BY url ASC LIMIT $%d OFFSET $%d", argNum, argNum+1)
args = append(args, limit, offset)
rows, err := c.db.Query(query, args...)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
defer rows.Close()
type FeedInfo struct {
URL string `json:"url"`
Title string `json:"title,omitempty"`
Type string `json:"type"`
Category string `json:"category"`
SourceHost string `json:"source_host"`
TLD string `json:"tld"`
Status string `json:"status"`
ErrorCount int `json:"error_count,omitempty"`
LastError string `json:"last_error,omitempty"`
ItemCount int `json:"item_count,omitempty"`
Language string `json:"language,omitempty"`
}
var feeds []FeedInfo
for rows.Next() {
var f FeedInfo
var title, category, sourceHost, tldVal, lastError, language *string
var errorCount, itemCount *int
if err := rows.Scan(&f.URL, &title, &f.Type, &category, &sourceHost, &tldVal, &f.Status, &errorCount, &lastError, &itemCount, &language); err != nil {
continue
}
f.Title = StringValue(title)
if category != nil && *category != "" {
f.Category = *category
} else {
f.Category = "main"
}
f.SourceHost = StringValue(sourceHost)
f.TLD = StringValue(tldVal)
f.LastError = StringValue(lastError)
if errorCount != nil {
f.ErrorCount = *errorCount
}
if itemCount != nil {
f.ItemCount = *itemCount
}
f.Language = StringValue(language)
feeds = append(feeds, f)
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(map[string]interface{}{
"type": "feeds",
"data": feeds,
})
}
// handleAPICheckFeed immediately checks a feed and returns items
func (c *Crawler) handleAPICheckFeed(w http.ResponseWriter, r *http.Request) {
feedURL := r.URL.Query().Get("url")
if feedURL == "" {
http.Error(w, "url parameter required", http.StatusBadRequest)
return
}
force := r.URL.Query().Get("force") == "true"
feedURL = normalizeURL(feedURL)
// Get the feed
feed, err := c.getFeed(feedURL)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
if feed == nil {
http.Error(w, "feed not found", http.StatusNotFound)
return
}
// Clear cache headers if force is requested
if force {
feed.ETag = ""
feed.LastModified = ""
}
// Force check the feed
fmt.Printf("Force check feed: %s (force=%v)\n", feedURL, force)
changed, checkErr := c.CheckFeed(feed)
// Get updated feed info
feed, _ = c.getFeed(feedURL)
// Get items
items, _ := c.GetItemsByFeed(feedURL, 20)
type ItemSummary struct {
Title string `json:"title"`
Link string `json:"link"`
PubDate string `json:"pub_date,omitempty"`
Author string `json:"author,omitempty"`
}
var itemSummaries []ItemSummary
for _, item := range items {
is := ItemSummary{
Title: item.Title,
Link: item.Link,
Author: item.Author,
}
if !item.PubDate.IsZero() {
is.PubDate = item.PubDate.Format("2006-01-02 15:04")
}
itemSummaries = append(itemSummaries, is)
}
result := map[string]interface{}{
"url": feedURL,
"title": feed.Title,
"type": feed.Type,
"category": feed.Category,
"status": feed.Status,
"changed": changed,
"itemCount": feed.ItemCount,
"items": itemSummaries,
}
if checkErr != nil {
result["error"] = checkErr.Error()
}
if feed.LastError != "" {
result["lastError"] = feed.LastError
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(result)
}
// handleAPILanguages returns distinct languages with counts
func (c *Crawler) handleAPILanguages(w http.ResponseWriter, r *http.Request) {
rows, err := c.db.Query(`
SELECT COALESCE(NULLIF(language, ''), 'unknown') as lang, COUNT(*) as cnt
FROM feeds
GROUP BY lang
ORDER BY cnt DESC
`)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
defer rows.Close()
type LangInfo struct {
Language string `json:"language"`
Count int `json:"count"`
}
var languages []LangInfo
for rows.Next() {
var l LangInfo
if err := rows.Scan(&l.Language, &l.Count); err != nil {
continue
}
languages = append(languages, l)
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(languages)
}
+1026
File diff suppressed because it is too large
+349
@@ -0,0 +1,349 @@
package main
import (
"encoding/json"
"fmt"
"net/http"
"time"
"github.com/jackc/pgx/v5"
)
// SearchResult represents a search result with feed and matching items
type SearchResult struct {
Feed SearchFeed `json:"feed"`
Items []SearchItem `json:"items"`
}
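// SearchFeed holds the feed metadata included in a search result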
type SearchFeed struct {
URL string `json:"url"`
Type string `json:"type"`
Category string `json:"category"`
Title string `json:"title"`
Description string `json:"description"`
Language string `json:"language"`
SiteURL string `json:"site_url"`
DiscoveredAt string `json:"discovered_at"`
LastCrawledAt string `json:"last_crawled_at"`
NextCrawlAt string `json:"next_crawl_at"`
LastBuildDate string `json:"last_build_date"`
TTLMinutes int `json:"ttl_minutes"`
UpdatePeriod string `json:"update_period"`
UpdateFreq int `json:"update_freq"`
Status string `json:"status"`
ErrorCount int `json:"error_count"`
LastError string `json:"last_error"`
LastErrorAt string `json:"last_error_at"`
SourceURL string `json:"source_url"`
SourceHost string `json:"source_host"`
TLD string `json:"tld"`
ItemCount int `json:"item_count"`
AvgPostFreqHrs float64 `json:"avg_post_freq_hrs"`
OldestItemDate string `json:"oldest_item_date"`
NewestItemDate string `json:"newest_item_date"`
NoUpdate bool `json:"no_update"`
}
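// SearchItem holds an individual item included in a search result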
type SearchItem struct {
ID int64 `json:"id"`
FeedURL string `json:"feed_url"`
GUID string `json:"guid"`
Title string `json:"title"`
Link string `json:"link"`
Description string `json:"description"`
Content string `json:"content"`
Author string `json:"author"`
PubDate string `json:"pub_date"`
DiscoveredAt string `json:"discovered_at"`
UpdatedAt string `json:"updated_at"`
}
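// handleAPISearch searches feeds by host/URL match and full-text, plus items by full-text, grouping matching items under their feeds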
func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
query := r.URL.Query().Get("q")
if query == "" {
http.Error(w, "q parameter required", http.StatusBadRequest)
return
}
limit := 100
if l := r.URL.Query().Get("limit"); l != "" {
fmt.Sscanf(l, "%d", &limit)
if limit > 500 {
limit = 500
}
}
// Results map: feedURL -> SearchResult
results := make(map[string]*SearchResult)
// Helper to scan feed row into SearchFeed
scanFeed := func(rows pgx.Rows) (string, SearchFeed, bool) {
var url string
var feedType, category, title, description, language, siteUrl *string
var discoveredAt time.Time
var lastCrawledAt, nextCrawlAt, lastBuildDate *time.Time
var ttlMinutes, updateFreq, errorCount, itemCount *int
var updatePeriod, status, lastError *string
var lastErrorAt *time.Time
var sourceUrl, sourceHost, tld *string
var avgPostFreqHrs *float64
var oldestItemDate, newestItemDate *time.Time
var noUpdate *bool
if err := rows.Scan(&url, &feedType, &category, &title, &description, &language, &siteUrl,
&discoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
&ttlMinutes, &updatePeriod, &updateFreq,
&status, &errorCount, &lastError, &lastErrorAt,
&sourceUrl, &sourceHost, &tld,
&itemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate, &noUpdate); err != nil {
return "", SearchFeed{}, false
}
cat := StringValue(category)
if cat == "" {
cat = "main"
}
sf := SearchFeed{
URL: url,
Type: StringValue(feedType),
Category: cat,
Title: StringValue(title),
Description: StringValue(description),
Language: StringValue(language),
SiteURL: StringValue(siteUrl),
DiscoveredAt: discoveredAt.Format(time.RFC3339),
UpdatePeriod: StringValue(updatePeriod),
Status: StringValue(status),
LastError: StringValue(lastError),
SourceURL: StringValue(sourceUrl),
SourceHost: StringValue(sourceHost),
TLD: StringValue(tld),
}
if lastCrawledAt != nil {
sf.LastCrawledAt = lastCrawledAt.Format(time.RFC3339)
}
if nextCrawlAt != nil {
sf.NextCrawlAt = nextCrawlAt.Format(time.RFC3339)
}
if lastBuildDate != nil {
sf.LastBuildDate = lastBuildDate.Format(time.RFC3339)
}
if ttlMinutes != nil {
sf.TTLMinutes = *ttlMinutes
}
if updateFreq != nil {
sf.UpdateFreq = *updateFreq
}
if errorCount != nil {
sf.ErrorCount = *errorCount
}
if lastErrorAt != nil {
sf.LastErrorAt = lastErrorAt.Format(time.RFC3339)
}
if itemCount != nil {
sf.ItemCount = *itemCount
}
if avgPostFreqHrs != nil {
sf.AvgPostFreqHrs = *avgPostFreqHrs
}
if oldestItemDate != nil {
sf.OldestItemDate = oldestItemDate.Format(time.RFC3339)
}
if newestItemDate != nil {
sf.NewestItemDate = newestItemDate.Format(time.RFC3339)
}
if noUpdate != nil {
sf.NoUpdate = *noUpdate
}
return url, sf, true
}
// Search feeds by source_host (LIKE search for domain matching)
hostRows, err := c.db.Query(`
SELECT url, type, category, title, description, language, site_url,
discovered_at, last_crawled_at, next_crawl_at, last_build_date,
ttl_minutes, update_period, update_freq,
status, error_count, last_error, last_error_at,
source_url, source_host, tld,
item_count, avg_post_freq_hrs, oldest_item_date, newest_item_date, no_update
FROM feeds
WHERE source_host ILIKE $1 OR url ILIKE $1
LIMIT $2
`, "%"+query+"%", limit)
if err == nil {
defer hostRows.Close()
for hostRows.Next() {
if url, feed, ok := scanFeed(hostRows); ok {
if _, exists := results[url]; !exists {
results[url] = &SearchResult{Feed: feed, Items: []SearchItem{}}
}
}
}
}
// Search feeds via full-text search
tsQuery := ToSearchQuery(query)
feedRows, err := c.db.Query(`
SELECT url, type, category, title, description, language, site_url,
discovered_at, last_crawled_at, next_crawl_at, last_build_date,
ttl_minutes, update_period, update_freq,
status, error_count, last_error, last_error_at,
source_url, source_host, tld,
item_count, avg_post_freq_hrs, oldest_item_date, newest_item_date, no_update
FROM feeds
WHERE search_vector @@ to_tsquery('english', $1)
LIMIT $2
`, tsQuery, limit)
if err == nil {
defer feedRows.Close()
for feedRows.Next() {
if url, feed, ok := scanFeed(feedRows); ok {
if _, exists := results[url]; !exists {
results[url] = &SearchResult{Feed: feed, Items: []SearchItem{}}
}
}
}
}
// Search items via full-text search
itemRows, err := c.db.Query(`
SELECT i.id, i.feed_url, i.guid, i.title, i.link, i.description, i.content, i.author, i.pub_date, i.discovered_at, i.updated_at
FROM items i
WHERE i.search_vector @@ to_tsquery('english', $1)
ORDER BY i.pub_date DESC
LIMIT $2
`, tsQuery, limit)
if err == nil {
defer itemRows.Close()
for itemRows.Next() {
var id int64
var feedUrl string
var guid, title, link, description, content, author *string
var pubDate, discoveredAt, updatedAt *time.Time
if err := itemRows.Scan(&id, &feedUrl, &guid, &title, &link, &description, &content, &author, &pubDate, &discoveredAt, &updatedAt); err != nil {
continue
}
item := SearchItem{
ID: id,
FeedURL: feedUrl,
GUID: StringValue(guid),
Title: StringValue(title),
Link: StringValue(link),
Description: StringValue(description),
Content: StringValue(content),
Author: StringValue(author),
}
if pubDate != nil {
item.PubDate = pubDate.Format(time.RFC3339)
}
if discoveredAt != nil {
item.DiscoveredAt = discoveredAt.Format(time.RFC3339)
}
if updatedAt != nil {
item.UpdatedAt = updatedAt.Format(time.RFC3339)
}
// Add to existing result or create new one
if result, exists := results[feedUrl]; exists {
result.Items = append(result.Items, item)
} else {
// Fetch feed info for this item's feed
var fType, fCategory, fTitle, fDesc, fLang, fSiteUrl *string
var fDiscoveredAt time.Time
var fLastCrawledAt, fNextCrawlAt, fLastBuildDate *time.Time
var fTTLMinutes, fUpdateFreq, fErrorCount, fItemCount *int
var fUpdatePeriod, fStatus, fLastError *string
var fLastErrorAt *time.Time
var fSourceUrl, fSourceHost, fTLD *string
var fAvgPostFreqHrs *float64
var fOldestItemDate, fNewestItemDate *time.Time
var fNoUpdate *bool
c.db.QueryRow(`
SELECT type, category, title, description, language, site_url,
discovered_at, last_crawled_at, next_crawl_at, last_build_date,
ttl_minutes, update_period, update_freq,
status, error_count, last_error, last_error_at,
source_url, source_host, tld,
item_count, avg_post_freq_hrs, oldest_item_date, newest_item_date, no_update
FROM feeds WHERE url = $1
`, feedUrl).Scan(&fType, &fCategory, &fTitle, &fDesc, &fLang, &fSiteUrl,
&fDiscoveredAt, &fLastCrawledAt, &fNextCrawlAt, &fLastBuildDate,
&fTTLMinutes, &fUpdatePeriod, &fUpdateFreq,
&fStatus, &fErrorCount, &fLastError, &fLastErrorAt,
&fSourceUrl, &fSourceHost, &fTLD,
&fItemCount, &fAvgPostFreqHrs, &fOldestItemDate, &fNewestItemDate, &fNoUpdate)
fCat := StringValue(fCategory)
if fCat == "" {
fCat = "main"
}
sf := SearchFeed{
URL: feedUrl,
Type: StringValue(fType),
Category: fCat,
Title: StringValue(fTitle),
Description: StringValue(fDesc),
Language: StringValue(fLang),
SiteURL: StringValue(fSiteUrl),
DiscoveredAt: fDiscoveredAt.Format(time.RFC3339),
UpdatePeriod: StringValue(fUpdatePeriod),
Status: StringValue(fStatus),
LastError: StringValue(fLastError),
SourceURL: StringValue(fSourceUrl),
SourceHost: StringValue(fSourceHost),
TLD: StringValue(fTLD),
}
if fLastCrawledAt != nil {
sf.LastCrawledAt = fLastCrawledAt.Format(time.RFC3339)
}
if fNextCrawlAt != nil {
sf.NextCrawlAt = fNextCrawlAt.Format(time.RFC3339)
}
if fLastBuildDate != nil {
sf.LastBuildDate = fLastBuildDate.Format(time.RFC3339)
}
if fTTLMinutes != nil {
sf.TTLMinutes = *fTTLMinutes
}
if fUpdateFreq != nil {
sf.UpdateFreq = *fUpdateFreq
}
if fErrorCount != nil {
sf.ErrorCount = *fErrorCount
}
if fLastErrorAt != nil {
sf.LastErrorAt = fLastErrorAt.Format(time.RFC3339)
}
if fItemCount != nil {
sf.ItemCount = *fItemCount
}
if fAvgPostFreqHrs != nil {
sf.AvgPostFreqHrs = *fAvgPostFreqHrs
}
if fOldestItemDate != nil {
sf.OldestItemDate = fOldestItemDate.Format(time.RFC3339)
}
if fNewestItemDate != nil {
sf.NewestItemDate = fNewestItemDate.Format(time.RFC3339)
}
if fNoUpdate != nil {
sf.NoUpdate = *fNoUpdate
}
results[feedUrl] = &SearchResult{
Feed: sf,
Items: []SearchItem{item},
}
}
}
}
// Convert map to slice
var resultList []SearchResult
for _, r := range results {
resultList = append(resultList, *r)
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(resultList)
}
-3293
File diff suppressed because it is too large
+2
@@ -360,6 +360,8 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
totalImported += int(imported)
atomic.AddInt32(&c.domainsImported, int32(imported))
fmt.Printf("Import batch %d: %d domains (total: %d)\n", batchCount, imported, totalImported)
// Wait 1 second before the next batch
time.Sleep(1 * time.Second)
}
-572
@@ -1,15 +1,9 @@
package main
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"sync/atomic"
"time"
"github.com/jackc/pgx/v5"
@@ -95,37 +89,6 @@ func classifyFeedByTitle(title string, currentCategory string) string {
return currentCategory
}
// Enclosure represents a media attachment (audio, video, image)
type Enclosure struct {
URL string `json:"url"`
Type string `json:"type"` // MIME type (audio/mpeg, image/jpeg, etc.)
Length int64 `json:"length"` // Size in bytes
}
// Item represents an individual entry/article from a feed
type Item struct {
ID int64 `json:"id,omitempty"`
FeedURL string `json:"feed_url"`
GUID string `json:"guid,omitempty"`
Title string `json:"title,omitempty"`
Link string `json:"link,omitempty"`
Description string `json:"description,omitempty"`
Content string `json:"content,omitempty"`
Author string `json:"author,omitempty"`
PubDate time.Time `json:"pub_date,omitempty"`
DiscoveredAt time.Time `json:"discovered_at"`
UpdatedAt time.Time `json:"updated_at,omitempty"`
// Media attachments
Enclosure *Enclosure `json:"enclosure,omitempty"` // Primary enclosure (podcast audio, etc.)
ImageURLs []string `json:"image_urls,omitempty"` // Image URLs extracted from content
Tags []string `json:"tags,omitempty"` // Category/tag strings from feed
// Publishing to PDS
PublishedAt time.Time `json:"published_at,omitempty"`
PublishedUri string `json:"published_uri,omitempty"`
}
// Feed represents a discovered RSS/Atom feed with metadata
type Feed struct {
URL string `json:"url"`
@@ -537,505 +500,6 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
return feeds, rows.Err()
}
// saveItem stores an item in PostgreSQL (upsert by feed_url + guid)
func (c *Crawler) saveItem(item *Item) error {
// Serialize enclosure fields
var enclosureUrl, enclosureType *string
var enclosureLength *int64
if item.Enclosure != nil {
enclosureUrl = NullableString(item.Enclosure.URL)
enclosureType = NullableString(item.Enclosure.Type)
if item.Enclosure.Length > 0 {
enclosureLength = &item.Enclosure.Length
}
}
// Serialize imageUrls as JSON
var imageUrlsJSON *string
if len(item.ImageURLs) > 0 {
if data, err := json.Marshal(item.ImageURLs); err == nil {
s := string(data)
imageUrlsJSON = &s
}
}
// Serialize tags as JSON
var tagsJSON *string
if len(item.Tags) > 0 {
if data, err := json.Marshal(item.Tags); err == nil {
s := string(data)
tagsJSON = &s
}
}
_, err := c.db.Exec(`
INSERT INTO items (feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
enclosure_url, enclosure_type, enclosure_length, image_urls, tags)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
ON CONFLICT(feed_url, guid) DO UPDATE SET
title = EXCLUDED.title,
link = EXCLUDED.link,
description = EXCLUDED.description,
content = EXCLUDED.content,
author = EXCLUDED.author,
pub_date = EXCLUDED.pub_date,
updated_at = EXCLUDED.updated_at,
enclosure_url = EXCLUDED.enclosure_url,
enclosure_type = EXCLUDED.enclosure_type,
enclosure_length = EXCLUDED.enclosure_length,
image_urls = EXCLUDED.image_urls,
tags = EXCLUDED.tags
`,
item.FeedURL, item.GUID, NullableString(item.Title), NullableString(item.Link),
NullableString(item.Description), NullableString(item.Content), NullableString(item.Author),
NullableTime(item.PubDate), item.DiscoveredAt, NullableTime(item.UpdatedAt),
enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON, tagsJSON,
)
return err
}
// saveItems stores multiple items efficiently
func (c *Crawler) saveItems(items []*Item) error {
if len(items) == 0 {
return nil
}
tx, err := c.db.Begin()
if err != nil {
return err
}
defer tx.Rollback(context.Background())
for _, item := range items {
if item == nil || item.GUID == "" {
continue // Skip nil items or items without GUID
}
// Serialize enclosure fields
var enclosureUrl, enclosureType *string
var enclosureLength *int64
if item.Enclosure != nil {
enclosureUrl = NullableString(item.Enclosure.URL)
enclosureType = NullableString(item.Enclosure.Type)
if item.Enclosure.Length > 0 {
enclosureLength = &item.Enclosure.Length
}
}
// Serialize imageUrls as JSON
var imageUrlsJSON *string
if len(item.ImageURLs) > 0 {
if data, err := json.Marshal(item.ImageURLs); err == nil {
s := string(data)
imageUrlsJSON = &s
}
}
// Serialize tags as JSON
var tagsJSON *string
if len(item.Tags) > 0 {
if data, err := json.Marshal(item.Tags); err == nil {
s := string(data)
tagsJSON = &s
}
}
_, err := tx.Exec(context.Background(), `
INSERT INTO items (feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
enclosure_url, enclosure_type, enclosure_length, image_urls, tags)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
ON CONFLICT(feed_url, guid) DO UPDATE SET
title = EXCLUDED.title,
link = EXCLUDED.link,
description = EXCLUDED.description,
content = EXCLUDED.content,
author = EXCLUDED.author,
pub_date = EXCLUDED.pub_date,
updated_at = EXCLUDED.updated_at,
enclosure_url = EXCLUDED.enclosure_url,
enclosure_type = EXCLUDED.enclosure_type,
enclosure_length = EXCLUDED.enclosure_length,
image_urls = EXCLUDED.image_urls,
tags = EXCLUDED.tags
`,
item.FeedURL, item.GUID, NullableString(item.Title), NullableString(item.Link),
NullableString(item.Description), NullableString(item.Content), NullableString(item.Author),
NullableTime(item.PubDate), item.DiscoveredAt, NullableTime(item.UpdatedAt),
enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON, tagsJSON,
)
if err != nil {
continue // Skip failed items
}
}
return tx.Commit(context.Background())
}
// GetItemsByFeed returns all items for a specific feed
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
rows, err := c.db.Query(`
SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
published_at, published_uri
FROM items
WHERE feed_url = $1
ORDER BY pub_date DESC
LIMIT $2
`, feedURL, limit)
if err != nil {
return nil, err
}
defer rows.Close()
return scanItems(rows)
}
// SearchItems performs a full-text search on items
func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
tsquery := ToSearchQuery(query)
rows, err := c.db.Query(`
SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
published_at, published_uri
FROM items
WHERE search_vector @@ to_tsquery('english', $1)
ORDER BY ts_rank(search_vector, to_tsquery('english', $1)) DESC, pub_date DESC
LIMIT $2
`, tsquery, limit)
if err != nil {
return nil, err
}
defer rows.Close()
return scanItems(rows)
}
// scanItems is a helper to scan multiple item rows
func scanItems(rows pgx.Rows) ([]*Item, error) {
var items []*Item
for rows.Next() {
item := &Item{}
var guid, title, link, description, content, author *string
var pubDate, updatedAt, publishedAt *time.Time
var enclosureUrl, enclosureType *string
var enclosureLength *int64
var imageUrlsJSON, tagsJSON *string
var publishedUri *string
if err := rows.Scan(
&item.ID, &item.FeedURL, &guid, &title, &link,
&description, &content, &author, &pubDate,
&item.DiscoveredAt, &updatedAt,
&enclosureUrl, &enclosureType, &enclosureLength, &imageUrlsJSON, &tagsJSON,
&publishedAt, &publishedUri,
); err != nil {
continue
}
item.GUID = StringValue(guid)
item.Title = StringValue(title)
item.Link = StringValue(link)
item.Description = StringValue(description)
item.Content = StringValue(content)
item.Author = StringValue(author)
item.PubDate = TimeValue(pubDate)
item.UpdatedAt = TimeValue(updatedAt)
// Parse enclosure
if enclosureUrl != nil && *enclosureUrl != "" {
item.Enclosure = &Enclosure{
URL: *enclosureUrl,
Type: StringValue(enclosureType),
}
if enclosureLength != nil {
item.Enclosure.Length = *enclosureLength
}
}
// Parse imageUrls JSON
if imageUrlsJSON != nil && *imageUrlsJSON != "" {
var urls []string
if err := json.Unmarshal([]byte(*imageUrlsJSON), &urls); err == nil {
item.ImageURLs = urls
}
}
// Parse tags JSON
if tagsJSON != nil && *tagsJSON != "" {
var tags []string
if err := json.Unmarshal([]byte(*tagsJSON), &tags); err == nil {
item.Tags = tags
}
}
item.PublishedAt = TimeValue(publishedAt)
item.PublishedUri = StringValue(publishedUri)
items = append(items, item)
}
return items, rows.Err()
}
// CleanupOldItems removes items older than 12 months
func (c *Crawler) CleanupOldItems() (int64, error) {
cutoff := time.Now().AddDate(-1, 0, 0) // 12 months ago
result, err := c.db.Exec(`
DELETE FROM items WHERE pub_date < $1 AND pub_date IS NOT NULL
`, cutoff)
if err != nil {
return 0, err
}
return result, nil
}
// processFeed parses and stores a feed with full metadata
func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
// Fast path: check without lock
if c.feedExists(feedURL) {
return
}
c.feedsMu.Lock()
defer c.feedsMu.Unlock()
// Double-check after acquiring lock
if c.feedExists(feedURL) {
return
}
feedType := c.detectFeedType(body)
now := time.Now()
feed := &Feed{
URL: normalizeURL(feedURL),
Type: feedType,
Category: classifyFeed(feedURL),
DiscoveredAt: now,
LastCrawledAt: now,
Status: "active",
SourceHost: sourceHost,
TLD: getTLD(sourceHost),
ETag: headers.Get("ETag"),
LastModified: headers.Get("Last-Modified"),
}
// Parse feed-specific metadata and items
var items []*Item
switch feedType {
case "rss":
items = c.parseRSSMetadata(body, feed)
case "atom":
items = c.parseAtomMetadata(body, feed)
case "json":
items = c.parseJSONFeedMetadata(body, feed)
}
// Refine category based on parsed title (e.g., "Comments on:")
feed.Category = classifyFeedByTitle(feed.Title, feed.Category)
// Calculate next crawl time
feed.NextCrawlAt = c.calculateNextCrawl(feed)
if err := c.saveFeed(feed); err != nil {
return
}
// Save items
if len(items) > 0 {
c.saveItems(items)
}
}
// addFeed adds a discovered feed URL (not yet fetched)
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
// Fast path: check without lock
if c.feedExists(feedURL) {
return
}
c.feedsMu.Lock()
defer c.feedsMu.Unlock()
// Double-check after acquiring lock
if c.feedExists(feedURL) {
return
}
now := time.Now()
normalizedURL := normalizeURL(feedURL)
feed := &Feed{
URL: normalizedURL,
Type: feedType,
Category: classifyFeed(feedURL),
DiscoveredAt: now,
Status: "active",
SourceURL: normalizeURL(sourceURL),
SourceHost: sourceHost,
TLD: getTLD(sourceHost),
NextCrawlAt: now, // Should be crawled immediately
}
if err := c.saveFeed(feed); err != nil {
return
}
}
// CheckFeed performs a conditional request to check if a feed has been updated
// Returns: changed (bool), error
func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
atomic.AddInt32(&c.feedsChecked, 1)
// Try different scheme/www combinations since we store URLs without scheme
urlVariants := []string{
"https://" + feed.URL,
"http://" + feed.URL,
"https://www." + feed.URL,
"http://www." + feed.URL,
}
var resp *http.Response
var err error
var successURL string
for _, tryURL := range urlVariants {
req, reqErr := http.NewRequest("GET", tryURL, nil)
if reqErr != nil {
continue
}
req.Header.Set("User-Agent", c.UserAgent)
// Add conditional headers if we have them
if feed.ETag != "" {
req.Header.Set("If-None-Match", feed.ETag)
}
if feed.LastModified != "" {
req.Header.Set("If-Modified-Since", feed.LastModified)
}
resp, err = c.client.Do(req)
if err == nil {
successURL = tryURL
break
}
}
_ = successURL // May be used later for logging/debugging
// If no request succeeded, resp will be nil
if resp == nil {
if err == nil {
err = fmt.Errorf("all URL variants failed")
}
now := time.Now()
feed.LastCrawledAt = now
feed.ErrorCount++
feed.NoUpdate++
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.LastError = err.Error()
feed.LastErrorAt = now
feed.Status = "error"
// Auto-hold feeds that fail 100+ times
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
feed.PublishStatus = "hold"
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
}
c.saveFeed(feed)
return false, err
}
defer resp.Body.Close()
now := time.Now()
feed.LastCrawledAt = now
// 304 Not Modified - feed hasn't changed
if resp.StatusCode == http.StatusNotModified {
feed.NoUpdate++
// Adaptive backoff: 100s base + 100s per consecutive no-change
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.ErrorCount = 0
feed.LastError = ""
feed.Status = "active"
c.saveFeed(feed)
return false, nil
}
// Non-200 response
if resp.StatusCode != http.StatusOK {
feed.ErrorCount++
feed.NoUpdate++
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.LastError = resp.Status
feed.LastErrorAt = now
if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusGone {
feed.Status = "dead"
} else {
feed.Status = "error"
}
// Auto-hold feeds that fail 100+ times
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
feed.PublishStatus = "hold"
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
}
c.saveFeed(feed)
return false, nil
}
// 200 OK - feed has new content
bodyBytes, err := io.ReadAll(resp.Body)
if err != nil {
feed.ErrorCount++
feed.NoUpdate++
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.LastError = err.Error()
feed.LastErrorAt = now
feed.Status = "error"
// Auto-hold feeds that fail 100+ times
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
feed.PublishStatus = "hold"
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
}
c.saveFeed(feed)
return false, err
}
body := string(bodyBytes)
// Update cache headers
feed.ETag = resp.Header.Get("ETag")
feed.LastModified = resp.Header.Get("Last-Modified")
// Re-detect type and parse metadata
feedType := c.detectFeedType(body)
feed.Type = feedType
var items []*Item
switch feedType {
case "rss":
items = c.parseRSSMetadata(body, feed)
case "atom":
items = c.parseAtomMetadata(body, feed)
case "json":
items = c.parseJSONFeedMetadata(body, feed)
}
// Content changed - reset backoff
feed.NoUpdate = 0
feed.NextCrawlAt = now.Add(100 * time.Second)
feed.ErrorCount = 0
feed.LastError = ""
feed.Status = "active"
c.saveFeed(feed)
// Save items
if len(items) > 0 {
c.saveItems(items)
}
return true, nil
}
// SetPublishStatus sets the publish status for a feed ('hold', 'pass', 'skip')
// If status is 'pass', the account handle is also set (auto-derived if empty)
func (c *Crawler) SetPublishStatus(feedURL, status, account string) error {
@@ -1099,39 +563,3 @@ func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
return scanFeeds(rows)
}
// GetUnpublishedItems returns items for a feed that haven't been published yet
func (c *Crawler) GetUnpublishedItems(feedURL string, limit int) ([]*Item, error) {
rows, err := c.db.Query(`
SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
published_at, published_uri
FROM items
WHERE feed_url = $1 AND published_at IS NULL
ORDER BY pub_date ASC
LIMIT $2
`, feedURL, limit)
if err != nil {
return nil, err
}
defer rows.Close()
return scanItems(rows)
}
// MarkItemPublished marks an item as published with the given URI
func (c *Crawler) MarkItemPublished(itemID int64, uri string) error {
_, err := c.db.Exec(`
UPDATE items SET published_at = NOW(), published_uri = $1 WHERE id = $2
`, uri, itemID)
return err
}
// GetUnpublishedItemCount returns the count of unpublished items for a feed
func (c *Crawler) GetUnpublishedItemCount(feedURL string) (int, error) {
var count int
err := c.db.QueryRow(`
SELECT COUNT(*) FROM items WHERE feed_url = $1 AND published_at IS NULL
`, feedURL).Scan(&count)
return count, err
}
+256
View File
@@ -0,0 +1,256 @@
package main
import (
"fmt"
"io"
"net/http"
"sync/atomic"
"time"
)
// processFeed parses and stores a feed with full metadata
func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
// Fast path: check without lock
if c.feedExists(feedURL) {
return
}
c.feedsMu.Lock()
defer c.feedsMu.Unlock()
// Double-check after acquiring lock
if c.feedExists(feedURL) {
return
}
feedType := c.detectFeedType(body)
now := time.Now()
feed := &Feed{
URL: normalizeURL(feedURL),
Type: feedType,
Category: classifyFeed(feedURL),
DiscoveredAt: now,
LastCrawledAt: now,
Status: "active",
SourceHost: sourceHost,
TLD: getTLD(sourceHost),
ETag: headers.Get("ETag"),
LastModified: headers.Get("Last-Modified"),
}
// Parse feed-specific metadata and items
var items []*Item
switch feedType {
case "rss":
items = c.parseRSSMetadata(body, feed)
case "atom":
items = c.parseAtomMetadata(body, feed)
case "json":
items = c.parseJSONFeedMetadata(body, feed)
}
// Refine category based on parsed title (e.g., "Comments on:")
feed.Category = classifyFeedByTitle(feed.Title, feed.Category)
// Calculate next crawl time
feed.NextCrawlAt = c.calculateNextCrawl(feed)
if err := c.saveFeed(feed); err != nil {
return
}
// Save items
if len(items) > 0 {
c.saveItems(items)
}
}
// addFeed adds a discovered feed URL (not yet fetched)
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
// Fast path: check without lock
if c.feedExists(feedURL) {
return
}
c.feedsMu.Lock()
defer c.feedsMu.Unlock()
// Double-check after acquiring lock
if c.feedExists(feedURL) {
return
}
now := time.Now()
normalizedURL := normalizeURL(feedURL)
feed := &Feed{
URL: normalizedURL,
Type: feedType,
Category: classifyFeed(feedURL),
DiscoveredAt: now,
Status: "active",
SourceURL: normalizeURL(sourceURL),
SourceHost: sourceHost,
TLD: getTLD(sourceHost),
NextCrawlAt: now, // Should be crawled immediately
}
if err := c.saveFeed(feed); err != nil {
return
}
}
// CheckFeed performs a conditional request to check if a feed has been updated
// Returns: changed (bool), error
func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
atomic.AddInt32(&c.feedsChecked, 1)
// Try different scheme/www combinations since we store URLs without scheme
urlVariants := []string{
"https://" + feed.URL,
"http://" + feed.URL,
"https://www." + feed.URL,
"http://www." + feed.URL,
}
var resp *http.Response
var err error
var successURL string
for _, tryURL := range urlVariants {
req, reqErr := http.NewRequest("GET", tryURL, nil)
if reqErr != nil {
continue
}
req.Header.Set("User-Agent", c.UserAgent)
// Add conditional headers if we have them
if feed.ETag != "" {
req.Header.Set("If-None-Match", feed.ETag)
}
if feed.LastModified != "" {
req.Header.Set("If-Modified-Since", feed.LastModified)
}
resp, err = c.client.Do(req)
if err == nil {
successURL = tryURL
break
}
}
_ = successURL // May be used later for logging/debugging
// If no request succeeded, resp will be nil
if resp == nil {
if err == nil {
err = fmt.Errorf("all URL variants failed")
}
now := time.Now()
feed.LastCrawledAt = now
feed.ErrorCount++
feed.NoUpdate++
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.LastError = err.Error()
feed.LastErrorAt = now
feed.Status = "error"
// Auto-hold feeds that fail 100+ times
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
feed.PublishStatus = "hold"
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
}
c.saveFeed(feed)
return false, err
}
defer resp.Body.Close()
now := time.Now()
feed.LastCrawledAt = now
// 304 Not Modified - feed hasn't changed
if resp.StatusCode == http.StatusNotModified {
feed.NoUpdate++
// Adaptive backoff: 100s base + 100s per consecutive no-change
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.ErrorCount = 0
feed.LastError = ""
feed.Status = "active"
c.saveFeed(feed)
return false, nil
}
// Non-200 response
if resp.StatusCode != http.StatusOK {
feed.ErrorCount++
feed.NoUpdate++
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.LastError = resp.Status
feed.LastErrorAt = now
if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusGone {
feed.Status = "dead"
} else {
feed.Status = "error"
}
// Auto-hold feeds that fail 100+ times
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
feed.PublishStatus = "hold"
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
}
c.saveFeed(feed)
return false, nil
}
// 200 OK - feed has new content
bodyBytes, err := io.ReadAll(resp.Body)
if err != nil {
feed.ErrorCount++
feed.NoUpdate++
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.LastError = err.Error()
feed.LastErrorAt = now
feed.Status = "error"
// Auto-hold feeds that fail 100+ times
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
feed.PublishStatus = "hold"
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
}
c.saveFeed(feed)
return false, err
}
body := string(bodyBytes)
// Update cache headers
feed.ETag = resp.Header.Get("ETag")
feed.LastModified = resp.Header.Get("Last-Modified")
// Re-detect type and parse metadata
feedType := c.detectFeedType(body)
feed.Type = feedType
var items []*Item
switch feedType {
case "rss":
items = c.parseRSSMetadata(body, feed)
case "atom":
items = c.parseAtomMetadata(body, feed)
case "json":
items = c.parseJSONFeedMetadata(body, feed)
}
// Content changed - reset backoff
feed.NoUpdate = 0
feed.NextCrawlAt = now.Add(100 * time.Second)
feed.ErrorCount = 0
feed.LastError = ""
feed.Status = "active"
c.saveFeed(feed)
// Save items
if len(items) > 0 {
c.saveItems(items)
}
return true, nil
}
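The backoff CheckFeed applies is linear in the number of consecutive unchanged or failed checks. A minimal sketch of the schedule, shown here only for illustration (the helper name is hypothetical, not part of this diff):

func nextCrawlDelay(noUpdate int) time.Duration {
	// 100s base + 100s per consecutive no-change/error, matching CheckFeed above
	return time.Duration(100+100*noUpdate) * time.Second
}

// nextCrawlDelay(0) == 100s, nextCrawlDelay(5) == 10m, nextCrawlDelay(35) == 1h.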
+262
View File
@@ -0,0 +1,262 @@
package main
import (
"net/url"
"regexp"
"strings"
)
// DeriveHandleFromFeed generates an AT Protocol handle from a feed URL
// Format: {domain}-{category}.1440.news
// AT Protocol allows up to 63 characters per label, but the PDS
// restricts the first segment to 18 characters for local handles.
// Examples:
//
// feeds.bbci.co.uk/news/technology/rss.xml → bbc-technology.1440.news
// news.ycombinator.com/rss → ycombinator.1440.news
func DeriveHandleFromFeed(feedURL string) string {
const maxSubdomainLen = 18 // PDS limit for first segment
// Ensure we have a scheme for parsing
if !strings.Contains(feedURL, "://") {
feedURL = "https://" + feedURL
}
u, err := url.Parse(feedURL)
if err != nil {
return ""
}
hostname := strings.ToLower(u.Hostname())
path := strings.ToLower(u.Path)
// Remove common feed suffixes/extensions
suffixesToRemove := []string{".xml", ".rss", ".atom", ".json", "/rss", "/feed", "/atom", "/index"}
for _, suffix := range suffixesToRemove {
path = strings.TrimSuffix(path, suffix)
}
// Split path into segments and filter noise
segments := strings.Split(strings.Trim(path, "/"), "/")
skipPathWords := map[string]bool{
"rss": true, "feed": true, "feeds": true, "atom": true,
"xml": true, "default": true, "index": true, "services": true,
"nyt": true,
}
var pathParts []string
for _, seg := range segments {
seg = cleanHandleSegment(seg)
if seg != "" && !skipPathWords[seg] {
pathParts = append(pathParts, seg)
}
}
// Split hostname and extract the meaningful domain
hostParts := strings.Split(hostname, ".")
// Two-part TLDs to handle specially
twoPartTLDs := map[string]bool{
"co.uk": true, "com.au": true, "co.nz": true, "co.jp": true,
"com.br": true, "co.in": true, "org.uk": true, "ac.uk": true,
}
// Check for two-part TLD
if len(hostParts) >= 2 {
possibleTwoPartTLD := hostParts[len(hostParts)-2] + "." + hostParts[len(hostParts)-1]
if twoPartTLDs[possibleTwoPartTLD] {
hostParts = hostParts[:len(hostParts)-2]
} else {
// Single TLD - remove it
singleTLDs := map[string]bool{
"com": true, "org": true, "net": true, "io": true,
"edu": true, "gov": true, "uk": true, "de": true, "fr": true,
}
if singleTLDs[hostParts[len(hostParts)-1]] {
hostParts = hostParts[:len(hostParts)-1]
}
}
}
// Skip noise subdomains
skipHostWords := map[string]bool{
"www": true, "feeds": true, "rss": true, "feed": true,
"api": true, "cdn": true, "static": true, "news": true,
}
var meaningfulHostParts []string
for _, part := range hostParts {
if !skipHostWords[part] && part != "" {
meaningfulHostParts = append(meaningfulHostParts, part)
}
}
// Get the main domain (e.g., "bbci", "ycombinator", "nytimes")
var mainDomain string
if len(meaningfulHostParts) > 0 {
mainDomain = meaningfulHostParts[len(meaningfulHostParts)-1]
} else if len(hostParts) > 0 {
mainDomain = hostParts[len(hostParts)-1]
}
// Special case: "bbci" should become "bbc"
if mainDomain == "bbci" {
mainDomain = "bbc"
}
// Abbreviations for long category names to fit 18-char limit
categoryAbbrevs := map[string]string{
"science-and-environment": "sci-env",
"entertainment-and-arts": "ent-arts",
"science-environment": "sci-env",
"entertainment-arts": "ent-arts",
"technology": "tech",
"business": "biz",
"international": "intl",
"environment": "env",
"entertainment": "ent",
"politics": "pol",
}
// Build subdomain: domain + category (from path)
var subdomain string
if len(pathParts) > 0 {
// Use last meaningful path part as category (e.g., "technology" from /news/technology/)
category := pathParts[len(pathParts)-1]
// Skip generic categories
if category == "news" && len(pathParts) == 1 {
subdomain = mainDomain
} else {
// Try to abbreviate if the full subdomain would be too long
fullSubdomain := mainDomain + "-" + category
if len(fullSubdomain) > maxSubdomainLen {
if abbrev, ok := categoryAbbrevs[category]; ok {
category = abbrev
}
}
subdomain = mainDomain + "-" + category
}
} else {
subdomain = mainDomain
}
// If still too long, just use main hostname
if len(subdomain) > maxSubdomainLen {
subdomain = mainDomain
}
// Final safety: truncate if still too long
if len(subdomain) > maxSubdomainLen {
subdomain = subdomain[:maxSubdomainLen]
}
subdomain = strings.Trim(subdomain, "-")
// Collapse multiple hyphens
for strings.Contains(subdomain, "--") {
subdomain = strings.ReplaceAll(subdomain, "--", "-")
}
return subdomain + ".1440.news"
}
// cleanHandleSegment sanitizes a string for use in an AT Protocol handle segment
// Handle segments must be alphanumeric with hyphens, no leading/trailing hyphens
func cleanHandleSegment(s string) string {
// Remove file extensions
if idx := strings.LastIndex(s, "."); idx > 0 {
s = s[:idx]
}
// Convert to lowercase
s = strings.ToLower(s)
// Strip common feed prefixes/suffixes from the segment itself
// e.g., "showrss" → "show", "rssworld" → "world"
feedAffixes := []string{"rss", "feed", "atom", "xml"}
for _, affix := range feedAffixes {
// Strip suffix (e.g., "showrss" → "show")
if strings.HasSuffix(s, affix) && len(s) > len(affix) {
s = strings.TrimSuffix(s, affix)
break
}
// Strip prefix (e.g., "rssworld" → "world")
if strings.HasPrefix(s, affix) && len(s) > len(affix) {
s = strings.TrimPrefix(s, affix)
break
}
}
// Replace underscores and other separators with hyphens
s = strings.ReplaceAll(s, "_", "-")
s = strings.ReplaceAll(s, " ", "-")
// Remove any characters that aren't alphanumeric or hyphens
reg := regexp.MustCompile(`[^a-z0-9-]`)
s = reg.ReplaceAllString(s, "")
// Collapse multiple hyphens
for strings.Contains(s, "--") {
s = strings.ReplaceAll(s, "--", "-")
}
// Trim leading/trailing hyphens
s = strings.Trim(s, "-")
return s
}
// SplitHandle extracts the path prefix and hostname from a derived handle
// Example: show.news.ycombinator.com.1440.news → ("show", "news.ycombinator.com")
func SplitHandle(handle string) (prefix string, hostname string) {
// Remove .1440.news suffix
handle = strings.TrimSuffix(handle, ".1440.news")
parts := strings.Split(handle, ".")
// Try to find where hostname starts by looking for valid hostname patterns
if len(parts) >= 2 {
for i := 0; i < len(parts)-1; i++ {
remaining := strings.Join(parts[i:], ".")
if looksLikeHostname(remaining) {
if i > 0 {
prefix = strings.Join(parts[:i], ".")
}
hostname = remaining
return
}
}
}
// Fallback: no prefix, entire thing is hostname
hostname = handle
return "", hostname
}
func isLikelyTLDPart(s string) bool {
tlds := map[string]bool{
"com": true, "org": true, "net": true, "edu": true, "gov": true,
"io": true, "co": true, "uk": true, "de": true, "fr": true,
"jp": true, "au": true, "ca": true, "nl": true, "se": true,
"news": true, "blog": true, "tech": true, "dev": true,
}
return tlds[s]
}
func isTwoPartTLD(first, second string) bool {
twoPartTLDs := map[string]bool{
"co.uk": true, "com.au": true, "co.jp": true, "co.nz": true,
"org.uk": true, "net.au": true, "com.br": true,
}
return twoPartTLDs[first+"."+second]
}
func looksLikeHostname(s string) bool {
// A hostname typically has at least one dot and ends with a TLD-like part
parts := strings.Split(s, ".")
if len(parts) < 2 {
return false
}
lastPart := parts[len(parts)-1]
return isLikelyTLDPart(lastPart)
}
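A usage sketch for the handle helpers above (illustrative only, not part of this diff). The expected values follow the examples in the DeriveHandleFromFeed comment:

// "feeds.bbci.co.uk/news/technology/rss.xml" -> "bbc-technology.1440.news"
// "news.ycombinator.com/rss"                 -> "ycombinator.1440.news"
handle := DeriveHandleFromFeed("news.ycombinator.com/rss")
prefix, host := SplitHandle(handle)
// For a single-label handle like "ycombinator.1440.news" the trimmed value has no
// remaining dot, so SplitHandle takes its fallback path: prefix == "", host == "ycombinator".
_, _ = prefix, host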
+381
View File
@@ -0,0 +1,381 @@
package main
import (
"bytes"
"fmt"
"image"
_ "image/gif"
"image/jpeg"
_ "image/png"
"io"
"net/http"
"net/url"
"strings"
"time"
"go.deanishe.net/favicon"
"golang.org/x/image/draw"
_ "golang.org/x/image/webp"
)
// ImageUploadResult contains the uploaded blob and image dimensions
type ImageUploadResult struct {
Blob *BlobRef
Width int
Height int
}
// uploadImages fetches and uploads up to 4 images, returning BskyImage structs
func (p *Publisher) uploadImages(session *PDSSession, imageURLs []string, altText string) []BskyImage {
var images []BskyImage
maxImages := 4
if len(imageURLs) < maxImages {
maxImages = len(imageURLs)
}
for i := 0; i < maxImages; i++ {
result := p.fetchAndUploadImageWithDimensions(session, imageURLs[i])
if result != nil && result.Blob != nil {
img := BskyImage{
Alt: altText,
Image: result.Blob,
}
if result.Width > 0 && result.Height > 0 {
img.AspectRatio = &BskyAspectRatio{
Width: result.Width,
Height: result.Height,
}
}
images = append(images, img)
}
}
return images
}
// FetchFavicon tries to get a favicon URL for a site
// Uses go.deanishe.net/favicon library which parses HTML, manifests, and checks common paths
// Returns the best icon URL, falling back to Google's favicon service; returns
// an empty string only if the site URL is empty or cannot be parsed
func (p *Publisher) FetchFavicon(siteURL string) string {
if siteURL == "" {
return ""
}
// Ensure URL has scheme
if !strings.Contains(siteURL, "://") {
siteURL = "https://" + siteURL
}
u, err := url.Parse(siteURL)
if err != nil {
return ""
}
// Create finder with custom HTTP client
// Note: Don't use IgnoreNoSize as it filters out valid favicon.ico files that don't have size metadata
finder := favicon.New(
favicon.WithClient(p.httpClient),
)
// Find icons - library checks HTML <link> tags, manifests, OG images, common paths
icons, err := finder.Find(siteURL)
if err == nil && len(icons) > 0 {
// Filter and score icons for avatar use
// Prefer: square icons, reasonable size, PNG format, actual favicons over OG images
var bestIcon string
var bestScore int
for _, icon := range icons {
// Skip tiny icons (likely tracking pixels)
if icon.Width > 0 && icon.Width < 32 {
continue
}
// Skip Open Graph images (meant for link previews, usually wide banners)
lowerURL := strings.ToLower(icon.URL)
if strings.Contains(lowerURL, "og-image") || strings.Contains(lowerURL, "og_image") ||
strings.Contains(lowerURL, "opengraph") || strings.Contains(lowerURL, "twitter") {
continue
}
// Skip wide images (aspect ratio > 1.5 means it's a banner, not an icon)
if icon.Width > 0 && icon.Height > 0 {
ratio := float64(icon.Width) / float64(icon.Height)
if ratio > 1.5 || ratio < 0.67 {
continue
}
}
// Score the icon
score := 0
// Prefer actual favicon paths
if strings.Contains(lowerURL, "favicon") || strings.Contains(lowerURL, "icon") ||
strings.Contains(lowerURL, "apple-touch") {
score += 100
}
// Prefer PNG over other formats
if icon.MimeType == "image/png" {
score += 50
} else if icon.MimeType == "image/x-icon" || strings.HasSuffix(lowerURL, ".ico") {
score += 40
} else if icon.MimeType == "image/jpeg" {
score += 10 // JPEG less preferred for icons
}
// Prefer larger icons (but not too large)
if icon.Width >= 64 && icon.Width <= 512 {
score += 30
} else if icon.Width > 0 {
score += 10
}
if score > bestScore {
bestScore = score
bestIcon = icon.URL
}
}
if bestIcon != "" {
return bestIcon
}
// Fall back to first non-OG icon
for _, icon := range icons {
lowerURL := strings.ToLower(icon.URL)
if !strings.Contains(lowerURL, "og-image") && !strings.Contains(lowerURL, "og_image") {
return icon.URL
}
}
}
// Fallback to Google's favicon service (reliable, returns PNG)
return fmt.Sprintf("https://www.google.com/s2/favicons?domain=%s&sz=128", u.Host)
}
func (p *Publisher) fetchAndUploadImage(session *PDSSession, imageURL string) *BlobRef {
result := p.fetchAndUploadImageWithDimensions(session, imageURL)
if result == nil {
return nil
}
return result.Blob
}
// upgradeImageURL attempts to get a larger version of known CDN image URLs
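// Example: ichef.bbci.co.uk URLs containing /standard/240/ or /standard/480/ are
// rewritten to /standard/800/; URLs from any other host are returned unchanged.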
func upgradeImageURL(imageURL string) string {
// BBC images: /standard/240/ -> /standard/800/
if strings.Contains(imageURL, "ichef.bbci.co.uk") {
imageURL = strings.Replace(imageURL, "/standard/240/", "/standard/800/", 1)
imageURL = strings.Replace(imageURL, "/standard/480/", "/standard/800/", 1)
}
return imageURL
}
func (p *Publisher) fetchAndUploadImageWithDimensions(session *PDSSession, imageURL string) *ImageUploadResult {
// Upgrade image URL to larger size if possible
imageURL = upgradeImageURL(imageURL)
// Fetch the image
resp, err := p.httpClient.Get(imageURL)
if err != nil {
return nil
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return nil
}
// Check content type
contentType := resp.Header.Get("Content-Type")
if contentType == "" {
// Try to guess from URL
if strings.HasSuffix(strings.ToLower(imageURL), ".png") {
contentType = "image/png"
} else if strings.HasSuffix(strings.ToLower(imageURL), ".gif") {
contentType = "image/gif"
} else if strings.HasSuffix(strings.ToLower(imageURL), ".webp") {
contentType = "image/webp"
} else {
contentType = "image/jpeg" // Default
}
}
// Only accept image types
if !strings.HasPrefix(contentType, "image/") {
return nil
}
// Read image data (limit to 2MB to allow for resize headroom)
data, err := io.ReadAll(io.LimitReader(resp.Body, 2*1024*1024))
if err != nil || len(data) == 0 {
return nil
}
// Decode image to get dimensions
imgConfig, _, err := image.DecodeConfig(bytes.NewReader(data))
width, height := 1, 1 // Default if decode fails
if err == nil {
width, height = imgConfig.Width, imgConfig.Height
}
// Bluesky blob limit is ~976KB, use 900KB as safe threshold
const maxBlobSize = 900 * 1024
// If image is too large, resize it
if len(data) > maxBlobSize {
// Decode the full image for resizing
img, _, err := image.Decode(bytes.NewReader(data))
if err != nil {
return nil // Can't decode, can't resize
}
// Scale down iteratively until under limit
scaleFactor := 0.9 // Start with 90% and iterate if needed
for attempt := 0; attempt < 5; attempt++ {
newWidth := int(float64(width) * scaleFactor)
newHeight := int(float64(height) * scaleFactor)
// Minimum dimensions
if newWidth < 100 {
newWidth = 100
}
if newHeight < 100 {
newHeight = 100
}
// Create resized image
resized := image.NewRGBA(image.Rect(0, 0, newWidth, newHeight))
draw.CatmullRom.Scale(resized, resized.Bounds(), img, img.Bounds(), draw.Over, nil)
// Encode as JPEG
var buf bytes.Buffer
if err := jpeg.Encode(&buf, resized, &jpeg.Options{Quality: 85}); err != nil {
return nil
}
if buf.Len() <= maxBlobSize {
data = buf.Bytes()
width = newWidth
height = newHeight
contentType = "image/jpeg"
break
}
// Still too large, reduce scale further
scaleFactor *= 0.8
}
// If still too large after 5 attempts, give up
if len(data) > maxBlobSize {
return nil
}
}
// Upload to PDS
blob, err := p.UploadBlob(session, data, contentType)
if err != nil {
return nil
}
return &ImageUploadResult{
Blob: blob,
Width: width,
Height: height,
}
}
// FetchFaviconBytes downloads the best available favicon/icon for a site URL
// Uses go.deanishe.net/favicon to locate candidates, falling back to Google's favicon service
// Returns the icon bytes and MIME type, or an error if nothing could be fetched
func FetchFaviconBytes(siteURL string) ([]byte, string, error) {
if !strings.HasPrefix(siteURL, "http") {
siteURL = "https://" + siteURL
}
u, err := url.Parse(siteURL)
if err != nil {
return nil, "", err
}
client := &http.Client{Timeout: 10 * time.Second}
// Use favicon library to find icons
finder := favicon.New(
favicon.WithClient(client),
favicon.IgnoreNoSize,
)
icons, err := finder.Find(siteURL)
if err != nil || len(icons) == 0 {
// Fallback to Google's favicon service
googleURL := fmt.Sprintf("https://www.google.com/s2/favicons?domain=%s&sz=128", u.Host)
return fetchIconBytes(client, googleURL)
}
// Try icons in order (sorted by size, largest first)
// Prefer PNG/JPEG over ICO
var iconURLs []string
for _, icon := range icons {
if icon.Width > 0 && icon.Width < 32 {
continue // Skip tiny icons
}
if icon.MimeType == "image/png" || icon.MimeType == "image/jpeg" {
iconURLs = append([]string{icon.URL}, iconURLs...) // Prepend PNG/JPEG
} else {
iconURLs = append(iconURLs, icon.URL)
}
}
// If no good icons, use all of them
if len(iconURLs) == 0 {
for _, icon := range icons {
iconURLs = append(iconURLs, icon.URL)
}
}
// Try to download each icon
for _, iconURL := range iconURLs {
data, mimeType, err := fetchIconBytes(client, iconURL)
if err == nil && len(data) > 0 {
return data, mimeType, nil
}
}
// Final fallback to Google
googleURL := fmt.Sprintf("https://www.google.com/s2/favicons?domain=%s&sz=128", u.Host)
return fetchIconBytes(client, googleURL)
}
// fetchIconBytes downloads an icon and returns its bytes and mime type
func fetchIconBytes(client *http.Client, iconURL string) ([]byte, string, error) {
resp, err := client.Get(iconURL)
if err != nil {
return nil, "", err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return nil, "", fmt.Errorf("HTTP %d", resp.StatusCode)
}
data, err := io.ReadAll(resp.Body)
if err != nil {
return nil, "", err
}
// Determine mime type
contentType := resp.Header.Get("Content-Type")
if contentType == "" {
if strings.HasSuffix(iconURL, ".png") {
contentType = "image/png"
} else if strings.HasSuffix(iconURL, ".ico") {
contentType = "image/x-icon"
} else {
contentType = "image/png"
}
}
return data, contentType, nil
}
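A minimal sketch tying the favicon and blob helpers together for avatar setup (illustrative only, not part of this diff; the publisher, session, and site URL are assumed to exist, and UploadBlob/UpdateProfile are defined in pds_records.go below):

// fetch the best icon for a site, then upload it and set it as the account avatar
data, mime, err := FetchFaviconBytes("example.com")
if err == nil {
	if blob, upErr := p.UploadBlob(session, data, mime); upErr == nil {
		_ = p.UpdateProfile(session, "Example Feed", "Posts from example.com", blob)
	}
}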
+328
View File
@@ -0,0 +1,328 @@
package main
import (
"context"
"encoding/json"
"time"
"github.com/jackc/pgx/v5"
)
// Enclosure represents a media attachment (audio, video, image)
type Enclosure struct {
URL string `json:"url"`
Type string `json:"type"` // MIME type (audio/mpeg, image/jpeg, etc.)
Length int64 `json:"length"` // Size in bytes
}
// Item represents an individual entry/article from a feed
type Item struct {
ID int64 `json:"id,omitempty"`
FeedURL string `json:"feed_url"`
GUID string `json:"guid,omitempty"`
Title string `json:"title,omitempty"`
Link string `json:"link,omitempty"`
Description string `json:"description,omitempty"`
Content string `json:"content,omitempty"`
Author string `json:"author,omitempty"`
PubDate time.Time `json:"pub_date,omitempty"`
DiscoveredAt time.Time `json:"discovered_at"`
UpdatedAt time.Time `json:"updated_at,omitempty"`
// Media attachments
Enclosure *Enclosure `json:"enclosure,omitempty"` // Primary enclosure (podcast audio, etc.)
ImageURLs []string `json:"image_urls,omitempty"` // Image URLs extracted from content
Tags []string `json:"tags,omitempty"` // Category/tag strings from feed
// Publishing to PDS
PublishedAt time.Time `json:"published_at,omitempty"`
PublishedUri string `json:"published_uri,omitempty"`
}
// saveItem stores an item in PostgreSQL (upsert by feed_url + guid)
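// The ON CONFLICT clause below assumes the items table has a unique constraint
// (or unique index) on (feed_url, guid); without it the upsert would fail.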
func (c *Crawler) saveItem(item *Item) error {
// Serialize enclosure fields
var enclosureUrl, enclosureType *string
var enclosureLength *int64
if item.Enclosure != nil {
enclosureUrl = NullableString(item.Enclosure.URL)
enclosureType = NullableString(item.Enclosure.Type)
if item.Enclosure.Length > 0 {
enclosureLength = &item.Enclosure.Length
}
}
// Serialize imageUrls as JSON
var imageUrlsJSON *string
if len(item.ImageURLs) > 0 {
if data, err := json.Marshal(item.ImageURLs); err == nil {
s := string(data)
imageUrlsJSON = &s
}
}
// Serialize tags as JSON
var tagsJSON *string
if len(item.Tags) > 0 {
if data, err := json.Marshal(item.Tags); err == nil {
s := string(data)
tagsJSON = &s
}
}
_, err := c.db.Exec(`
INSERT INTO items (feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
enclosure_url, enclosure_type, enclosure_length, image_urls, tags)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
ON CONFLICT(feed_url, guid) DO UPDATE SET
title = EXCLUDED.title,
link = EXCLUDED.link,
description = EXCLUDED.description,
content = EXCLUDED.content,
author = EXCLUDED.author,
pub_date = EXCLUDED.pub_date,
updated_at = EXCLUDED.updated_at,
enclosure_url = EXCLUDED.enclosure_url,
enclosure_type = EXCLUDED.enclosure_type,
enclosure_length = EXCLUDED.enclosure_length,
image_urls = EXCLUDED.image_urls,
tags = EXCLUDED.tags
`,
item.FeedURL, item.GUID, NullableString(item.Title), NullableString(item.Link),
NullableString(item.Description), NullableString(item.Content), NullableString(item.Author),
NullableTime(item.PubDate), item.DiscoveredAt, NullableTime(item.UpdatedAt),
enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON, tagsJSON,
)
return err
}
// saveItems stores multiple items efficiently
func (c *Crawler) saveItems(items []*Item) error {
if len(items) == 0 {
return nil
}
tx, err := c.db.Begin()
if err != nil {
return err
}
defer tx.Rollback(context.Background())
for _, item := range items {
if item == nil || item.GUID == "" {
continue // Skip nil items or items without GUID
}
// Serialize enclosure fields
var enclosureUrl, enclosureType *string
var enclosureLength *int64
if item.Enclosure != nil {
enclosureUrl = NullableString(item.Enclosure.URL)
enclosureType = NullableString(item.Enclosure.Type)
if item.Enclosure.Length > 0 {
enclosureLength = &item.Enclosure.Length
}
}
// Serialize imageUrls as JSON
var imageUrlsJSON *string
if len(item.ImageURLs) > 0 {
if data, err := json.Marshal(item.ImageURLs); err == nil {
s := string(data)
imageUrlsJSON = &s
}
}
// Serialize tags as JSON
var tagsJSON *string
if len(item.Tags) > 0 {
if data, err := json.Marshal(item.Tags); err == nil {
s := string(data)
tagsJSON = &s
}
}
_, err := tx.Exec(context.Background(), `
INSERT INTO items (feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
enclosure_url, enclosure_type, enclosure_length, image_urls, tags)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
ON CONFLICT(feed_url, guid) DO UPDATE SET
title = EXCLUDED.title,
link = EXCLUDED.link,
description = EXCLUDED.description,
content = EXCLUDED.content,
author = EXCLUDED.author,
pub_date = EXCLUDED.pub_date,
updated_at = EXCLUDED.updated_at,
enclosure_url = EXCLUDED.enclosure_url,
enclosure_type = EXCLUDED.enclosure_type,
enclosure_length = EXCLUDED.enclosure_length,
image_urls = EXCLUDED.image_urls,
tags = EXCLUDED.tags
`,
item.FeedURL, item.GUID, NullableString(item.Title), NullableString(item.Link),
NullableString(item.Description), NullableString(item.Content), NullableString(item.Author),
NullableTime(item.PubDate), item.DiscoveredAt, NullableTime(item.UpdatedAt),
enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON, tagsJSON,
)
if err != nil {
continue // Skip failed items
}
}
return tx.Commit(context.Background())
}
// GetItemsByFeed returns all items for a specific feed
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
rows, err := c.db.Query(`
SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
published_at, published_uri
FROM items
WHERE feed_url = $1
ORDER BY pub_date DESC
LIMIT $2
`, feedURL, limit)
if err != nil {
return nil, err
}
defer rows.Close()
return scanItems(rows)
}
// SearchItems performs a full-text search on items
func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
tsquery := ToSearchQuery(query)
rows, err := c.db.Query(`
SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
published_at, published_uri
FROM items
WHERE search_vector @@ to_tsquery('english', $1)
ORDER BY ts_rank(search_vector, to_tsquery('english', $1)) DESC, pub_date DESC
LIMIT $2
`, tsquery, limit)
if err != nil {
return nil, err
}
defer rows.Close()
return scanItems(rows)
}
// scanItems is a helper to scan multiple item rows
func scanItems(rows pgx.Rows) ([]*Item, error) {
var items []*Item
for rows.Next() {
item := &Item{}
var guid, title, link, description, content, author *string
var pubDate, updatedAt, publishedAt *time.Time
var enclosureUrl, enclosureType *string
var enclosureLength *int64
var imageUrlsJSON, tagsJSON *string
var publishedUri *string
if err := rows.Scan(
&item.ID, &item.FeedURL, &guid, &title, &link,
&description, &content, &author, &pubDate,
&item.DiscoveredAt, &updatedAt,
&enclosureUrl, &enclosureType, &enclosureLength, &imageUrlsJSON, &tagsJSON,
&publishedAt, &publishedUri,
); err != nil {
continue
}
item.GUID = StringValue(guid)
item.Title = StringValue(title)
item.Link = StringValue(link)
item.Description = StringValue(description)
item.Content = StringValue(content)
item.Author = StringValue(author)
item.PubDate = TimeValue(pubDate)
item.UpdatedAt = TimeValue(updatedAt)
// Parse enclosure
if enclosureUrl != nil && *enclosureUrl != "" {
item.Enclosure = &Enclosure{
URL: *enclosureUrl,
Type: StringValue(enclosureType),
}
if enclosureLength != nil {
item.Enclosure.Length = *enclosureLength
}
}
// Parse imageUrls JSON
if imageUrlsJSON != nil && *imageUrlsJSON != "" {
var urls []string
if err := json.Unmarshal([]byte(*imageUrlsJSON), &urls); err == nil {
item.ImageURLs = urls
}
}
// Parse tags JSON
if tagsJSON != nil && *tagsJSON != "" {
var tags []string
if err := json.Unmarshal([]byte(*tagsJSON), &tags); err == nil {
item.Tags = tags
}
}
item.PublishedAt = TimeValue(publishedAt)
item.PublishedUri = StringValue(publishedUri)
items = append(items, item)
}
return items, rows.Err()
}
// CleanupOldItems removes items older than 12 months
func (c *Crawler) CleanupOldItems() (int64, error) {
cutoff := time.Now().AddDate(-1, 0, 0) // 12 months ago
result, err := c.db.Exec(`
DELETE FROM items WHERE pub_date < $1 AND pub_date IS NOT NULL
`, cutoff)
if err != nil {
return 0, err
}
return result, nil
}
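// Illustrative usage (not part of this commit): a long-running deployment might
// call CleanupOldItems on a ticker, e.g.
//
//	for range time.Tick(24 * time.Hour) {
//		c.CleanupOldItems() // removed-row count and error handling omitted in this sketch
//	}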
// GetUnpublishedItems returns items for a feed that haven't been published yet
func (c *Crawler) GetUnpublishedItems(feedURL string, limit int) ([]*Item, error) {
rows, err := c.db.Query(`
SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
published_at, published_uri
FROM items
WHERE feed_url = $1 AND published_at IS NULL
ORDER BY pub_date ASC
LIMIT $2
`, feedURL, limit)
if err != nil {
return nil, err
}
defer rows.Close()
return scanItems(rows)
}
// MarkItemPublished marks an item as published with the given URI
func (c *Crawler) MarkItemPublished(itemID int64, uri string) error {
_, err := c.db.Exec(`
UPDATE items SET published_at = NOW(), published_uri = $1 WHERE id = $2
`, uri, itemID)
return err
}
// GetUnpublishedItemCount returns the count of unpublished items for a feed
func (c *Crawler) GetUnpublishedItemCount(feedURL string) (int, error) {
var count int
err := c.db.QueryRow(`
SELECT COUNT(*) FROM items WHERE feed_url = $1 AND published_at IS NULL
`, feedURL).Scan(&count)
return count, err
}
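A sketch of how the unpublished-item helpers above compose into a publish pass (illustrative only, not part of this diff; publishOne stands in for the real per-item publishing call):

items, err := c.GetUnpublishedItems(feedURL, 50)
if err == nil {
	for _, item := range items {
		uri, pubErr := publishOne(item) // hypothetical: returns the at:// URI of the created post
		if pubErr != nil {
			continue
		}
		c.MarkItemPublished(item.ID, uri)
	}
}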
+187
View File
@@ -0,0 +1,187 @@
package main
import (
"bytes"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"time"
)
// CreateSession authenticates with the PDS and returns a session
func (p *Publisher) CreateSession(handle, password string) (*PDSSession, error) {
payload := map[string]string{
"identifier": handle,
"password": password,
}
body, err := json.Marshal(payload)
if err != nil {
return nil, err
}
resp, err := p.httpClient.Post(
p.pdsHost+"/xrpc/com.atproto.server.createSession",
"application/json",
bytes.NewReader(body),
)
if err != nil {
return nil, err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
respBody, _ := io.ReadAll(resp.Body)
return nil, fmt.Errorf("auth failed: %s - %s", resp.Status, string(respBody))
}
var session PDSSession
if err := json.NewDecoder(resp.Body).Decode(&session); err != nil {
return nil, err
}
return &session, nil
}
// CreateAccount creates a new account on the PDS
// Requires an invite code if the PDS has invites enabled
func (p *Publisher) CreateAccount(handle, email, password, inviteCode string) (*PDSSession, error) {
payload := map[string]interface{}{
"handle": handle,
"email": email,
"password": password,
}
if inviteCode != "" {
payload["inviteCode"] = inviteCode
}
body, err := json.Marshal(payload)
if err != nil {
return nil, err
}
resp, err := p.httpClient.Post(
p.pdsHost+"/xrpc/com.atproto.server.createAccount",
"application/json",
bytes.NewReader(body),
)
if err != nil {
return nil, err
}
defer resp.Body.Close()
respBody, _ := io.ReadAll(resp.Body)
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("create account failed: %s - %s", resp.Status, string(respBody))
}
var session PDSSession
if err := json.Unmarshal(respBody, &session); err != nil {
return nil, err
}
return &session, nil
}
// CreateInviteCode creates an invite code using PDS admin password (Basic Auth)
func (p *Publisher) CreateInviteCode(adminPassword string, useCount int) (string, error) {
payload := map[string]interface{}{
"useCount": useCount,
}
body, err := json.Marshal(payload)
if err != nil {
return "", err
}
req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.server.createInviteCode", bytes.NewReader(body))
if err != nil {
return "", err
}
req.Header.Set("Content-Type", "application/json")
// PDS admin APIs use Basic Auth with "admin" as username
req.SetBasicAuth("admin", adminPassword)
resp, err := p.httpClient.Do(req)
if err != nil {
return "", err
}
defer resp.Body.Close()
respBody, _ := io.ReadAll(resp.Body)
if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("create invite failed: %s - %s", resp.Status, string(respBody))
}
var result struct {
Code string `json:"code"`
}
if err := json.Unmarshal(respBody, &result); err != nil {
return "", err
}
return result.Code, nil
}
// FollowAccount creates a follow record from the authenticated session to the target DID
func (p *Publisher) FollowAccount(session *PDSSession, targetDID string) error {
// Create follow record
now := time.Now().UTC().Format(time.RFC3339)
record := map[string]interface{}{
"$type": "app.bsky.graph.follow",
"subject": targetDID,
"createdAt": now,
}
payload := map[string]interface{}{
"repo": session.DID,
"collection": "app.bsky.graph.follow",
"record": record,
}
body, err := json.Marshal(payload)
if err != nil {
return err
}
req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.createRecord", bytes.NewReader(body))
if err != nil {
return err
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", "Bearer "+session.AccessJwt)
resp, err := p.httpClient.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
respBody, _ := io.ReadAll(resp.Body)
return fmt.Errorf("follow failed: %s - %s", resp.Status, string(respBody))
}
return nil
}
// FollowAsDirectory logs in as the directory account and follows the target DID
func (p *Publisher) FollowAsDirectory(targetDID string) error {
dirHandle := os.Getenv("DIRECTORY_HANDLE")
dirPassword := os.Getenv("DIRECTORY_PASSWORD")
if dirHandle == "" || dirPassword == "" {
// Silently skip if directory account not configured
return nil
}
session, err := p.CreateSession(dirHandle, dirPassword)
if err != nil {
return fmt.Errorf("directory login failed: %w", err)
}
return p.FollowAccount(session, targetDID)
}
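A sketch of the account-provisioning flow these helpers support (illustrative only, not part of this diff; the function name, handle, email, and passwords below are placeholders):

func provisionFeedAccount(p *Publisher, adminPassword, accountPassword string) error {
	code, err := p.CreateInviteCode(adminPassword, 1)
	if err != nil {
		return err
	}
	session, err := p.CreateAccount("bbc-technology.1440.news", "feeds@example.com", accountPassword, code)
	if err != nil {
		return err
	}
	// optionally have the directory account follow the new account
	return p.FollowAsDirectory(session.DID)
}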
+272
View File
@@ -0,0 +1,272 @@
package main
import (
"bytes"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strings"
)
// BlobRef represents a blob reference for profile images
type BlobRef struct {
Type string `json:"$type"`
Ref Link `json:"ref"`
MimeType string `json:"mimeType"`
Size int64 `json:"size"`
}
type Link struct {
Link string `json:"$link"`
}
// UploadBlob uploads an image to the PDS and returns a blob reference
func (p *Publisher) UploadBlob(session *PDSSession, data []byte, mimeType string) (*BlobRef, error) {
req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.uploadBlob", bytes.NewReader(data))
if err != nil {
return nil, err
}
req.Header.Set("Content-Type", mimeType)
req.Header.Set("Authorization", "Bearer "+session.AccessJwt)
resp, err := p.httpClient.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()
respBody, _ := io.ReadAll(resp.Body)
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("upload blob failed: %s - %s", resp.Status, string(respBody))
}
var result struct {
Blob BlobRef `json:"blob"`
}
if err := json.Unmarshal(respBody, &result); err != nil {
return nil, err
}
return &result.Blob, nil
}
// UpdateProfile updates the profile for an account
func (p *Publisher) UpdateProfile(session *PDSSession, displayName, description string, avatar *BlobRef) error {
// First, get the current profile to preserve any existing fields
getReq, err := http.NewRequest("GET",
p.pdsHost+"/xrpc/com.atproto.repo.getRecord?repo="+session.DID+"&collection=app.bsky.actor.profile&rkey=self",
nil)
if err != nil {
return err
}
getReq.Header.Set("Authorization", "Bearer "+session.AccessJwt)
getResp, err := p.httpClient.Do(getReq)
var existingCID string
profile := map[string]interface{}{
"$type": "app.bsky.actor.profile",
}
if err == nil && getResp.StatusCode == http.StatusOK {
defer getResp.Body.Close()
var existing struct {
CID string `json:"cid"`
Value map[string]interface{} `json:"value"`
}
if json.NewDecoder(getResp.Body).Decode(&existing) == nil {
existingCID = existing.CID
profile = existing.Value
}
} else if getResp != nil {
getResp.Body.Close()
}
// Update fields
if displayName != "" {
profile["displayName"] = displayName
}
if description != "" {
profile["description"] = description
}
if avatar != nil {
profile["avatar"] = avatar
}
// Put the record
payload := map[string]interface{}{
"repo": session.DID,
"collection": "app.bsky.actor.profile",
"rkey": "self",
"record": profile,
}
if existingCID != "" {
payload["swapRecord"] = existingCID
}
body, err := json.Marshal(payload)
if err != nil {
return err
}
req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.putRecord", bytes.NewReader(body))
if err != nil {
return err
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", "Bearer "+session.AccessJwt)
resp, err := p.httpClient.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
respBody, _ := io.ReadAll(resp.Body)
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("update profile failed: %s - %s", resp.Status, string(respBody))
}
return nil
}
// DeleteAllPosts deletes all posts from an account
func (p *Publisher) DeleteAllPosts(session *PDSSession) (int, error) {
deleted := 0
cursor := ""
for {
// List records
listURL := fmt.Sprintf("%s/xrpc/com.atproto.repo.listRecords?repo=%s&collection=app.bsky.feed.post&limit=100",
p.pdsHost, session.DID)
if cursor != "" {
listURL += "&cursor=" + url.QueryEscape(cursor)
}
req, err := http.NewRequest("GET", listURL, nil)
if err != nil {
return deleted, err
}
req.Header.Set("Authorization", "Bearer "+session.AccessJwt)
resp, err := p.httpClient.Do(req)
if err != nil {
return deleted, err
}
var result struct {
Records []struct {
URI string `json:"uri"`
} `json:"records"`
Cursor string `json:"cursor"`
}
respBody, _ := io.ReadAll(resp.Body)
resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return deleted, fmt.Errorf("list records failed: %s - %s", resp.Status, string(respBody))
}
if err := json.Unmarshal(respBody, &result); err != nil {
return deleted, err
}
if len(result.Records) == 0 {
break
}
// Delete each record
for _, record := range result.Records {
// Extract rkey from URI: at://did:plc:xxx/app.bsky.feed.post/rkey
parts := strings.Split(record.URI, "/")
if len(parts) < 2 {
continue
}
rkey := parts[len(parts)-1]
if err := p.DeleteRecord(session, "app.bsky.feed.post", rkey); err != nil {
// Continue deleting other records even if one fails
continue
}
deleted++
}
cursor = result.Cursor
if cursor == "" {
break
}
}
return deleted, nil
}
// DeleteRecord deletes a single record from an account
func (p *Publisher) DeleteRecord(session *PDSSession, collection, rkey string) error {
payload := map[string]interface{}{
"repo": session.DID,
"collection": collection,
"rkey": rkey,
}
body, err := json.Marshal(payload)
if err != nil {
return err
}
req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.deleteRecord", bytes.NewReader(body))
if err != nil {
return err
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", "Bearer "+session.AccessJwt)
resp, err := p.httpClient.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
respBody, _ := io.ReadAll(resp.Body)
return fmt.Errorf("delete record failed: %s - %s", resp.Status, string(respBody))
}
return nil
}
// DeleteAccount deletes an account using PDS admin API
func (p *Publisher) DeleteAccount(adminPassword, did string) error {
payload := map[string]interface{}{
"did": did,
}
body, err := json.Marshal(payload)
if err != nil {
return err
}
req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.admin.deleteAccount", bytes.NewReader(body))
if err != nil {
return err
}
req.Header.Set("Content-Type", "application/json")
req.SetBasicAuth("admin", adminPassword)
resp, err := p.httpClient.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
respBody, _ := io.ReadAll(resp.Body)
return fmt.Errorf("delete account failed: %s - %s", resp.Status, string(respBody))
}
return nil
}
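An illustrative teardown sketch (not part of this diff; the function name is hypothetical): remove an account's posts, then delete the account via the admin API.

func retireAccount(p *Publisher, session *PDSSession, adminPassword string) error {
	if _, err := p.DeleteAllPosts(session); err != nil {
		return err
	}
	return p.DeleteAccount(adminPassword, session.DID)
}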
-1063
View File
File diff suppressed because it is too large Load Diff
+160
View File
@@ -0,0 +1,160 @@
package main
import (
"fmt"
"net/http"
"strings"
)
func (c *Crawler) StartDashboard(addr string) error {
http.HandleFunc("/dashboard", func(w http.ResponseWriter, r *http.Request) {
c.handleDashboard(w, r)
})
// Root handler for url.1440.news short URLs and 1440.news accounts directory
http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
host := r.Host
// Strip port if present
if idx := strings.Index(host, ":"); idx != -1 {
host = host[:idx]
}
// If this is url.1440.news, treat path as short code
if host == "url.1440.news" || host == "url.1440.localhost" {
c.handleRedirect(w, r)
return
}
// If this is 1440.news (apex), serve accounts directory
if host == "1440.news" || host == "1440.localhost" {
if r.URL.Path == "/" || r.URL.Path == "" {
c.handleAccountsDirectory(w, r)
return
}
}
// Otherwise, redirect to dashboard for root path
if r.URL.Path == "/" {
http.Redirect(w, r, "/dashboard", http.StatusFound)
return
}
// Unknown path
http.NotFound(w, r)
})
http.HandleFunc("/api/stats", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIStats(w, r)
})
http.HandleFunc("/api/allDomains", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIAllDomains(w, r)
})
http.HandleFunc("/api/domainFeeds", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIDomainFeeds(w, r)
})
http.HandleFunc("/api/feedInfo", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIFeedInfo(w, r)
})
http.HandleFunc("/api/feedItems", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIFeedItems(w, r)
})
http.HandleFunc("/api/search", func(w http.ResponseWriter, r *http.Request) {
c.handleAPISearch(w, r)
})
http.HandleFunc("/api/tlds", func(w http.ResponseWriter, r *http.Request) {
c.handleAPITLDs(w, r)
})
http.HandleFunc("/api/tldDomains", func(w http.ResponseWriter, r *http.Request) {
c.handleAPITLDDomains(w, r)
})
http.HandleFunc("/api/revisitDomain", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIRevisitDomain(w, r)
})
http.HandleFunc("/api/priorityCrawl", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIPriorityCrawl(w, r)
})
http.HandleFunc("/api/checkFeed", func(w http.ResponseWriter, r *http.Request) {
c.handleAPICheckFeed(w, r)
})
http.HandleFunc("/api/domainsByStatus", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIDomainsByStatus(w, r)
})
http.HandleFunc("/api/feedsByStatus", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIFeedsByStatus(w, r)
})
http.HandleFunc("/api/domains", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIDomains(w, r)
})
http.HandleFunc("/api/feeds", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIFeeds(w, r)
})
http.HandleFunc("/api/setDomainStatus", func(w http.ResponseWriter, r *http.Request) {
c.handleAPISetDomainStatus(w, r)
})
http.HandleFunc("/api/filter", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIFilter(w, r)
})
http.HandleFunc("/api/enablePublish", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIEnablePublish(w, r)
})
http.HandleFunc("/api/disablePublish", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIDisablePublish(w, r)
})
http.HandleFunc("/api/publishEnabled", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIPublishEnabled(w, r)
})
http.HandleFunc("/api/publishDenied", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIPublishDenied(w, r)
})
http.HandleFunc("/api/publishCandidates", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIPublishCandidates(w, r)
})
http.HandleFunc("/api/setPublishStatus", func(w http.ResponseWriter, r *http.Request) {
c.handleAPISetPublishStatus(w, r)
})
http.HandleFunc("/api/unpublishedItems", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIUnpublishedItems(w, r)
})
http.HandleFunc("/api/testPublish", func(w http.ResponseWriter, r *http.Request) {
c.handleAPITestPublish(w, r)
})
http.HandleFunc("/api/deriveHandle", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIDeriveHandle(w, r)
})
http.HandleFunc("/api/publishFeed", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIPublishFeed(w, r)
})
http.HandleFunc("/api/createAccount", func(w http.ResponseWriter, r *http.Request) {
c.handleAPICreateAccount(w, r)
})
http.HandleFunc("/api/publishFeedFull", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIPublishFeedFull(w, r)
})
http.HandleFunc("/api/updateProfile", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIUpdateProfile(w, r)
})
http.HandleFunc("/api/languages", func(w http.ResponseWriter, r *http.Request) {
c.handleAPILanguages(w, r)
})
http.HandleFunc("/api/denyDomain", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIDenyDomain(w, r)
})
http.HandleFunc("/api/undenyDomain", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIUndenyDomain(w, r)
})
http.HandleFunc("/api/tldStats", func(w http.ResponseWriter, r *http.Request) {
c.handleAPITLDStats(w, r)
})
http.HandleFunc("/api/resetAllPublishing", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIResetAllPublishing(w, r)
})
http.HandleFunc("/api/refreshProfiles", func(w http.ResponseWriter, r *http.Request) {
c.handleAPIRefreshProfiles(w, r)
})
http.HandleFunc("/static/", func(w http.ResponseWriter, r *http.Request) {
http.StripPrefix("/static/", http.FileServer(http.Dir("static"))).ServeHTTP(w, r)
})
fmt.Printf("Dashboard running at http://%s\n", addr)
return http.ListenAndServe(addr, nil)
}
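StartDashboard blocks in http.ListenAndServe, so callers normally run it on its own goroutine. A minimal sketch (illustrative only, not part of this diff; the listen address is a placeholder):

go func() {
	if err := c.StartDashboard("0.0.0.0:8080"); err != nil {
		fmt.Printf("dashboard stopped: %v\n", err)
	}
}()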
+75 -29
View File
@@ -435,7 +435,7 @@ function initDashboard() {
// Insert TLD header if TLD changed
if (d.tld && d.tld !== currentTLD) {
currentTLD = d.tld;
container.insertAdjacentHTML('beforeend', `<div class="tld-header" style="padding: 12px 10px 6px; color: #666; font-size: 0.9em; border-bottom: 1px solid #333; margin-top: ${offset === 0 && !container.querySelector('.domain-block') ? '0' : '15px'};">.${escapeHtml(currentTLD)}</div>`);
container.insertAdjacentHTML('beforeend', `<div class="tld-header" data-tld="${escapeHtml(currentTLD)}" style="padding: 12px 10px 6px; color: #888; font-size: 0.9em; border-bottom: 1px solid #333; margin-top: ${offset === 0 && !container.querySelector('.domain-block') ? '0' : '15px'}; cursor: pointer;"><span style="margin-right: 6px; font-size: 0.8em;">▼</span>.${escapeHtml(currentTLD)}</div>`);
}
container.insertAdjacentHTML('beforeend', renderDomainRow(d));
});
@@ -556,7 +556,7 @@ function initDashboard() {
// Insert TLD header if TLD changed
if (d.tld && d.tld !== currentTLD) {
currentTLD = d.tld;
container.insertAdjacentHTML('beforeend', `<div class="tld-header" style="padding: 12px 10px 6px; color: #666; font-size: 0.9em; border-bottom: 1px solid #333; margin-top: ${offset === 0 && !container.querySelector('.domain-block') ? '0' : '15px'};">.${escapeHtml(currentTLD)}</div>`);
container.insertAdjacentHTML('beforeend', `<div class="tld-header" data-tld="${escapeHtml(currentTLD)}" style="padding: 12px 10px 6px; color: #888; font-size: 0.9em; border-bottom: 1px solid #333; margin-top: ${offset === 0 && !container.querySelector('.domain-block') ? '0' : '15px'}; cursor: pointer;"><span style="margin-right: 6px; font-size: 0.8em;">▼</span>.${escapeHtml(currentTLD)}</div>`);
}
container.insertAdjacentHTML('beforeend', renderDomainRow(d));
});
@@ -883,11 +883,11 @@ function initDashboard() {
const isSelected = selectedTLDs.has(t.tld);
const bg = isSelected ? '#036' : '#1a1a1a';
const border = isSelected ? '#06c' : '#333';
const color = isSelected ? '#0af' : '#666';
const color = isSelected ? '#0af' : '#fff';
html += `<button class="tld-btn" data-tld="${escapeHtml(t.tld)}"
style="padding: 4px 8px; font-size: 11px; font-family: monospace;
background: ${bg}; border: 1px solid ${border}; border-radius: 3px;
color: ${color}; cursor: pointer;">.${escapeHtml(t.tld)} <span style="color: #555;">(${t.domain_count})</span></button>`;
color: ${color}; cursor: pointer;">.${escapeHtml(t.tld)} <span style="color: #888;">(${t.domain_count})</span></button>`;
});
// Add clear button if any selected
@@ -923,6 +923,9 @@ function initDashboard() {
}
}
// Track manually collapsed TLDs (collapsed by clicking header)
let collapsedTLDs = new Set();
function applyTLDFilter() {
const container = document.querySelector('.domain-list');
if (!container) return;
@@ -930,44 +933,87 @@ function initDashboard() {
const showAll = selectedTLDs.size === 0;
// Handle TLD headers and their domain blocks
container.querySelectorAll('.tld-header:not(.tld-handled)').forEach(header => {
header.classList.add('tld-handled');
header.style.cursor = 'pointer';
header.addEventListener('click', () => {
const tldText = header.dataset.tld;
// If TLD filter is active, clicking adds/removes from filter
if (selectedTLDs.size > 0) {
if (selectedTLDs.has(tldText)) {
selectedTLDs.delete(tldText);
} else {
selectedTLDs.add(tldText);
}
renderTLDButtons();
} else {
// No filter active - toggle manual collapse
if (collapsedTLDs.has(tldText)) {
collapsedTLDs.delete(tldText);
} else {
collapsedTLDs.add(tldText);
}
}
applyTLDVisibility();
});
});
// Store TLD in data attribute for headers that don't have it
container.querySelectorAll('.tld-header').forEach(header => {
const tldText = header.textContent.trim().substring(1); // Remove leading dot
const isSelected = showAll || selectedTLDs.has(tldText);
if (!header.dataset.tld) {
const match = header.textContent.trim().match(/^\.(.+?)(?:\s|$)/);
if (match) {
header.dataset.tld = match[1];
}
}
});
applyTLDVisibility();
}
function applyTLDVisibility() {
const container = document.querySelector('.domain-list');
if (!container) return;
const showAll = selectedTLDs.size === 0;
container.querySelectorAll('.tld-header').forEach(header => {
const tldText = header.dataset.tld;
if (!tldText) return;
// Determine if this TLD should be expanded
let isExpanded;
if (selectedTLDs.size > 0) {
// Filter mode: show only selected TLDs
isExpanded = selectedTLDs.has(tldText);
} else {
// No filter: show all except manually collapsed
isExpanded = !collapsedTLDs.has(tldText);
}
// Find all domain blocks until next TLD header
let nextEl = header.nextElementSibling;
while (nextEl && !nextEl.classList.contains('tld-header')) {
if (nextEl.classList.contains('domain-block')) {
nextEl.style.display = isSelected ? 'block' : 'none';
nextEl.style.display = isExpanded ? 'block' : 'none';
}
nextEl = nextEl.nextElementSibling;
}
// Style header based on selection
if (isSelected) {
header.style.color = '#888';
header.style.cursor = 'default';
// Style header based on expanded state
header.style.color = isExpanded ? '#888' : '#555';
// Add expand/collapse indicator
const indicator = isExpanded ? '▼' : '▶';
const tldDisplay = '.' + tldText;
if (!header.textContent.includes('▼') && !header.textContent.includes('▶')) {
header.innerHTML = `<span style="margin-right: 6px; font-size: 0.8em;">${indicator}</span>${tldDisplay}`;
} else {
header.style.color = '#444';
header.style.cursor = 'pointer';
header.innerHTML = header.innerHTML.replace(/[▼▶]/, indicator);
}
});
// Make collapsed headers clickable to expand
container.querySelectorAll('.tld-header').forEach(header => {
// Remove old listener by cloning
const newHeader = header.cloneNode(true);
header.parentNode.replaceChild(newHeader, header);
newHeader.addEventListener('click', () => {
const tldText = newHeader.textContent.trim().substring(1);
if (!selectedTLDs.has(tldText) && selectedTLDs.size > 0) {
selectedTLDs.add(tldText);
renderTLDButtons();
applyTLDFilter();
}
});
});
}
// Stats update
+540
View File
@@ -0,0 +1,540 @@
package main
import (
"encoding/json"
"fmt"
"html/template"
"io"
"net/http"
"os"
"strings"
"time"
)
// PDSAccount represents a Bluesky account on the PDS
type PDSAccount struct {
DID string `json:"did"`
Handle string `json:"handle"`
DisplayName string `json:"displayName"`
Description string `json:"description"`
Avatar string `json:"avatar"`
}
// handleAccountsDirectory serves the 1440.news accounts directory page
func (c *Crawler) handleAccountsDirectory(w http.ResponseWriter, r *http.Request) {
pdsHost := os.Getenv("PDS_HOST")
if pdsHost == "" {
pdsHost = "https://pds.1440.news"
}
// Fetch all repos from PDS
listReposURL := pdsHost + "/xrpc/com.atproto.sync.listRepos?limit=1000"
resp, err := http.Get(listReposURL)
if err != nil {
http.Error(w, "Failed to fetch accounts: "+err.Error(), http.StatusInternalServerError)
return
}
defer resp.Body.Close()
body, _ := io.ReadAll(resp.Body)
var reposResp struct {
Repos []struct {
DID string `json:"did"`
Head string `json:"head"`
Active bool `json:"active"`
} `json:"repos"`
}
if err := json.Unmarshal(body, &reposResp); err != nil {
http.Error(w, "Failed to parse repos: "+err.Error(), http.StatusInternalServerError)
return
}
// Fetch profile for each account using unauthenticated endpoints
var accounts []PDSAccount
client := &http.Client{Timeout: 5 * time.Second}
for _, repo := range reposResp.Repos {
if !repo.Active {
continue
}
// Get handle using describeRepo
describeURL := pdsHost + "/xrpc/com.atproto.repo.describeRepo?repo=" + repo.DID
describeResp, err := client.Get(describeURL)
if err != nil {
continue
}
describeBody, _ := io.ReadAll(describeResp.Body)
describeResp.Body.Close()
var repoInfo struct {
Handle string `json:"handle"`
DID string `json:"did"`
}
if err := json.Unmarshal(describeBody, &repoInfo); err != nil {
continue
}
// Skip the main 1440.news account (directory account itself)
if repoInfo.Handle == "1440.news" {
continue
}
account := PDSAccount{
DID: repoInfo.DID,
Handle: repoInfo.Handle,
}
// Get profile record for display name, description, avatar
recordURL := pdsHost + "/xrpc/com.atproto.repo.getRecord?repo=" + repo.DID + "&collection=app.bsky.actor.profile&rkey=self"
recordResp, err := client.Get(recordURL)
if err == nil {
recordBody, _ := io.ReadAll(recordResp.Body)
recordResp.Body.Close()
var record struct {
Value struct {
DisplayName string `json:"displayName"`
Description string `json:"description"`
Avatar struct {
Ref struct {
Link string `json:"$link"`
} `json:"ref"`
} `json:"avatar"`
} `json:"value"`
}
if json.Unmarshal(recordBody, &record) == nil {
account.DisplayName = record.Value.DisplayName
account.Description = record.Value.Description
if record.Value.Avatar.Ref.Link != "" {
account.Avatar = pdsHost + "/xrpc/com.atproto.sync.getBlob?did=" + repo.DID + "&cid=" + record.Value.Avatar.Ref.Link
}
}
}
accounts = append(accounts, account)
}
// Render the page
tmpl := template.Must(template.New("accounts").Parse(accountsDirectoryHTML))
w.Header().Set("Content-Type", "text/html; charset=utf-8")
tmpl.Execute(w, map[string]interface{}{
"Accounts": accounts,
"Count": len(accounts),
})
}
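// Note: the directory handler above issues two unauthenticated XRPC GETs per
// active repo (com.atproto.repo.describeRepo and com.atproto.repo.getRecord)
// and links avatars via com.atproto.sync.getBlob, so the cost of rendering the
// page grows linearly with the number of accounts on the PDS.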
func (c *Crawler) handleDashboard(w http.ResponseWriter, r *http.Request) {
stats, err := c.GetDashboardStats()
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
funcMap := template.FuncMap{
"pct": func(a, b int) float64 {
if b == 0 {
return 0
}
return float64(a) * 100.0 / float64(b)
},
"comma": func(n interface{}) string {
var val int
switch v := n.(type) {
case int:
val = v
case int32:
val = int(v)
case int64:
val = int(v)
default:
return "0"
}
if val < 0 {
return "-" + commaFormat(-val)
}
return commaFormat(val)
},
}
tmpl, err := template.New("dashboard").Funcs(funcMap).Parse(dashboardHTML)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
w.Header().Set("Content-Type", "text/html")
tmpl.Execute(w, stats)
}
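// commaFormat is defined elsewhere in the package (dashboard.go keeps the stats
// helpers after the split). Purely as an illustration -- the real helper may
// differ -- a digit-grouping function for non-negative ints could look like:
//
//	func commaFormatSketch(n int) string {
//		s := strconv.Itoa(n) // assumes strconv is imported where this lives
//		var b strings.Builder
//		for i, ch := range s {
//			if i > 0 && (len(s)-i)%3 == 0 {
//				b.WriteByte(',')
//			}
//			b.WriteRune(ch)
//		}
//		return b.String() // e.g. 1234567 -> "1,234,567"
//	}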
func (c *Crawler) handleAPIStats(w http.ResponseWriter, r *http.Request) {
stats, err := c.GetDashboardStats()
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(stats)
}
// handleRedirect handles short URL redirects for url.1440.news
func (c *Crawler) handleRedirect(w http.ResponseWriter, r *http.Request) {
code := strings.TrimPrefix(r.URL.Path, "/")
if code == "" {
http.NotFound(w, r)
return
}
// Look up the short URL
shortURL, err := c.GetShortURL(code)
if err != nil {
http.NotFound(w, r)
return
}
// Record the click asynchronously
go func() {
if err := c.RecordClick(code, r); err != nil {
fmt.Printf("Failed to record click for %s: %v\n", code, err)
}
}()
// Redirect to original URL
http.Redirect(w, r, shortURL.OriginalURL, http.StatusFound)
}
const accountsDirectoryHTML = `<!DOCTYPE html>
<html>
<head>
<title>1440.news - News Feed Directory</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
background: #0a0a0a;
color: #e0e0e0;
min-height: 100vh;
padding: 20px;
}
.container {
max-width: 800px;
margin: 0 auto;
}
header {
text-align: center;
margin-bottom: 40px;
padding-bottom: 20px;
border-bottom: 1px solid #333;
}
h1 {
font-size: 2.5em;
color: #fff;
margin-bottom: 10px;
}
.tagline {
color: #888;
font-size: 1.1em;
}
.count {
color: #0af;
margin-top: 10px;
}
.accounts {
display: flex;
flex-direction: column;
gap: 15px;
}
.account {
display: flex;
align-items: center;
gap: 15px;
padding: 15px;
background: #151515;
border: 1px solid #252525;
border-radius: 12px;
transition: all 0.2s;
}
.account:hover {
border-color: #0af;
background: #1a1a1a;
}
.avatar {
width: 60px;
height: 60px;
border-radius: 50%;
background: #333;
flex-shrink: 0;
object-fit: cover;
}
.avatar-placeholder {
width: 60px;
height: 60px;
border-radius: 50%;
background: linear-gradient(135deg, #0af, #08f);
flex-shrink: 0;
display: flex;
align-items: center;
justify-content: center;
font-size: 24px;
color: #fff;
font-weight: bold;
}
.info {
flex: 1;
min-width: 0;
}
.display-name {
font-size: 1.1em;
font-weight: 600;
color: #fff;
margin-bottom: 2px;
}
.handle {
color: #0af;
font-size: 0.9em;
margin-bottom: 5px;
}
.description {
color: #888;
font-size: 0.85em;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
.follow-btn {
padding: 8px 20px;
background: #0af;
color: #000;
border: none;
border-radius: 20px;
font-weight: 600;
cursor: pointer;
text-decoration: none;
flex-shrink: 0;
transition: all 0.2s;
}
.follow-btn:hover {
background: #0cf;
transform: scale(1.05);
}
footer {
margin-top: 40px;
padding-top: 20px;
border-top: 1px solid #333;
text-align: center;
color: #666;
font-size: 0.9em;
}
footer a {
color: #0af;
text-decoration: none;
}
.search-box {
margin-top: 20px;
}
.search-input {
width: 100%;
padding: 12px 20px;
font-size: 1em;
background: #151515;
border: 1px solid #333;
border-radius: 25px;
color: #fff;
outline: none;
transition: border-color 0.2s;
}
.search-input:focus {
border-color: #0af;
}
.search-input::placeholder {
color: #666;
}
.no-results {
text-align: center;
color: #666;
padding: 40px;
display: none;
}
.account.hidden {
display: none;
}
@media (max-width: 600px) {
.account {
flex-wrap: wrap;
}
.follow-btn {
width: 100%;
text-align: center;
margin-top: 10px;
}
}
</style>
</head>
<body>
<div class="container">
<header>
<h1>1440.news</h1>
<p class="tagline">Curated news feeds on Bluesky</p>
<p class="count"><span id="visibleCount">{{.Count}}</span> feeds available</p>
<div class="search-box">
<input type="text" class="search-input" id="searchInput" placeholder="Search feeds..." autocomplete="off">
</div>
</header>
<div class="accounts" id="accountsList">
{{range .Accounts}}
<div class="account">
{{if .Avatar}}
<img class="avatar" src="{{.Avatar}}" alt="{{.DisplayName}}">
{{else}}
<div class="avatar-placeholder">{{slice .Handle 0 1}}</div>
{{end}}
<div class="info">
<div class="display-name">{{if .DisplayName}}{{.DisplayName}}{{else}}{{.Handle}}{{end}}</div>
<div class="handle">@{{.Handle}}</div>
{{if .Description}}<div class="description">{{.Description}}</div>{{end}}
</div>
<a class="follow-btn" href="https://bsky.app/profile/{{.Handle}}" target="_blank">View</a>
</div>
{{else}}
<p style="text-align: center; color: #666;">No feeds available yet.</p>
{{end}}
</div>
<p class="no-results" id="noResults">No feeds match your search.</p>
<footer>
<p>Follow <a href="https://bsky.app/profile/1440.news" target="_blank">@1440.news</a> for updates</p>
</footer>
</div>
<script>
document.getElementById('searchInput').addEventListener('input', function() {
const query = this.value.toLowerCase().trim();
const accounts = document.querySelectorAll('.account');
let visibleCount = 0;
accounts.forEach(function(account) {
const text = account.textContent.toLowerCase();
if (query === '' || text.includes(query)) {
account.classList.remove('hidden');
visibleCount++;
} else {
account.classList.add('hidden');
}
});
document.getElementById('visibleCount').textContent = visibleCount;
document.getElementById('noResults').style.display = visibleCount === 0 && query !== '' ? 'block' : 'none';
});
</script>
</body>
</html>
`
const dashboardHTML = `<!DOCTYPE html>
<html>
<head>
<title>1440.news Feed Crawler</title>
<meta charset="utf-8">
<link rel="stylesheet" href="/static/dashboard.css">
<script src="/static/dashboard.js?v=37"></script>
</head>
<body>
<h1>1440.news Feed Crawler</h1>
<h2>Domain Status</h2>
<div class="grid">
<div class="card">
<div class="stat-value" id="totalDomains">{{comma .TotalDomains}}</div>
<div class="stat-label">Total</div>
</div>
<div class="card">
<div class="stat-value" id="holdDomains" style="color: #f90;">{{comma .HoldDomains}}</div>
<div class="stat-label">Hold</div>
</div>
<div class="card">
<div class="stat-value" id="passDomains" style="color: #0f0;">{{comma .PassDomains}}</div>
<div class="stat-label">Pass</div>
</div>
<div class="card">
<div class="stat-value" id="skipDomains" style="color: #f66;">{{comma .SkipDomains}}</div>
<div class="stat-label">Skip</div>
</div>
<div class="card">
<div class="stat-value" id="failDomains" style="color: #f00;">{{comma .FailDomains}}</div>
<div class="stat-label">Fail</div>
</div>
<div class="card">
<div class="stat-value" id="crawlRate">{{comma .CrawlRate}}</div>
<div class="stat-label">crawls/min</div>
</div>
<div class="card">
<div class="stat-value" id="checkRate">{{comma .CheckRate}}</div>
<div class="stat-label">checks/min</div>
</div>
</div>
<h2>Feeds Discovered</h2>
<div class="grid">
<div class="card">
<div class="stat-value" id="totalFeeds">{{comma .TotalFeeds}}</div>
<div class="stat-label">Total Feeds</div>
</div>
<div class="card">
<div class="stat-value" style="color: #f90" id="rssFeeds">{{comma .RSSFeeds}}</div>
<div class="stat-label">RSS Feeds</div>
</div>
<div class="card">
<div class="stat-value" style="color: #09f" id="atomFeeds">{{comma .AtomFeeds}}</div>
<div class="stat-label">Atom Feeds</div>
</div>
<div class="card">
<div class="stat-value" style="color: #666" id="unknownFeeds">{{comma .UnknownFeeds}}</div>
<div class="stat-label">Unknown Type</div>
</div>
</div>
<div class="card" id="inputCard">
<div id="commandButtons" style="margin-bottom: 10px;">
<button class="cmd-btn" data-cmd="/domains">domains</button>
<button class="cmd-btn" data-cmd="/feeds">feeds</button>
<button class="cmd-btn" id="tldToggleBtn">tlds</button>
<span style="color: #333; margin: 0 4px;">|</span>
<button class="cmd-btn" data-cmd="domains:hold">d:hold</button>
<button class="cmd-btn" data-cmd="domains:pass">d:pass</button>
<button class="cmd-btn" data-cmd="domains:skip">d:skip</button>
<button class="cmd-btn" data-cmd="domains:fail">d:fail</button>
<span style="color: #333; margin: 0 4px;">|</span>
<button class="cmd-btn" data-cmd="feeds:hold">f:hold</button>
<button class="cmd-btn" data-cmd="feeds:pass">f:pass</button>
<button class="cmd-btn" data-cmd="feeds:skip">f:skip</button>
<button class="cmd-btn" data-cmd="feeds:fail">f:fail</button>
</div>
<div id="langDropdown" style="display: none; margin-bottom: 10px; padding: 10px; background: #0a0a0a; border: 1px solid #333; border-radius: 4px; max-height: 200px; overflow-y: auto;">
<div id="langList"></div>
</div>
<div id="tldDropdown" style="display: none; margin-bottom: 10px; padding: 10px; background: #0a0a0a; border: 1px solid #333; border-radius: 4px;">
<div id="tldList" style="display: flex; flex-wrap: wrap; gap: 6px;"></div>
</div>
<input type="text" id="commandInput" value="/help"
style="width: 100%; padding: 12px; background: #0a0a0a; border: 1px solid #333; border-radius: 4px; color: #fff; font-size: 14px; font-family: monospace;">
</div>
<div class="card" id="outputCard">
<div id="breadcrumb" style="margin-bottom: 10px; display: none;"></div>
<div id="output"></div>
</div>
<div style="color: #333; font-size: 11px; margin-top: 10px;">v18</div>
<div class="updated" id="updatedAt">Last updated: {{.UpdatedAt.Format "2006-01-02 15:04:05"}}</div>
</body>
</html>`