Refactor large Go files into focused modules
Split dashboard.go (3,528 lines) into:
- routes.go: HTTP route registration
- api_domains.go: Domain API handlers
- api_feeds.go: Feed API handlers
- api_publish.go: Publishing API handlers
- api_search.go: Search API handlers
- templates.go: HTML templates
- dashboard.go: Stats functions only (235 lines)

Split publisher.go (1,502 lines) into:
- pds_auth.go: Authentication and account management
- pds_records.go: Record operations (upload, update, delete)
- handle.go: Handle derivation from feed URLs
- image.go: Image processing and favicon fetching
- publisher.go: Core types and PublishItem (439 lines)

Split feed.go (1,137 lines) into:
- item.go: Item struct and DB operations
- feed_check.go: Feed checking and processing
- feed.go: Feed struct and DB operations (565 lines)

Also includes domain import batch size increase (1k -> 100k).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
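For orientation, a minimal sketch of how the extracted routes.go might wire up the handlers added in this commit — the registerRoutes name, the ServeMux argument, and the URL paths are assumptions for illustration; only the receiver type and handler names come from the diffs below:

// Hypothetical excerpt of a routes.go produced by this split.
// Paths and the registerRoutes helper are illustrative, not taken from the repo.
func (c *Crawler) registerRoutes(mux *http.ServeMux) {
	mux.HandleFunc("/api/domains/all", c.handleAPIAllDomains)
	mux.HandleFunc("/api/domains", c.handleAPIDomains)
	mux.HandleFunc("/api/domain/feeds", c.handleAPIDomainFeeds)
	mux.HandleFunc("/api/feeds", c.handleAPIFeeds)
	mux.HandleFunc("/api/search", c.handleAPISearch)
}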
api_domains.go (new file, +764)
@@ -0,0 +1,764 @@
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"strings"

	"github.com/jackc/pgx/v5"
)

func (c *Crawler) handleAPIAllDomains(w http.ResponseWriter, r *http.Request) {
	offset := 0
	limit := 100
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 100 {
			limit = 100
		}
	}

	// Serve from cache (updated once per minute in background)
	c.statsMu.RLock()
	cached := c.cachedAllDomains
	c.statsMu.RUnlock()

	var domains []DomainStat
	if cached != nil && offset < len(cached) {
		end := offset + limit
		if end > len(cached) {
			end = len(cached)
		}
		domains = cached[offset:end]
	}
	if domains == nil {
		domains = []DomainStat{}
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}

// handleAPIDomains lists domains with optional status filter, including their feeds
func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
	status := r.URL.Query().Get("status")
	hasFeeds := r.URL.Query().Get("has_feeds") == "true"
	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}

	// First get domains
	var rows pgx.Rows
	var err error
	if hasFeeds {
		// Only domains with feeds
		if status != "" {
			rows, err = c.db.Query(`
				SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
				FROM domains d
				INNER JOIN (
					SELECT source_host, COUNT(*) as feed_count
					FROM feeds
					GROUP BY source_host
				) f ON d.host = f.source_host
				WHERE d.status = $1
				ORDER BY d.tld ASC, d.host ASC
				LIMIT $2 OFFSET $3
			`, status, limit, offset)
		} else {
			rows, err = c.db.Query(`
				SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
				FROM domains d
				INNER JOIN (
					SELECT source_host, COUNT(*) as feed_count
					FROM feeds
					GROUP BY source_host
				) f ON d.host = f.source_host
				ORDER BY d.tld ASC, d.host ASC
				LIMIT $1 OFFSET $2
			`, limit, offset)
		}
	} else if status != "" {
		rows, err = c.db.Query(`
			SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
			FROM domains d
			LEFT JOIN (
				SELECT source_host, COUNT(*) as feed_count
				FROM feeds
				GROUP BY source_host
			) f ON d.host = f.source_host
			WHERE d.status = $1
			ORDER BY d.tld ASC, d.host ASC
			LIMIT $2 OFFSET $3
		`, status, limit, offset)
	} else {
		rows, err = c.db.Query(`
			SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
			FROM domains d
			LEFT JOIN (
				SELECT source_host, COUNT(*) as feed_count
				FROM feeds
				GROUP BY source_host
			) f ON d.host = f.source_host
			ORDER BY d.tld ASC, d.host ASC
			LIMIT $1 OFFSET $2
		`, limit, offset)
	}
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type FeedInfo struct {
		URL string `json:"url"`
		Title string `json:"title,omitempty"`
		Type string `json:"type,omitempty"`
		Status string `json:"status,omitempty"`
		PublishStatus string `json:"publish_status,omitempty"`
	}

	type DomainInfo struct {
		Host string `json:"host"`
		TLD string `json:"tld"`
		Status string `json:"status"`
		LastError string `json:"last_error,omitempty"`
		FeedCount int `json:"feed_count"`
		Feeds []FeedInfo `json:"feeds,omitempty"`
	}

	var domains []DomainInfo
	var hosts []string
	for rows.Next() {
		var d DomainInfo
		var tld, lastError *string
		if err := rows.Scan(&d.Host, &tld, &d.Status, &lastError, &d.FeedCount); err != nil {
			continue
		}
		d.TLD = StringValue(tld)
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
		hosts = append(hosts, d.Host)
	}

	// Now get feeds for these domains
	if len(hosts) > 0 {
		feedRows, err := c.db.Query(`
			SELECT source_host, url, title, type, status, publish_status
			FROM feeds
			WHERE source_host = ANY($1)
			ORDER BY source_host, url
		`, hosts)
		if err == nil {
			defer feedRows.Close()
			feedsByHost := make(map[string][]FeedInfo)
			for feedRows.Next() {
				var host string
				var f FeedInfo
				var title, feedType, status, publishStatus *string
				if err := feedRows.Scan(&host, &f.URL, &title, &feedType, &status, &publishStatus); err != nil {
					continue
				}
				f.Title = StringValue(title)
				f.Type = StringValue(feedType)
				f.Status = StringValue(status)
				f.PublishStatus = StringValue(publishStatus)
				feedsByHost[host] = append(feedsByHost[host], f)
			}
			// Attach feeds to domains
			for i := range domains {
				if feeds, ok := feedsByHost[domains[i].Host]; ok {
					domains[i].Feeds = feeds
				}
			}
		}
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}

func (c *Crawler) handleAPIDomainsByStatus(w http.ResponseWriter, r *http.Request) {
	status := r.URL.Query().Get("status")
	if status == "" {
		http.Error(w, "status parameter required", http.StatusBadRequest)
		return
	}

	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}

	rows, err := c.db.Query(`
		SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
		FROM domains d
		LEFT JOIN (
			SELECT source_host, COUNT(*) as feed_count
			FROM feeds
			GROUP BY source_host
		) f ON d.host = f.source_host
		WHERE d.status = $1
		ORDER BY d.tld ASC, d.host ASC
		LIMIT $2 OFFSET $3
	`, status, limit, offset)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type DomainInfo struct {
		Host string `json:"host"`
		TLD string `json:"tld"`
		Status string `json:"status"`
		LastError string `json:"last_error,omitempty"`
		FeedCount int `json:"feed_count"`
	}

	var domains []DomainInfo
	for rows.Next() {
		var d DomainInfo
		var tld, lastError *string
		if err := rows.Scan(&d.Host, &tld, &d.Status, &lastError, &d.FeedCount); err != nil {
			continue
		}
		d.TLD = StringValue(tld)
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}

func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}

	rows, err := c.db.Query(`
		SELECT url, title, type, status, error_count, last_error, item_count, publish_status, language
		FROM feeds
		WHERE source_host = $1
		ORDER BY url ASC
		LIMIT $2 OFFSET $3
	`, host, limit, offset)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type FeedInfo struct {
		URL string `json:"url"`
		Title string `json:"title"`
		Type string `json:"type"`
		Status string `json:"status,omitempty"`
		ErrorCount int `json:"error_count,omitempty"`
		LastError string `json:"last_error,omitempty"`
		ItemCount int `json:"item_count,omitempty"`
		PublishStatus string `json:"publish_status,omitempty"`
		Language string `json:"language,omitempty"`
	}

	var feeds []FeedInfo
	for rows.Next() {
		var f FeedInfo
		var title, status, lastError, publishStatus, language *string
		var errorCount, itemCount *int
		if err := rows.Scan(&f.URL, &title, &f.Type, &status, &errorCount, &lastError, &itemCount, &publishStatus, &language); err != nil {
			continue
		}
		f.Title = StringValue(title)
		f.Status = StringValue(status)
		f.LastError = StringValue(lastError)
		f.PublishStatus = StringValue(publishStatus)
		f.Language = StringValue(language)
		if errorCount != nil {
			f.ErrorCount = *errorCount
		}
		if itemCount != nil {
			f.ItemCount = *itemCount
		}
		feeds = append(feeds, f)
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(feeds)
}

// handleAPISetDomainStatus sets the status for a domain
// status must be 'hold', 'pass', 'skip', or 'fail'
func (c *Crawler) handleAPISetDomainStatus(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	status := r.URL.Query().Get("status")

	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	if status != "hold" && status != "pass" && status != "skip" && status != "fail" {
		http.Error(w, "status must be 'hold', 'pass', 'skip', or 'fail'", http.StatusBadRequest)
		return
	}

	host = normalizeHost(host)

	// When setting to pass, clear any last_error
	var err error
	if status == "pass" {
		_, err = c.db.Exec(`
			UPDATE domains SET status = $1, last_error = NULL
			WHERE host = $2
		`, status, host)
	} else {
		_, err = c.db.Exec(`
			UPDATE domains SET status = $1
			WHERE host = $2
		`, status, host)
	}

	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]string{
		"host": host,
		"status": status,
	})
}

func (c *Crawler) handleAPIRevisitDomain(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	_, err := c.db.Exec(`
		UPDATE domains SET status = 'pass', last_checked_at = NULL, last_crawled_at = NULL, last_error = NULL
		WHERE host = $1
	`, host)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]string{"status": "queued", "host": host})
}

// handleAPIPriorityCrawl immediately crawls a domain (adds it if not exists)
func (c *Crawler) handleAPIPriorityCrawl(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	host = normalizeHost(host)

	// Add domain if it doesn't exist, or reset to pass for crawling
	_, err := c.db.Exec(`
		INSERT INTO domains (host, status, discovered_at, tld)
		VALUES ($1, 'pass', NOW(), $2)
		ON CONFLICT(host) DO UPDATE SET status = 'pass', last_checked_at = NULL, last_crawled_at = NULL, last_error = NULL
	`, host, getTLD(host))
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// Crawl synchronously
	fmt.Printf("Priority crawl: %s\n", host)
	feedsFound, crawlErr := c.crawlHost(host)

	errStr := ""
	if crawlErr != nil {
		errStr = crawlErr.Error()
	}

	// Mark as crawled
	c.markDomainCrawled(host, feedsFound, errStr)

	// Get the feeds we found
	feeds, _ := c.GetFeedsByHost(host)

	type FeedSummary struct {
		URL string `json:"url"`
		Title string `json:"title"`
		Type string `json:"type"`
		Category string `json:"category"`
		Status string `json:"status"`
	}
	var feedSummaries []FeedSummary
	for _, f := range feeds {
		feedSummaries = append(feedSummaries, FeedSummary{
			URL: f.URL,
			Title: f.Title,
			Type: f.Type,
			Category: f.Category,
			Status: f.Status,
		})
	}

	result := map[string]interface{}{
		"host": host,
		"feeds_found": feedsFound,
		"feeds": feedSummaries,
	}
	if crawlErr != nil {
		result["error"] = crawlErr.Error()
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(result)
}

// handleAPIFilter handles flexible filtering with stackable parameters
func (c *Crawler) handleAPIFilter(w http.ResponseWriter, r *http.Request) {
	tld := r.URL.Query().Get("tld")
	domain := r.URL.Query().Get("domain")
	feedStatus := r.URL.Query().Get("feedStatus")
	domainStatus := r.URL.Query().Get("domainStatus")
	languages := r.URL.Query().Get("languages") // comma-separated list
	show := r.URL.Query().Get("show") // "feeds" or "domains"
	sort := r.URL.Query().Get("sort") // "alpha" or "feeds"

	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}

	// Parse languages into slice
	var langList []string
	if languages != "" {
		for _, lang := range strings.Split(languages, ",") {
			lang = strings.TrimSpace(lang)
			if lang != "" {
				langList = append(langList, lang)
			}
		}
	}

	// Determine what to show based on filters
	if show == "" {
		if feedStatus != "" || domain != "" || len(langList) > 0 {
			show = "feeds"
		} else {
			show = "domains"
		}
	}

	if show == "feeds" {
		c.filterFeeds(w, tld, domain, feedStatus, langList, limit, offset)
	} else {
		c.filterDomains(w, tld, domainStatus, sort, limit, offset)
	}
}

func (c *Crawler) filterDomains(w http.ResponseWriter, tld, status, sort string, limit, offset int) {
	var args []interface{}
	argNum := 1
	query := `
		SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
		FROM domains d
		LEFT JOIN (
			SELECT source_host, COUNT(*) as feed_count
			FROM feeds
			GROUP BY source_host
		) f ON d.host = f.source_host
		WHERE 1=1`

	if tld != "" {
		query += fmt.Sprintf(" AND d.tld = $%d", argNum)
		args = append(args, tld)
		argNum++
	}
	if status != "" {
		query += fmt.Sprintf(" AND d.status = $%d", argNum)
		args = append(args, status)
		argNum++
	}

	// Sort by feed count descending or alphabetically
	if sort == "feeds" {
		query += fmt.Sprintf(" ORDER BY feed_count DESC, d.host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1)
	} else {
		query += fmt.Sprintf(" ORDER BY d.tld ASC, d.host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1)
	}
	args = append(args, limit, offset)

	rows, err := c.db.Query(query, args...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type DomainInfo struct {
		Host string `json:"host"`
		TLD string `json:"tld"`
		Status string `json:"status"`
		LastError string `json:"last_error,omitempty"`
		FeedCount int `json:"feed_count"`
	}

	var domains []DomainInfo
	for rows.Next() {
		var d DomainInfo
		var tldVal, lastError *string
		if err := rows.Scan(&d.Host, &tldVal, &d.Status, &lastError, &d.FeedCount); err != nil {
			continue
		}
		d.TLD = StringValue(tldVal)
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"type": "domains",
		"data": domains,
	})
}

func (c *Crawler) handleAPITLDDomains(w http.ResponseWriter, r *http.Request) {
	tld := r.URL.Query().Get("tld")
	if tld == "" {
		http.Error(w, "tld parameter required", http.StatusBadRequest)
		return
	}

	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}

	rows, err := c.db.Query(`
		SELECT d.host, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
		FROM domains d
		LEFT JOIN (
			SELECT source_host, COUNT(*) as feed_count
			FROM feeds
			GROUP BY source_host
		) f ON d.host = f.source_host
		WHERE d.tld = $1
		ORDER BY d.tld ASC, d.host ASC
		LIMIT $2 OFFSET $3
	`, tld, limit, offset)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type DomainInfo struct {
		Host string `json:"host"`
		Status string `json:"status"`
		LastError string `json:"last_error,omitempty"`
		FeedCount int `json:"feed_count"`
	}

	var domains []DomainInfo
	for rows.Next() {
		var d DomainInfo
		var lastError *string
		if err := rows.Scan(&d.Host, &d.Status, &lastError, &d.FeedCount); err != nil {
			continue
		}
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}

func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
	hasFeeds := r.URL.Query().Get("has_feeds") == "true"

	var rows pgx.Rows
	var err error

	if hasFeeds {
		// Only TLDs that have domains with feeds
		rows, err = c.db.Query(`
			SELECT DISTINCT d.tld, COUNT(DISTINCT d.host) as domain_count
			FROM domains d
			INNER JOIN feeds f ON d.host = f.source_host
			WHERE d.tld IS NOT NULL AND d.tld != ''
			GROUP BY d.tld
			ORDER BY d.tld ASC
		`)
	} else {
		// All TLDs
		rows, err = c.db.Query(`
			SELECT tld, COUNT(*) as domain_count
			FROM domains
			WHERE tld IS NOT NULL AND tld != ''
			GROUP BY tld
			ORDER BY tld ASC
		`)
	}
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type TLDInfo struct {
		TLD string `json:"tld"`
		DomainCount int `json:"domain_count"`
	}

	var tlds []TLDInfo
	for rows.Next() {
		var t TLDInfo
		if err := rows.Scan(&t.TLD, &t.DomainCount); err != nil {
			continue
		}
		tlds = append(tlds, t)
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(tlds)
}

func (c *Crawler) handleAPITLDStats(w http.ResponseWriter, r *http.Request) {
	tld := r.URL.Query().Get("tld")
	if tld == "" {
		http.Error(w, "tld parameter required", http.StatusBadRequest)
		return
	}

	var domainCount, feedCount int
	err := c.db.QueryRow(`SELECT COUNT(*) FROM domains WHERE tld = $1`, tld).Scan(&domainCount)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	err = c.db.QueryRow(`SELECT COUNT(*) FROM feeds WHERE tld = $1`, tld).Scan(&feedCount)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"tld": tld,
		"domain_count": domainCount,
		"feed_count": feedCount,
	})
}

// handleAPIDenyDomain skips a domain and all its feeds
func (c *Crawler) handleAPIDenyDomain(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	// Update domain status to skip
	_, err := c.db.Exec(`UPDATE domains SET status = 'skip' WHERE host = $1`, host)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// Skip all feeds from this domain
	feedsAffected, err := c.db.Exec(`UPDATE feeds SET publish_status = 'skip', status = 'dead' WHERE source_host = $1`, host)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"success": true,
		"host": host,
		"feeds_skipped": feedsAffected,
	})
}

// handleAPIUndenyDomain removes skip status from a domain
func (c *Crawler) handleAPIUndenyDomain(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	// Update domain status back to pass
	_, err := c.db.Exec(`UPDATE domains SET status = 'pass' WHERE host = $1 AND status = 'skip'`, host)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// Restore feeds to hold status and active
	feedsRestored, err := c.db.Exec(`UPDATE feeds SET publish_status = 'hold', status = 'active' WHERE source_host = $1 AND status = 'dead'`, host)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"success": true,
		"host": host,
		"feeds_restored": feedsRestored,
	})
}
api_feeds.go (new file, +504)
@@ -0,0 +1,504 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/jackc/pgx/v5"
|
||||
)
|
||||
|
||||
func (c *Crawler) handleAPIFeedInfo(w http.ResponseWriter, r *http.Request) {
|
||||
feedURL := r.URL.Query().Get("url")
|
||||
if feedURL == "" {
|
||||
http.Error(w, "url parameter required", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
type FeedDetails struct {
|
||||
URL string `json:"url"`
|
||||
Type string `json:"type,omitempty"`
|
||||
Title string `json:"title,omitempty"`
|
||||
Description string `json:"description,omitempty"`
|
||||
Language string `json:"language,omitempty"`
|
||||
SiteURL string `json:"siteUrl,omitempty"`
|
||||
DiscoveredAt string `json:"discoveredAt,omitempty"`
|
||||
LastCrawledAt string `json:"lastCrawledAt,omitempty"`
|
||||
LastBuildDate string `json:"lastBuildDate,omitempty"`
|
||||
TTLMinutes int `json:"ttlMinutes,omitempty"`
|
||||
UpdatePeriod string `json:"updatePeriod,omitempty"`
|
||||
UpdateFreq int `json:"updateFreq,omitempty"`
|
||||
Status string `json:"status,omitempty"`
|
||||
ErrorCount int `json:"errorCount,omitempty"`
|
||||
LastError string `json:"lastError,omitempty"`
|
||||
ItemCount int `json:"itemCount,omitempty"`
|
||||
AvgPostFreqHrs float64 `json:"avgPostFreqHrs,omitempty"`
|
||||
OldestItemDate string `json:"oldestItemDate,omitempty"`
|
||||
NewestItemDate string `json:"newestItemDate,omitempty"`
|
||||
PublishStatus string `json:"publishStatus,omitempty"`
|
||||
PublishAccount string `json:"publishAccount,omitempty"`
|
||||
}
|
||||
|
||||
var f FeedDetails
|
||||
var title, description, language, siteUrl *string
|
||||
var lastCrawledAt, lastBuildDate *time.Time
|
||||
var updatePeriod, status, lastError *string
|
||||
var oldestItemDate, newestItemDate *time.Time
|
||||
var ttlMinutes, updateFreq, errorCount, itemCount *int
|
||||
var avgPostFreqHrs *float64
|
||||
var discoveredAt time.Time
|
||||
var publishStatus, publishAccount *string
|
||||
|
||||
err := c.db.QueryRow(`
|
||||
SELECT url, type, title, description, language, site_url,
|
||||
discovered_at, last_crawled_at, last_build_date,
|
||||
ttl_minutes, update_period, update_freq,
|
||||
status, error_count, last_error,
|
||||
item_count, avg_post_freq_hrs, oldest_item_date, newest_item_date,
|
||||
publish_status, publish_account
|
||||
FROM feeds WHERE url = $1
|
||||
`, feedURL).Scan(
|
||||
&f.URL, &f.Type, &title, &description, &language, &siteUrl,
|
||||
&discoveredAt, &lastCrawledAt, &lastBuildDate,
|
||||
&ttlMinutes, &updatePeriod, &updateFreq,
|
||||
&status, &errorCount, &lastError,
|
||||
&itemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
|
||||
&publishStatus, &publishAccount,
|
||||
)
|
||||
|
||||
if err == pgx.ErrNoRows {
|
||||
http.Error(w, "feed not found", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
f.Title = StringValue(title)
|
||||
f.Description = StringValue(description)
|
||||
f.Language = StringValue(language)
|
||||
f.SiteURL = StringValue(siteUrl)
|
||||
f.DiscoveredAt = discoveredAt.Format(time.RFC3339)
|
||||
if lastCrawledAt != nil {
|
||||
f.LastCrawledAt = lastCrawledAt.Format(time.RFC3339)
|
||||
}
|
||||
if lastBuildDate != nil {
|
||||
f.LastBuildDate = lastBuildDate.Format(time.RFC3339)
|
||||
}
|
||||
if ttlMinutes != nil {
|
||||
f.TTLMinutes = *ttlMinutes
|
||||
}
|
||||
f.UpdatePeriod = StringValue(updatePeriod)
|
||||
if updateFreq != nil {
|
||||
f.UpdateFreq = *updateFreq
|
||||
}
|
||||
f.Status = StringValue(status)
|
||||
if errorCount != nil {
|
||||
f.ErrorCount = *errorCount
|
||||
}
|
||||
f.LastError = StringValue(lastError)
|
||||
if itemCount != nil {
|
||||
f.ItemCount = *itemCount
|
||||
}
|
||||
if avgPostFreqHrs != nil {
|
||||
f.AvgPostFreqHrs = *avgPostFreqHrs
|
||||
}
|
||||
if oldestItemDate != nil {
|
||||
f.OldestItemDate = oldestItemDate.Format(time.RFC3339)
|
||||
}
|
||||
if newestItemDate != nil {
|
||||
f.NewestItemDate = newestItemDate.Format(time.RFC3339)
|
||||
}
|
||||
f.PublishStatus = StringValue(publishStatus)
|
||||
f.PublishAccount = StringValue(publishAccount)
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(f)
|
||||
}
|
||||
|
||||
func (c *Crawler) handleAPIFeedItems(w http.ResponseWriter, r *http.Request) {
|
||||
feedURL := r.URL.Query().Get("url")
|
||||
if feedURL == "" {
|
||||
http.Error(w, "url parameter required", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
limit := 50
|
||||
if l := r.URL.Query().Get("limit"); l != "" {
|
||||
fmt.Sscanf(l, "%d", &limit)
|
||||
if limit > 100 {
|
||||
limit = 100
|
||||
}
|
||||
}
|
||||
|
||||
items, err := c.GetItemsByFeed(feedURL, limit)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
if items == nil {
|
||||
items = []*Item{}
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(items)
|
||||
}
|
||||
|
||||
func (c *Crawler) handleAPIFeedsByStatus(w http.ResponseWriter, r *http.Request) {
|
||||
status := r.URL.Query().Get("status")
|
||||
if status == "" {
|
||||
http.Error(w, "status parameter required", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
limit := 100
|
||||
offset := 0
|
||||
if l := r.URL.Query().Get("limit"); l != "" {
|
||||
fmt.Sscanf(l, "%d", &limit)
|
||||
if limit > 500 {
|
||||
limit = 500
|
||||
}
|
||||
}
|
||||
if o := r.URL.Query().Get("offset"); o != "" {
|
||||
fmt.Sscanf(o, "%d", &offset)
|
||||
}
|
||||
|
||||
rows, err := c.db.Query(`
|
||||
SELECT url, title, type, source_host, tld, status, error_count, last_error, item_count
|
||||
FROM feeds
|
||||
WHERE status = $1
|
||||
ORDER BY url ASC
|
||||
LIMIT $2 OFFSET $3
|
||||
`, status, limit, offset)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
type FeedInfo struct {
|
||||
URL string `json:"url"`
|
||||
Title string `json:"title,omitempty"`
|
||||
Type string `json:"type"`
|
||||
SourceHost string `json:"source_host"`
|
||||
TLD string `json:"tld"`
|
||||
Status string `json:"status"`
|
||||
ErrorCount int `json:"error_count,omitempty"`
|
||||
LastError string `json:"last_error,omitempty"`
|
||||
ItemCount int `json:"item_count,omitempty"`
|
||||
}
|
||||
|
||||
var feeds []FeedInfo
|
||||
for rows.Next() {
|
||||
var f FeedInfo
|
||||
var title, sourceHost, tld, lastError *string
|
||||
var errorCount, itemCount *int
|
||||
if err := rows.Scan(&f.URL, &title, &f.Type, &sourceHost, &tld, &f.Status, &errorCount, &lastError, &itemCount); err != nil {
|
||||
continue
|
||||
}
|
||||
f.Title = StringValue(title)
|
||||
f.SourceHost = StringValue(sourceHost)
|
||||
f.TLD = StringValue(tld)
|
||||
f.LastError = StringValue(lastError)
|
||||
if errorCount != nil {
|
||||
f.ErrorCount = *errorCount
|
||||
}
|
||||
if itemCount != nil {
|
||||
f.ItemCount = *itemCount
|
||||
}
|
||||
feeds = append(feeds, f)
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(feeds)
|
||||
}
|
||||
|
||||
// handleAPIFeeds lists feeds with optional publish_status filter
|
||||
func (c *Crawler) handleAPIFeeds(w http.ResponseWriter, r *http.Request) {
|
||||
publishStatus := r.URL.Query().Get("publish_status")
|
||||
limit := 100
|
||||
offset := 0
|
||||
if l := r.URL.Query().Get("limit"); l != "" {
|
||||
fmt.Sscanf(l, "%d", &limit)
|
||||
if limit > 500 {
|
||||
limit = 500
|
||||
}
|
||||
}
|
||||
if o := r.URL.Query().Get("offset"); o != "" {
|
||||
fmt.Sscanf(o, "%d", &offset)
|
||||
}
|
||||
|
||||
var rows pgx.Rows
|
||||
var err error
|
||||
if publishStatus != "" {
|
||||
rows, err = c.db.Query(`
|
||||
SELECT url, title, type, source_host, tld, status, error_count, last_error, item_count, publish_status, language
|
||||
FROM feeds
|
||||
WHERE publish_status = $1
|
||||
ORDER BY url ASC
|
||||
LIMIT $2 OFFSET $3
|
||||
`, publishStatus, limit, offset)
|
||||
} else {
|
||||
rows, err = c.db.Query(`
|
||||
SELECT url, title, type, source_host, tld, status, error_count, last_error, item_count, publish_status, language
|
||||
FROM feeds
|
||||
ORDER BY url ASC
|
||||
LIMIT $1 OFFSET $2
|
||||
`, limit, offset)
|
||||
}
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
type FeedInfo struct {
|
||||
URL string `json:"url"`
|
||||
Title string `json:"title,omitempty"`
|
||||
Type string `json:"type"`
|
||||
SourceHost string `json:"source_host"`
|
||||
TLD string `json:"tld"`
|
||||
Status string `json:"status"`
|
||||
ErrorCount int `json:"error_count,omitempty"`
|
||||
LastError string `json:"last_error,omitempty"`
|
||||
ItemCount int `json:"item_count,omitempty"`
|
||||
PublishStatus string `json:"publish_status,omitempty"`
|
||||
Language string `json:"language,omitempty"`
|
||||
}
|
||||
|
||||
var feeds []FeedInfo
|
||||
for rows.Next() {
|
||||
var f FeedInfo
|
||||
var title, sourceHost, tld, lastError, publishStatus, language *string
|
||||
var errorCount, itemCount *int
|
||||
if err := rows.Scan(&f.URL, &title, &f.Type, &sourceHost, &tld, &f.Status, &errorCount, &lastError, &itemCount, &publishStatus, &language); err != nil {
|
||||
continue
|
||||
}
|
||||
f.Title = StringValue(title)
|
||||
f.SourceHost = StringValue(sourceHost)
|
||||
f.TLD = StringValue(tld)
|
||||
f.LastError = StringValue(lastError)
|
||||
f.PublishStatus = StringValue(publishStatus)
|
||||
f.Language = StringValue(language)
|
||||
if errorCount != nil {
|
||||
f.ErrorCount = *errorCount
|
||||
}
|
||||
if itemCount != nil {
|
||||
f.ItemCount = *itemCount
|
||||
}
|
||||
feeds = append(feeds, f)
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(feeds)
|
||||
}
|
||||
|
||||
func (c *Crawler) filterFeeds(w http.ResponseWriter, tld, domain, status string, languages []string, limit, offset int) {
|
||||
var args []interface{}
|
||||
argNum := 1
|
||||
query := `
|
||||
SELECT url, title, type, category, source_host, tld, status, error_count, last_error, item_count, language
|
||||
FROM feeds
|
||||
WHERE 1=1`
|
||||
|
||||
if tld != "" {
|
||||
query += fmt.Sprintf(" AND tld = $%d", argNum)
|
||||
args = append(args, tld)
|
||||
argNum++
|
||||
}
|
||||
if domain != "" {
|
||||
query += fmt.Sprintf(" AND source_host = $%d", argNum)
|
||||
args = append(args, domain)
|
||||
argNum++
|
||||
}
|
||||
if status != "" {
|
||||
query += fmt.Sprintf(" AND status = $%d", argNum)
|
||||
args = append(args, status)
|
||||
argNum++
|
||||
}
|
||||
if len(languages) > 0 {
|
||||
// Build IN clause for languages, handling 'unknown' as empty string
|
||||
placeholders := make([]string, len(languages))
|
||||
for i, lang := range languages {
|
||||
placeholders[i] = fmt.Sprintf("$%d", argNum)
|
||||
if lang == "unknown" {
|
||||
args = append(args, "")
|
||||
} else {
|
||||
args = append(args, lang)
|
||||
}
|
||||
argNum++
|
||||
}
|
||||
query += fmt.Sprintf(" AND COALESCE(language, '') IN (%s)", strings.Join(placeholders, ","))
|
||||
}
|
||||
|
||||
query += fmt.Sprintf(" ORDER BY url ASC LIMIT $%d OFFSET $%d", argNum, argNum+1)
|
||||
args = append(args, limit, offset)
|
||||
|
||||
rows, err := c.db.Query(query, args...)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
type FeedInfo struct {
|
||||
URL string `json:"url"`
|
||||
Title string `json:"title,omitempty"`
|
||||
Type string `json:"type"`
|
||||
Category string `json:"category"`
|
||||
SourceHost string `json:"source_host"`
|
||||
TLD string `json:"tld"`
|
||||
Status string `json:"status"`
|
||||
ErrorCount int `json:"error_count,omitempty"`
|
||||
LastError string `json:"last_error,omitempty"`
|
||||
ItemCount int `json:"item_count,omitempty"`
|
||||
Language string `json:"language,omitempty"`
|
||||
}
|
||||
|
||||
var feeds []FeedInfo
|
||||
for rows.Next() {
|
||||
var f FeedInfo
|
||||
var title, category, sourceHost, tldVal, lastError, language *string
|
||||
var errorCount, itemCount *int
|
||||
if err := rows.Scan(&f.URL, &title, &f.Type, &category, &sourceHost, &tldVal, &f.Status, &errorCount, &lastError, &itemCount, &language); err != nil {
|
||||
continue
|
||||
}
|
||||
f.Title = StringValue(title)
|
||||
if category != nil && *category != "" {
|
||||
f.Category = *category
|
||||
} else {
|
||||
f.Category = "main"
|
||||
}
|
||||
f.SourceHost = StringValue(sourceHost)
|
||||
f.TLD = StringValue(tldVal)
|
||||
f.LastError = StringValue(lastError)
|
||||
if errorCount != nil {
|
||||
f.ErrorCount = *errorCount
|
||||
}
|
||||
if itemCount != nil {
|
||||
f.ItemCount = *itemCount
|
||||
}
|
||||
f.Language = StringValue(language)
|
||||
feeds = append(feeds, f)
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(map[string]interface{}{
|
||||
"type": "feeds",
|
||||
"data": feeds,
|
||||
})
|
||||
}
|
||||
|
||||
// handleAPICheckFeed immediately checks a feed and returns items
|
||||
func (c *Crawler) handleAPICheckFeed(w http.ResponseWriter, r *http.Request) {
|
||||
feedURL := r.URL.Query().Get("url")
|
||||
if feedURL == "" {
|
||||
http.Error(w, "url parameter required", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
force := r.URL.Query().Get("force") == "true"
|
||||
|
||||
feedURL = normalizeURL(feedURL)
|
||||
|
||||
// Get the feed
|
||||
feed, err := c.getFeed(feedURL)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
if feed == nil {
|
||||
http.Error(w, "feed not found", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
|
||||
// Clear cache headers if force is requested
|
||||
if force {
|
||||
feed.ETag = ""
|
||||
feed.LastModified = ""
|
||||
}
|
||||
|
||||
// Force check the feed
|
||||
fmt.Printf("Force check feed: %s (force=%v)\n", feedURL, force)
|
||||
changed, checkErr := c.CheckFeed(feed)
|
||||
|
||||
// Get updated feed info
|
||||
feed, _ = c.getFeed(feedURL)
|
||||
|
||||
// Get items
|
||||
items, _ := c.GetItemsByFeed(feedURL, 20)
|
||||
|
||||
type ItemSummary struct {
|
||||
Title string `json:"title"`
|
||||
Link string `json:"link"`
|
||||
PubDate string `json:"pub_date,omitempty"`
|
||||
Author string `json:"author,omitempty"`
|
||||
}
|
||||
var itemSummaries []ItemSummary
|
||||
for _, item := range items {
|
||||
is := ItemSummary{
|
||||
Title: item.Title,
|
||||
Link: item.Link,
|
||||
Author: item.Author,
|
||||
}
|
||||
if !item.PubDate.IsZero() {
|
||||
is.PubDate = item.PubDate.Format("2006-01-02 15:04")
|
||||
}
|
||||
itemSummaries = append(itemSummaries, is)
|
||||
}
|
||||
|
||||
result := map[string]interface{}{
|
||||
"url": feedURL,
|
||||
"title": feed.Title,
|
||||
"type": feed.Type,
|
||||
"category": feed.Category,
|
||||
"status": feed.Status,
|
||||
"changed": changed,
|
||||
"itemCount": feed.ItemCount,
|
||||
"items": itemSummaries,
|
||||
}
|
||||
if checkErr != nil {
|
||||
result["error"] = checkErr.Error()
|
||||
}
|
||||
if feed.LastError != "" {
|
||||
result["lastError"] = feed.LastError
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(result)
|
||||
}
|
||||
|
||||
// handleAPILanguages returns distinct languages with counts
|
||||
func (c *Crawler) handleAPILanguages(w http.ResponseWriter, r *http.Request) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT COALESCE(NULLIF(language, ''), 'unknown') as lang, COUNT(*) as cnt
|
||||
FROM feeds
|
||||
GROUP BY lang
|
||||
ORDER BY cnt DESC
|
||||
`)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
type LangInfo struct {
|
||||
Language string `json:"language"`
|
||||
Count int `json:"count"`
|
||||
}
|
||||
|
||||
var languages []LangInfo
|
||||
for rows.Next() {
|
||||
var l LangInfo
|
||||
if err := rows.Scan(&l.Language, &l.Count); err != nil {
|
||||
continue
|
||||
}
|
||||
languages = append(languages, l)
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(languages)
|
||||
}
|
||||
+1026 — file diff suppressed because it is too large
api_search.go (new file, +349)
@@ -0,0 +1,349 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/jackc/pgx/v5"
|
||||
)
|
||||
|
||||
// SearchResult represents a search result with feed and matching items
|
||||
type SearchResult struct {
|
||||
Feed SearchFeed `json:"feed"`
|
||||
Items []SearchItem `json:"items"`
|
||||
}
|
||||
|
||||
type SearchFeed struct {
|
||||
URL string `json:"url"`
|
||||
Type string `json:"type"`
|
||||
Category string `json:"category"`
|
||||
Title string `json:"title"`
|
||||
Description string `json:"description"`
|
||||
Language string `json:"language"`
|
||||
SiteURL string `json:"site_url"`
|
||||
DiscoveredAt string `json:"discovered_at"`
|
||||
LastCrawledAt string `json:"last_crawled_at"`
|
||||
NextCrawlAt string `json:"next_crawl_at"`
|
||||
LastBuildDate string `json:"last_build_date"`
|
||||
TTLMinutes int `json:"ttl_minutes"`
|
||||
UpdatePeriod string `json:"update_period"`
|
||||
UpdateFreq int `json:"update_freq"`
|
||||
Status string `json:"status"`
|
||||
ErrorCount int `json:"error_count"`
|
||||
LastError string `json:"last_error"`
|
||||
LastErrorAt string `json:"last_error_at"`
|
||||
SourceURL string `json:"source_url"`
|
||||
SourceHost string `json:"source_host"`
|
||||
TLD string `json:"tld"`
|
||||
ItemCount int `json:"item_count"`
|
||||
AvgPostFreqHrs float64 `json:"avg_post_freq_hrs"`
|
||||
OldestItemDate string `json:"oldest_item_date"`
|
||||
NewestItemDate string `json:"newest_item_date"`
|
||||
NoUpdate bool `json:"no_update"`
|
||||
}
|
||||
|
||||
type SearchItem struct {
|
||||
ID int64 `json:"id"`
|
||||
FeedURL string `json:"feed_url"`
|
||||
GUID string `json:"guid"`
|
||||
Title string `json:"title"`
|
||||
Link string `json:"link"`
|
||||
Description string `json:"description"`
|
||||
Content string `json:"content"`
|
||||
Author string `json:"author"`
|
||||
PubDate string `json:"pub_date"`
|
||||
DiscoveredAt string `json:"discovered_at"`
|
||||
UpdatedAt string `json:"updated_at"`
|
||||
}
|
||||
|
||||
func (c *Crawler) handleAPISearch(w http.ResponseWriter, r *http.Request) {
|
||||
query := r.URL.Query().Get("q")
|
||||
if query == "" {
|
||||
http.Error(w, "q parameter required", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
limit := 100
|
||||
if l := r.URL.Query().Get("limit"); l != "" {
|
||||
fmt.Sscanf(l, "%d", &limit)
|
||||
if limit > 500 {
|
||||
limit = 500
|
||||
}
|
||||
}
|
||||
|
||||
// Results map: feedURL -> SearchResult
|
||||
results := make(map[string]*SearchResult)
|
||||
|
||||
// Helper to scan feed row into SearchFeed
|
||||
scanFeed := func(rows pgx.Rows) (string, SearchFeed, bool) {
|
||||
var url string
|
||||
var feedType, category, title, description, language, siteUrl *string
|
||||
var discoveredAt time.Time
|
||||
var lastCrawledAt, nextCrawlAt, lastBuildDate *time.Time
|
||||
var ttlMinutes, updateFreq, errorCount, itemCount *int
|
||||
var updatePeriod, status, lastError *string
|
||||
var lastErrorAt *time.Time
|
||||
var sourceUrl, sourceHost, tld *string
|
||||
var avgPostFreqHrs *float64
|
||||
var oldestItemDate, newestItemDate *time.Time
|
||||
var noUpdate *bool
|
||||
|
||||
if err := rows.Scan(&url, &feedType, &category, &title, &description, &language, &siteUrl,
|
||||
&discoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
|
||||
&ttlMinutes, &updatePeriod, &updateFreq,
|
||||
&status, &errorCount, &lastError, &lastErrorAt,
|
||||
&sourceUrl, &sourceHost, &tld,
|
||||
&itemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate, &noUpdate); err != nil {
|
||||
return "", SearchFeed{}, false
|
||||
}
|
||||
cat := StringValue(category)
|
||||
if cat == "" {
|
||||
cat = "main"
|
||||
}
|
||||
sf := SearchFeed{
|
||||
URL: url,
|
||||
Type: StringValue(feedType),
|
||||
Category: cat,
|
||||
Title: StringValue(title),
|
||||
Description: StringValue(description),
|
||||
Language: StringValue(language),
|
||||
SiteURL: StringValue(siteUrl),
|
||||
DiscoveredAt: discoveredAt.Format(time.RFC3339),
|
||||
UpdatePeriod: StringValue(updatePeriod),
|
||||
Status: StringValue(status),
|
||||
LastError: StringValue(lastError),
|
||||
SourceURL: StringValue(sourceUrl),
|
||||
SourceHost: StringValue(sourceHost),
|
||||
TLD: StringValue(tld),
|
||||
}
|
||||
if lastCrawledAt != nil {
|
||||
sf.LastCrawledAt = lastCrawledAt.Format(time.RFC3339)
|
||||
}
|
||||
if nextCrawlAt != nil {
|
||||
sf.NextCrawlAt = nextCrawlAt.Format(time.RFC3339)
|
||||
}
|
||||
if lastBuildDate != nil {
|
||||
sf.LastBuildDate = lastBuildDate.Format(time.RFC3339)
|
||||
}
|
||||
if ttlMinutes != nil {
|
||||
sf.TTLMinutes = *ttlMinutes
|
||||
}
|
||||
if updateFreq != nil {
|
||||
sf.UpdateFreq = *updateFreq
|
||||
}
|
||||
if errorCount != nil {
|
||||
sf.ErrorCount = *errorCount
|
||||
}
|
||||
if lastErrorAt != nil {
|
||||
sf.LastErrorAt = lastErrorAt.Format(time.RFC3339)
|
||||
}
|
||||
if itemCount != nil {
|
||||
sf.ItemCount = *itemCount
|
||||
}
|
||||
if avgPostFreqHrs != nil {
|
||||
sf.AvgPostFreqHrs = *avgPostFreqHrs
|
||||
}
|
||||
if oldestItemDate != nil {
|
||||
sf.OldestItemDate = oldestItemDate.Format(time.RFC3339)
|
||||
}
|
||||
if newestItemDate != nil {
|
||||
sf.NewestItemDate = newestItemDate.Format(time.RFC3339)
|
||||
}
|
||||
if noUpdate != nil {
|
||||
sf.NoUpdate = *noUpdate
|
||||
}
|
||||
return url, sf, true
|
||||
}
|
||||
|
||||
// Search feeds by source_host (LIKE search for domain matching)
|
||||
hostRows, err := c.db.Query(`
|
||||
SELECT url, type, category, title, description, language, site_url,
|
||||
discovered_at, last_crawled_at, next_crawl_at, last_build_date,
|
||||
ttl_minutes, update_period, update_freq,
|
||||
status, error_count, last_error, last_error_at,
|
||||
source_url, source_host, tld,
|
||||
item_count, avg_post_freq_hrs, oldest_item_date, newest_item_date, no_update
|
||||
FROM feeds
|
||||
WHERE source_host ILIKE $1 OR url ILIKE $1
|
||||
LIMIT $2
|
||||
`, "%"+query+"%", limit)
|
||||
if err == nil {
|
||||
defer hostRows.Close()
|
||||
for hostRows.Next() {
|
||||
if url, feed, ok := scanFeed(hostRows); ok {
|
||||
if _, exists := results[url]; !exists {
|
||||
results[url] = &SearchResult{Feed: feed, Items: []SearchItem{}}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Search feeds via full-text search
|
||||
tsQuery := ToSearchQuery(query)
|
||||
feedRows, err := c.db.Query(`
|
||||
SELECT url, type, category, title, description, language, site_url,
|
||||
discovered_at, last_crawled_at, next_crawl_at, last_build_date,
|
||||
ttl_minutes, update_period, update_freq,
|
||||
status, error_count, last_error, last_error_at,
|
||||
source_url, source_host, tld,
|
||||
item_count, avg_post_freq_hrs, oldest_item_date, newest_item_date, no_update
|
||||
FROM feeds
|
||||
WHERE search_vector @@ to_tsquery('english', $1)
|
||||
LIMIT $2
|
||||
`, tsQuery, limit)
|
||||
if err == nil {
|
||||
defer feedRows.Close()
|
||||
for feedRows.Next() {
|
||||
if url, feed, ok := scanFeed(feedRows); ok {
|
||||
if _, exists := results[url]; !exists {
|
||||
results[url] = &SearchResult{Feed: feed, Items: []SearchItem{}}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Search items via full-text search
|
||||
itemRows, err := c.db.Query(`
|
||||
SELECT i.id, i.feed_url, i.guid, i.title, i.link, i.description, i.content, i.author, i.pub_date, i.discovered_at, i.updated_at
|
||||
FROM items i
|
||||
WHERE i.search_vector @@ to_tsquery('english', $1)
|
||||
ORDER BY i.pub_date DESC
|
||||
LIMIT $2
|
||||
`, tsQuery, limit)
|
||||
if err == nil {
|
||||
defer itemRows.Close()
|
||||
for itemRows.Next() {
|
||||
var id int64
|
||||
var feedUrl string
|
||||
var guid, title, link, description, content, author *string
|
||||
var pubDate, discoveredAt, updatedAt *time.Time
|
||||
if err := itemRows.Scan(&id, &feedUrl, &guid, &title, &link, &description, &content, &author, &pubDate, &discoveredAt, &updatedAt); err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
item := SearchItem{
|
||||
ID: id,
|
||||
FeedURL: feedUrl,
|
||||
GUID: StringValue(guid),
|
||||
Title: StringValue(title),
|
||||
Link: StringValue(link),
|
||||
Description: StringValue(description),
|
||||
Content: StringValue(content),
|
||||
Author: StringValue(author),
|
||||
}
|
||||
if pubDate != nil {
|
||||
item.PubDate = pubDate.Format(time.RFC3339)
|
||||
}
|
||||
if discoveredAt != nil {
|
||||
item.DiscoveredAt = discoveredAt.Format(time.RFC3339)
|
||||
}
|
||||
if updatedAt != nil {
|
||||
item.UpdatedAt = updatedAt.Format(time.RFC3339)
|
||||
}
|
||||
|
||||
// Add to existing result or create new one
|
||||
if result, exists := results[feedUrl]; exists {
|
||||
result.Items = append(result.Items, item)
|
||||
} else {
|
||||
// Fetch feed info for this item's feed
|
||||
var fType, fCategory, fTitle, fDesc, fLang, fSiteUrl *string
|
||||
var fDiscoveredAt time.Time
|
||||
var fLastCrawledAt, fNextCrawlAt, fLastBuildDate *time.Time
|
||||
var fTTLMinutes, fUpdateFreq, fErrorCount, fItemCount *int
|
||||
var fUpdatePeriod, fStatus, fLastError *string
|
||||
var fLastErrorAt *time.Time
|
||||
var fSourceUrl, fSourceHost, fTLD *string
|
||||
var fAvgPostFreqHrs *float64
|
||||
var fOldestItemDate, fNewestItemDate *time.Time
|
||||
var fNoUpdate *bool
|
||||
|
||||
c.db.QueryRow(`
|
||||
SELECT type, category, title, description, language, site_url,
|
||||
discovered_at, last_crawled_at, next_crawl_at, last_build_date,
|
||||
ttl_minutes, update_period, update_freq,
|
||||
status, error_count, last_error, last_error_at,
|
||||
source_url, source_host, tld,
|
||||
item_count, avg_post_freq_hrs, oldest_item_date, newest_item_date, no_update
|
||||
FROM feeds WHERE url = $1
|
||||
`, feedUrl).Scan(&fType, &fCategory, &fTitle, &fDesc, &fLang, &fSiteUrl,
|
||||
&fDiscoveredAt, &fLastCrawledAt, &fNextCrawlAt, &fLastBuildDate,
|
||||
&fTTLMinutes, &fUpdatePeriod, &fUpdateFreq,
|
||||
&fStatus, &fErrorCount, &fLastError, &fLastErrorAt,
|
||||
&fSourceUrl, &fSourceHost, &fTLD,
|
||||
&fItemCount, &fAvgPostFreqHrs, &fOldestItemDate, &fNewestItemDate, &fNoUpdate)
|
||||
|
||||
fCat := StringValue(fCategory)
|
||||
if fCat == "" {
|
||||
fCat = "main"
|
||||
}
|
||||
sf := SearchFeed{
|
||||
URL: feedUrl,
|
||||
Type: StringValue(fType),
|
||||
Category: fCat,
|
||||
Title: StringValue(fTitle),
|
||||
Description: StringValue(fDesc),
|
||||
Language: StringValue(fLang),
|
||||
SiteURL: StringValue(fSiteUrl),
|
||||
DiscoveredAt: fDiscoveredAt.Format(time.RFC3339),
|
||||
UpdatePeriod: StringValue(fUpdatePeriod),
|
||||
Status: StringValue(fStatus),
|
||||
LastError: StringValue(fLastError),
|
||||
SourceURL: StringValue(fSourceUrl),
|
||||
SourceHost: StringValue(fSourceHost),
|
||||
TLD: StringValue(fTLD),
|
||||
}
|
||||
if fLastCrawledAt != nil {
|
||||
sf.LastCrawledAt = fLastCrawledAt.Format(time.RFC3339)
|
||||
}
|
||||
if fNextCrawlAt != nil {
|
||||
sf.NextCrawlAt = fNextCrawlAt.Format(time.RFC3339)
|
||||
}
|
||||
if fLastBuildDate != nil {
|
||||
sf.LastBuildDate = fLastBuildDate.Format(time.RFC3339)
|
||||
}
|
||||
if fTTLMinutes != nil {
|
||||
sf.TTLMinutes = *fTTLMinutes
|
||||
}
|
||||
if fUpdateFreq != nil {
|
||||
sf.UpdateFreq = *fUpdateFreq
|
||||
}
|
||||
if fErrorCount != nil {
|
||||
sf.ErrorCount = *fErrorCount
|
||||
}
|
||||
if fLastErrorAt != nil {
|
||||
sf.LastErrorAt = fLastErrorAt.Format(time.RFC3339)
|
||||
}
|
||||
if fItemCount != nil {
|
||||
sf.ItemCount = *fItemCount
|
||||
}
|
||||
if fAvgPostFreqHrs != nil {
|
||||
sf.AvgPostFreqHrs = *fAvgPostFreqHrs
|
||||
}
|
||||
if fOldestItemDate != nil {
|
||||
sf.OldestItemDate = fOldestItemDate.Format(time.RFC3339)
|
||||
}
|
||||
if fNewestItemDate != nil {
|
||||
sf.NewestItemDate = fNewestItemDate.Format(time.RFC3339)
|
||||
}
|
||||
if fNoUpdate != nil {
|
||||
sf.NoUpdate = *fNoUpdate
|
||||
}
|
||||
results[feedUrl] = &SearchResult{
|
||||
Feed: sf,
|
||||
Items: []SearchItem{item},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Convert map to slice
|
||||
var resultList []SearchResult
|
||||
for _, r := range results {
|
||||
resultList = append(resultList, *r)
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(resultList)
|
||||
}
|
||||
dashboard.go (-3293) — file diff suppressed because it is too large
@@ -360,6 +360,8 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
		totalImported += int(imported)
		atomic.AddInt32(&c.domainsImported, int32(imported))

		fmt.Printf("Import batch %d: %d domains (total: %d)\n", batchCount, imported, totalImported)

		// Wait 1 second before the next batch
		time.Sleep(1 * time.Second)
	}
@@ -1,15 +1,9 @@
package main

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"sync/atomic"
	"time"

	"github.com/jackc/pgx/v5"
@@ -95,37 +89,6 @@ func classifyFeedByTitle(title string, currentCategory string) string {
	return currentCategory
}

// Enclosure represents a media attachment (audio, video, image)
type Enclosure struct {
	URL string `json:"url"`
	Type string `json:"type"` // MIME type (audio/mpeg, image/jpeg, etc.)
	Length int64 `json:"length"` // Size in bytes
}

// Item represents an individual entry/article from a feed
type Item struct {
	ID int64 `json:"id,omitempty"`
	FeedURL string `json:"feed_url"`
	GUID string `json:"guid,omitempty"`
	Title string `json:"title,omitempty"`
	Link string `json:"link,omitempty"`
	Description string `json:"description,omitempty"`
	Content string `json:"content,omitempty"`
	Author string `json:"author,omitempty"`
	PubDate time.Time `json:"pub_date,omitempty"`
	DiscoveredAt time.Time `json:"discovered_at"`
	UpdatedAt time.Time `json:"updated_at,omitempty"`

	// Media attachments
	Enclosure *Enclosure `json:"enclosure,omitempty"` // Primary enclosure (podcast audio, etc.)
	ImageURLs []string `json:"image_urls,omitempty"` // Image URLs extracted from content
	Tags []string `json:"tags,omitempty"` // Category/tag strings from feed

	// Publishing to PDS
	PublishedAt time.Time `json:"published_at,omitempty"`
	PublishedUri string `json:"published_uri,omitempty"`
}

// Feed represents a discovered RSS/Atom feed with metadata
type Feed struct {
	URL string `json:"url"`
@@ -537,505 +500,6 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
	return feeds, rows.Err()
}

// saveItem stores an item in PostgreSQL (upsert by feed_url + guid)
func (c *Crawler) saveItem(item *Item) error {
	// Serialize enclosure fields
	var enclosureUrl, enclosureType *string
	var enclosureLength *int64
	if item.Enclosure != nil {
		enclosureUrl = NullableString(item.Enclosure.URL)
		enclosureType = NullableString(item.Enclosure.Type)
		if item.Enclosure.Length > 0 {
			enclosureLength = &item.Enclosure.Length
		}
	}

	// Serialize imageUrls as JSON
	var imageUrlsJSON *string
	if len(item.ImageURLs) > 0 {
		if data, err := json.Marshal(item.ImageURLs); err == nil {
			s := string(data)
			imageUrlsJSON = &s
		}
	}

	// Serialize tags as JSON
	var tagsJSON *string
	if len(item.Tags) > 0 {
		if data, err := json.Marshal(item.Tags); err == nil {
			s := string(data)
			tagsJSON = &s
		}
	}

	_, err := c.db.Exec(`
		INSERT INTO items (feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
			enclosure_url, enclosure_type, enclosure_length, image_urls, tags)
		VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
		ON CONFLICT(feed_url, guid) DO UPDATE SET
			title = EXCLUDED.title,
			link = EXCLUDED.link,
			description = EXCLUDED.description,
			content = EXCLUDED.content,
			author = EXCLUDED.author,
			pub_date = EXCLUDED.pub_date,
			updated_at = EXCLUDED.updated_at,
			enclosure_url = EXCLUDED.enclosure_url,
			enclosure_type = EXCLUDED.enclosure_type,
			enclosure_length = EXCLUDED.enclosure_length,
			image_urls = EXCLUDED.image_urls,
			tags = EXCLUDED.tags
	`,
		item.FeedURL, item.GUID, NullableString(item.Title), NullableString(item.Link),
		NullableString(item.Description), NullableString(item.Content), NullableString(item.Author),
		NullableTime(item.PubDate), item.DiscoveredAt, NullableTime(item.UpdatedAt),
		enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON, tagsJSON,
	)
	return err
}

// saveItems stores multiple items efficiently
func (c *Crawler) saveItems(items []*Item) error {
	if len(items) == 0 {
		return nil
	}

	tx, err := c.db.Begin()
	if err != nil {
		return err
	}
	defer tx.Rollback(context.Background())

	for _, item := range items {
		if item == nil || item.GUID == "" {
			continue // Skip nil items or items without GUID
		}

		// Serialize enclosure fields
		var enclosureUrl, enclosureType *string
		var enclosureLength *int64
		if item.Enclosure != nil {
			enclosureUrl = NullableString(item.Enclosure.URL)
			enclosureType = NullableString(item.Enclosure.Type)
			if item.Enclosure.Length > 0 {
				enclosureLength = &item.Enclosure.Length
			}
		}

		// Serialize imageUrls as JSON
		var imageUrlsJSON *string
		if len(item.ImageURLs) > 0 {
			if data, err := json.Marshal(item.ImageURLs); err == nil {
				s := string(data)
				imageUrlsJSON = &s
			}
		}

		// Serialize tags as JSON
		var tagsJSON *string
		if len(item.Tags) > 0 {
			if data, err := json.Marshal(item.Tags); err == nil {
				s := string(data)
				tagsJSON = &s
			}
		}

		_, err := tx.Exec(context.Background(), `
			INSERT INTO items (feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
				enclosure_url, enclosure_type, enclosure_length, image_urls, tags)
			VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
			ON CONFLICT(feed_url, guid) DO UPDATE SET
				title = EXCLUDED.title,
|
||||
link = EXCLUDED.link,
|
||||
description = EXCLUDED.description,
|
||||
content = EXCLUDED.content,
|
||||
author = EXCLUDED.author,
|
||||
pub_date = EXCLUDED.pub_date,
|
||||
updated_at = EXCLUDED.updated_at,
|
||||
enclosure_url = EXCLUDED.enclosure_url,
|
||||
enclosure_type = EXCLUDED.enclosure_type,
|
||||
enclosure_length = EXCLUDED.enclosure_length,
|
||||
image_urls = EXCLUDED.image_urls,
|
||||
tags = EXCLUDED.tags
|
||||
`,
|
||||
item.FeedURL, item.GUID, NullableString(item.Title), NullableString(item.Link),
|
||||
NullableString(item.Description), NullableString(item.Content), NullableString(item.Author),
|
||||
NullableTime(item.PubDate), item.DiscoveredAt, NullableTime(item.UpdatedAt),
|
||||
enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON, tagsJSON,
|
||||
)
|
||||
if err != nil {
|
||||
continue // Skip failed items
|
||||
}
|
||||
}
|
||||
|
||||
return tx.Commit(context.Background())
|
||||
}
|
||||
|
||||
// GetItemsByFeed returns all items for a specific feed
|
||||
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
|
||||
enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
|
||||
published_at, published_uri
|
||||
FROM items
|
||||
WHERE feed_url = $1
|
||||
ORDER BY pub_date DESC
|
||||
LIMIT $2
|
||||
`, feedURL, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
return scanItems(rows)
|
||||
}
|
||||
|
||||
// SearchItems performs a full-text search on items
|
||||
func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
|
||||
tsquery := ToSearchQuery(query)
|
||||
rows, err := c.db.Query(`
|
||||
SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
|
||||
enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
|
||||
published_at, published_uri
|
||||
FROM items
|
||||
WHERE search_vector @@ to_tsquery('english', $1)
|
||||
ORDER BY ts_rank(search_vector, to_tsquery('english', $1)) DESC, pub_date DESC
|
||||
LIMIT $2
|
||||
`, tsquery, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
return scanItems(rows)
|
||||
}
|
||||
|
||||
// scanItems is a helper to scan multiple item rows
|
||||
func scanItems(rows pgx.Rows) ([]*Item, error) {
|
||||
var items []*Item
|
||||
for rows.Next() {
|
||||
item := &Item{}
|
||||
var guid, title, link, description, content, author *string
|
||||
var pubDate, updatedAt, publishedAt *time.Time
|
||||
var enclosureUrl, enclosureType *string
|
||||
var enclosureLength *int64
|
||||
var imageUrlsJSON, tagsJSON *string
|
||||
var publishedUri *string
|
||||
|
||||
if err := rows.Scan(
|
||||
&item.ID, &item.FeedURL, &guid, &title, &link,
|
||||
&description, &content, &author, &pubDate,
|
||||
&item.DiscoveredAt, &updatedAt,
|
||||
&enclosureUrl, &enclosureType, &enclosureLength, &imageUrlsJSON, &tagsJSON,
|
||||
&publishedAt, &publishedUri,
|
||||
); err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
item.GUID = StringValue(guid)
|
||||
item.Title = StringValue(title)
|
||||
item.Link = StringValue(link)
|
||||
item.Description = StringValue(description)
|
||||
item.Content = StringValue(content)
|
||||
item.Author = StringValue(author)
|
||||
item.PubDate = TimeValue(pubDate)
|
||||
item.UpdatedAt = TimeValue(updatedAt)
|
||||
|
||||
// Parse enclosure
|
||||
if enclosureUrl != nil && *enclosureUrl != "" {
|
||||
item.Enclosure = &Enclosure{
|
||||
URL: *enclosureUrl,
|
||||
Type: StringValue(enclosureType),
|
||||
}
|
||||
if enclosureLength != nil {
|
||||
item.Enclosure.Length = *enclosureLength
|
||||
}
|
||||
}
|
||||
|
||||
// Parse imageUrls JSON
|
||||
if imageUrlsJSON != nil && *imageUrlsJSON != "" {
|
||||
var urls []string
|
||||
if err := json.Unmarshal([]byte(*imageUrlsJSON), &urls); err == nil {
|
||||
item.ImageURLs = urls
|
||||
}
|
||||
}
|
||||
|
||||
// Parse tags JSON
|
||||
if tagsJSON != nil && *tagsJSON != "" {
|
||||
var tags []string
|
||||
if err := json.Unmarshal([]byte(*tagsJSON), &tags); err == nil {
|
||||
item.Tags = tags
|
||||
}
|
||||
}
|
||||
|
||||
item.PublishedAt = TimeValue(publishedAt)
|
||||
item.PublishedUri = StringValue(publishedUri)
|
||||
|
||||
items = append(items, item)
|
||||
}
|
||||
|
||||
return items, rows.Err()
|
||||
}
|
||||
|
||||
// CleanupOldItems removes items older than 12 months
|
||||
func (c *Crawler) CleanupOldItems() (int64, error) {
|
||||
cutoff := time.Now().AddDate(-1, 0, 0) // 12 months ago
|
||||
result, err := c.db.Exec(`
|
||||
DELETE FROM items WHERE pub_date < $1 AND pub_date IS NOT NULL
|
||||
`, cutoff)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// processFeed parses and stores a feed with full metadata
|
||||
func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
|
||||
// Fast path: check without lock
|
||||
if c.feedExists(feedURL) {
|
||||
return
|
||||
}
|
||||
|
||||
c.feedsMu.Lock()
|
||||
defer c.feedsMu.Unlock()
|
||||
|
||||
// Double-check after acquiring lock
|
||||
if c.feedExists(feedURL) {
|
||||
return
|
||||
}
|
||||
|
||||
feedType := c.detectFeedType(body)
|
||||
now := time.Now()
|
||||
|
||||
feed := &Feed{
|
||||
URL: normalizeURL(feedURL),
|
||||
Type: feedType,
|
||||
Category: classifyFeed(feedURL),
|
||||
DiscoveredAt: now,
|
||||
LastCrawledAt: now,
|
||||
Status: "active",
|
||||
SourceHost: sourceHost,
|
||||
TLD: getTLD(sourceHost),
|
||||
ETag: headers.Get("ETag"),
|
||||
LastModified: headers.Get("Last-Modified"),
|
||||
}
|
||||
|
||||
// Parse feed-specific metadata and items
|
||||
var items []*Item
|
||||
switch feedType {
|
||||
case "rss":
|
||||
items = c.parseRSSMetadata(body, feed)
|
||||
case "atom":
|
||||
items = c.parseAtomMetadata(body, feed)
|
||||
case "json":
|
||||
items = c.parseJSONFeedMetadata(body, feed)
|
||||
}
|
||||
|
||||
// Refine category based on parsed title (e.g., "Comments on:")
|
||||
feed.Category = classifyFeedByTitle(feed.Title, feed.Category)
|
||||
|
||||
// Calculate next crawl time
|
||||
feed.NextCrawlAt = c.calculateNextCrawl(feed)
|
||||
|
||||
if err := c.saveFeed(feed); err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
// Save items
|
||||
if len(items) > 0 {
|
||||
c.saveItems(items)
|
||||
}
|
||||
}
|
||||
|
||||
// addFeed adds a discovered feed URL (not yet fetched)
|
||||
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
|
||||
// Fast path: check without lock
|
||||
if c.feedExists(feedURL) {
|
||||
return
|
||||
}
|
||||
|
||||
c.feedsMu.Lock()
|
||||
defer c.feedsMu.Unlock()
|
||||
|
||||
// Double-check after acquiring lock
|
||||
if c.feedExists(feedURL) {
|
||||
return
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
normalizedURL := normalizeURL(feedURL)
|
||||
feed := &Feed{
|
||||
URL: normalizedURL,
|
||||
Type: feedType,
|
||||
Category: classifyFeed(feedURL),
|
||||
DiscoveredAt: now,
|
||||
Status: "active",
|
||||
SourceURL: normalizeURL(sourceURL),
|
||||
SourceHost: sourceHost,
|
||||
TLD: getTLD(sourceHost),
|
||||
NextCrawlAt: now, // Should be crawled immediately
|
||||
}
|
||||
|
||||
if err := c.saveFeed(feed); err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// CheckFeed performs a conditional request to check if a feed has been updated
|
||||
// Returns: changed (bool), error
|
||||
func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
|
||||
atomic.AddInt32(&c.feedsChecked, 1)
|
||||
|
||||
// Try different scheme/www combinations since we store URLs without scheme
|
||||
urlVariants := []string{
|
||||
"https://" + feed.URL,
|
||||
"http://" + feed.URL,
|
||||
"https://www." + feed.URL,
|
||||
"http://www." + feed.URL,
|
||||
}
|
||||
|
||||
var resp *http.Response
|
||||
var err error
|
||||
var successURL string
|
||||
|
||||
for _, tryURL := range urlVariants {
|
||||
req, reqErr := http.NewRequest("GET", tryURL, nil)
|
||||
if reqErr != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", c.UserAgent)
|
||||
|
||||
// Add conditional headers if we have them
|
||||
if feed.ETag != "" {
|
||||
req.Header.Set("If-None-Match", feed.ETag)
|
||||
}
|
||||
if feed.LastModified != "" {
|
||||
req.Header.Set("If-Modified-Since", feed.LastModified)
|
||||
}
|
||||
|
||||
resp, err = c.client.Do(req)
|
||||
if err == nil {
|
||||
successURL = tryURL
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
_ = successURL // May be used later for logging/debugging
|
||||
|
||||
// If no request succeeded, resp will be nil
|
||||
if resp == nil {
|
||||
if err == nil {
|
||||
err = fmt.Errorf("all URL variants failed")
|
||||
}
|
||||
now := time.Now()
|
||||
feed.LastCrawledAt = now
|
||||
feed.ErrorCount++
|
||||
feed.NoUpdate++
|
||||
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
|
||||
feed.LastError = err.Error()
|
||||
feed.LastErrorAt = now
|
||||
feed.Status = "error"
|
||||
// Auto-hold feeds that fail 100+ times
|
||||
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
|
||||
feed.PublishStatus = "hold"
|
||||
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
|
||||
}
|
||||
c.saveFeed(feed)
|
||||
return false, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
now := time.Now()
|
||||
feed.LastCrawledAt = now
|
||||
|
||||
// 304 Not Modified - feed hasn't changed
|
||||
if resp.StatusCode == http.StatusNotModified {
|
||||
feed.NoUpdate++
|
||||
// Adaptive backoff: 100s base + 100s per consecutive no-change
|
||||
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
|
||||
feed.ErrorCount = 0
|
||||
feed.LastError = ""
|
||||
feed.Status = "active"
|
||||
c.saveFeed(feed)
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// Non-200 response
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
feed.ErrorCount++
|
||||
feed.NoUpdate++
|
||||
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
|
||||
feed.LastError = resp.Status
|
||||
feed.LastErrorAt = now
|
||||
if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusGone {
|
||||
feed.Status = "dead"
|
||||
} else {
|
||||
feed.Status = "error"
|
||||
}
|
||||
// Auto-hold feeds that fail 100+ times
|
||||
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
|
||||
feed.PublishStatus = "hold"
|
||||
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
|
||||
}
|
||||
c.saveFeed(feed)
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// 200 OK - feed has new content
|
||||
bodyBytes, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
feed.ErrorCount++
|
||||
feed.NoUpdate++
|
||||
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
|
||||
feed.LastError = err.Error()
|
||||
feed.LastErrorAt = now
|
||||
feed.Status = "error"
|
||||
// Auto-hold feeds that fail 100+ times
|
||||
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
|
||||
feed.PublishStatus = "hold"
|
||||
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
|
||||
}
|
||||
c.saveFeed(feed)
|
||||
return false, err
|
||||
}
|
||||
|
||||
body := string(bodyBytes)
|
||||
|
||||
// Update cache headers
|
||||
feed.ETag = resp.Header.Get("ETag")
|
||||
feed.LastModified = resp.Header.Get("Last-Modified")
|
||||
|
||||
// Re-detect type and parse metadata
|
||||
feedType := c.detectFeedType(body)
|
||||
feed.Type = feedType
|
||||
|
||||
var items []*Item
|
||||
switch feedType {
|
||||
case "rss":
|
||||
items = c.parseRSSMetadata(body, feed)
|
||||
case "atom":
|
||||
items = c.parseAtomMetadata(body, feed)
|
||||
case "json":
|
||||
items = c.parseJSONFeedMetadata(body, feed)
|
||||
}
|
||||
|
||||
// Content changed - reset backoff
|
||||
feed.NoUpdate = 0
|
||||
feed.NextCrawlAt = now.Add(100 * time.Second)
|
||||
feed.ErrorCount = 0
|
||||
feed.LastError = ""
|
||||
feed.Status = "active"
|
||||
c.saveFeed(feed)
|
||||
|
||||
// Save items
|
||||
if len(items) > 0 {
|
||||
c.saveItems(items)
|
||||
}
|
||||
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// SetPublishStatus sets the publish status for a feed ('hold', 'pass', 'skip')
|
||||
// If status is 'pass', the account handle is also set (auto-derived if empty)
|
||||
func (c *Crawler) SetPublishStatus(feedURL, status, account string) error {
|
||||
@@ -1099,39 +563,3 @@ func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
|
||||
|
||||
return scanFeeds(rows)
|
||||
}
|
||||
|
||||
// GetUnpublishedItems returns items for a feed that haven't been published yet
|
||||
func (c *Crawler) GetUnpublishedItems(feedURL string, limit int) ([]*Item, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
|
||||
enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
|
||||
published_at, published_uri
|
||||
FROM items
|
||||
WHERE feed_url = $1 AND published_at IS NULL
|
||||
ORDER BY pub_date ASC
|
||||
LIMIT $2
|
||||
`, feedURL, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
return scanItems(rows)
|
||||
}
|
||||
|
||||
// MarkItemPublished marks an item as published with the given URI
|
||||
func (c *Crawler) MarkItemPublished(itemID int64, uri string) error {
|
||||
_, err := c.db.Exec(`
|
||||
UPDATE items SET published_at = NOW(), published_uri = $1 WHERE id = $2
|
||||
`, uri, itemID)
|
||||
return err
|
||||
}
|
||||
|
||||
// GetUnpublishedItemCount returns the count of unpublished items for a feed
|
||||
func (c *Crawler) GetUnpublishedItemCount(feedURL string) (int, error) {
|
||||
var count int
|
||||
err := c.db.QueryRow(`
|
||||
SELECT COUNT(*) FROM items WHERE feed_url = $1 AND published_at IS NULL
|
||||
`, feedURL).Scan(&count)
|
||||
return count, err
|
||||
}
|
||||
|
||||
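Note: the publishing helpers above (GetPublishCandidates, GetUnpublishedItems, MarkItemPublished) are meant to be driven by a loop roughly like the sketch below. This is illustrative only and assumes it sits in the same package; publishOne is a hypothetical stand-in for whatever actually creates the PDS record and returns its URI, which is not shown in this hunk.

// Illustrative sketch, not part of the diff: drive the publish workflow
// with the helpers above. publishOne is a hypothetical stand-in for the
// real PDS publish call.
func (c *Crawler) publishPending(publishOne func(*Item) (string, error)) error {
	feeds, err := c.GetPublishCandidates(10)
	if err != nil {
		return err
	}
	for _, feed := range feeds {
		items, err := c.GetUnpublishedItems(feed.URL, 25)
		if err != nil {
			continue
		}
		for _, item := range items {
			uri, err := publishOne(item)
			if err != nil {
				continue // leave unpublished so it is retried on the next pass
			}
			c.MarkItemPublished(item.ID, uri)
		}
	}
	return nil
}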
+256
@@ -0,0 +1,256 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
)
|
||||
|
||||
// processFeed parses and stores a feed with full metadata
|
||||
func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
|
||||
// Fast path: check without lock
|
||||
if c.feedExists(feedURL) {
|
||||
return
|
||||
}
|
||||
|
||||
c.feedsMu.Lock()
|
||||
defer c.feedsMu.Unlock()
|
||||
|
||||
// Double-check after acquiring lock
|
||||
if c.feedExists(feedURL) {
|
||||
return
|
||||
}
|
||||
|
||||
feedType := c.detectFeedType(body)
|
||||
now := time.Now()
|
||||
|
||||
feed := &Feed{
|
||||
URL: normalizeURL(feedURL),
|
||||
Type: feedType,
|
||||
Category: classifyFeed(feedURL),
|
||||
DiscoveredAt: now,
|
||||
LastCrawledAt: now,
|
||||
Status: "active",
|
||||
SourceHost: sourceHost,
|
||||
TLD: getTLD(sourceHost),
|
||||
ETag: headers.Get("ETag"),
|
||||
LastModified: headers.Get("Last-Modified"),
|
||||
}
|
||||
|
||||
// Parse feed-specific metadata and items
|
||||
var items []*Item
|
||||
switch feedType {
|
||||
case "rss":
|
||||
items = c.parseRSSMetadata(body, feed)
|
||||
case "atom":
|
||||
items = c.parseAtomMetadata(body, feed)
|
||||
case "json":
|
||||
items = c.parseJSONFeedMetadata(body, feed)
|
||||
}
|
||||
|
||||
// Refine category based on parsed title (e.g., "Comments on:")
|
||||
feed.Category = classifyFeedByTitle(feed.Title, feed.Category)
|
||||
|
||||
// Calculate next crawl time
|
||||
feed.NextCrawlAt = c.calculateNextCrawl(feed)
|
||||
|
||||
if err := c.saveFeed(feed); err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
// Save items
|
||||
if len(items) > 0 {
|
||||
c.saveItems(items)
|
||||
}
|
||||
}
|
||||
|
||||
// addFeed adds a discovered feed URL (not yet fetched)
|
||||
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
|
||||
// Fast path: check without lock
|
||||
if c.feedExists(feedURL) {
|
||||
return
|
||||
}
|
||||
|
||||
c.feedsMu.Lock()
|
||||
defer c.feedsMu.Unlock()
|
||||
|
||||
// Double-check after acquiring lock
|
||||
if c.feedExists(feedURL) {
|
||||
return
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
normalizedURL := normalizeURL(feedURL)
|
||||
feed := &Feed{
|
||||
URL: normalizedURL,
|
||||
Type: feedType,
|
||||
Category: classifyFeed(feedURL),
|
||||
DiscoveredAt: now,
|
||||
Status: "active",
|
||||
SourceURL: normalizeURL(sourceURL),
|
||||
SourceHost: sourceHost,
|
||||
TLD: getTLD(sourceHost),
|
||||
NextCrawlAt: now, // Should be crawled immediately
|
||||
}
|
||||
|
||||
if err := c.saveFeed(feed); err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// CheckFeed performs a conditional request to check if a feed has been updated
|
||||
// Returns: changed (bool), error
|
||||
func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
|
||||
atomic.AddInt32(&c.feedsChecked, 1)
|
||||
|
||||
// Try different scheme/www combinations since we store URLs without scheme
|
||||
urlVariants := []string{
|
||||
"https://" + feed.URL,
|
||||
"http://" + feed.URL,
|
||||
"https://www." + feed.URL,
|
||||
"http://www." + feed.URL,
|
||||
}
|
||||
|
||||
var resp *http.Response
|
||||
var err error
|
||||
var successURL string
|
||||
|
||||
for _, tryURL := range urlVariants {
|
||||
req, reqErr := http.NewRequest("GET", tryURL, nil)
|
||||
if reqErr != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", c.UserAgent)
|
||||
|
||||
// Add conditional headers if we have them
|
||||
if feed.ETag != "" {
|
||||
req.Header.Set("If-None-Match", feed.ETag)
|
||||
}
|
||||
if feed.LastModified != "" {
|
||||
req.Header.Set("If-Modified-Since", feed.LastModified)
|
||||
}
|
||||
|
||||
resp, err = c.client.Do(req)
|
||||
if err == nil {
|
||||
successURL = tryURL
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
_ = successURL // May be used later for logging/debugging
|
||||
|
||||
// If no request succeeded, resp will be nil
|
||||
if resp == nil {
|
||||
if err == nil {
|
||||
err = fmt.Errorf("all URL variants failed")
|
||||
}
|
||||
now := time.Now()
|
||||
feed.LastCrawledAt = now
|
||||
feed.ErrorCount++
|
||||
feed.NoUpdate++
|
||||
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
|
||||
feed.LastError = err.Error()
|
||||
feed.LastErrorAt = now
|
||||
feed.Status = "error"
|
||||
// Auto-hold feeds that fail 100+ times
|
||||
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
|
||||
feed.PublishStatus = "hold"
|
||||
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
|
||||
}
|
||||
c.saveFeed(feed)
|
||||
return false, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
now := time.Now()
|
||||
feed.LastCrawledAt = now
|
||||
|
||||
// 304 Not Modified - feed hasn't changed
|
||||
if resp.StatusCode == http.StatusNotModified {
|
||||
feed.NoUpdate++
|
||||
// Adaptive backoff: 100s base + 100s per consecutive no-change
|
||||
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
|
||||
feed.ErrorCount = 0
|
||||
feed.LastError = ""
|
||||
feed.Status = "active"
|
||||
c.saveFeed(feed)
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// Non-200 response
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
feed.ErrorCount++
|
||||
feed.NoUpdate++
|
||||
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
|
||||
feed.LastError = resp.Status
|
||||
feed.LastErrorAt = now
|
||||
if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusGone {
|
||||
feed.Status = "dead"
|
||||
} else {
|
||||
feed.Status = "error"
|
||||
}
|
||||
// Auto-hold feeds that fail 100+ times
|
||||
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
|
||||
feed.PublishStatus = "hold"
|
||||
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
|
||||
}
|
||||
c.saveFeed(feed)
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// 200 OK - feed has new content
|
||||
bodyBytes, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
feed.ErrorCount++
|
||||
feed.NoUpdate++
|
||||
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
|
||||
feed.LastError = err.Error()
|
||||
feed.LastErrorAt = now
|
||||
feed.Status = "error"
|
||||
// Auto-hold feeds that fail 100+ times
|
||||
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
|
||||
feed.PublishStatus = "hold"
|
||||
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
|
||||
}
|
||||
c.saveFeed(feed)
|
||||
return false, err
|
||||
}
|
||||
|
||||
body := string(bodyBytes)
|
||||
|
||||
// Update cache headers
|
||||
feed.ETag = resp.Header.Get("ETag")
|
||||
feed.LastModified = resp.Header.Get("Last-Modified")
|
||||
|
||||
// Re-detect type and parse metadata
|
||||
feedType := c.detectFeedType(body)
|
||||
feed.Type = feedType
|
||||
|
||||
var items []*Item
|
||||
switch feedType {
|
||||
case "rss":
|
||||
items = c.parseRSSMetadata(body, feed)
|
||||
case "atom":
|
||||
items = c.parseAtomMetadata(body, feed)
|
||||
case "json":
|
||||
items = c.parseJSONFeedMetadata(body, feed)
|
||||
}
|
||||
|
||||
// Content changed - reset backoff
|
||||
feed.NoUpdate = 0
|
||||
feed.NextCrawlAt = now.Add(100 * time.Second)
|
||||
feed.ErrorCount = 0
|
||||
feed.LastError = ""
|
||||
feed.Status = "active"
|
||||
c.saveFeed(feed)
|
||||
|
||||
// Save items
|
||||
if len(items) > 0 {
|
||||
c.saveItems(items)
|
||||
}
|
||||
|
||||
return true, nil
|
||||
}
|
||||
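Note: CheckFeed's adaptive backoff (100s base plus 100s per consecutive no-change or error) grows linearly with NoUpdate. A standalone sketch of the resulting schedule:

package main

import (
	"fmt"
	"time"
)

// Prints the delay CheckFeed applies for a few NoUpdate counts:
// 1 -> 3m20s, 5 -> 10m0s, 30 -> 51m40s, 100 -> 2h48m20s.
func main() {
	for _, noUpdate := range []int{1, 5, 30, 100} {
		delay := time.Duration(100+100*noUpdate) * time.Second
		fmt.Printf("NoUpdate=%d: next check in %s\n", noUpdate, delay)
	}
}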
@@ -0,0 +1,262 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// DeriveHandleFromFeed generates an AT Protocol handle from a feed URL
|
||||
// Format: {domain}-{category}.1440.news
|
||||
// AT Protocol allows up to 63 characters per label, but the PDS
|
||||
// restricts the first segment to 18 characters for local handles.
|
||||
// Examples:
|
||||
//
|
||||
// feeds.bbci.co.uk/news/technology/rss.xml → bbc-technology.1440.news
|
||||
// news.ycombinator.com/rss → ycombinator.1440.news
|
||||
func DeriveHandleFromFeed(feedURL string) string {
|
||||
const maxSubdomainLen = 18 // PDS limit for first segment
|
||||
|
||||
// Ensure we have a scheme for parsing
|
||||
if !strings.Contains(feedURL, "://") {
|
||||
feedURL = "https://" + feedURL
|
||||
}
|
||||
|
||||
u, err := url.Parse(feedURL)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
hostname := strings.ToLower(u.Hostname())
|
||||
path := strings.ToLower(u.Path)
|
||||
|
||||
// Remove common feed suffixes/extensions
|
||||
suffixesToRemove := []string{".xml", ".rss", ".atom", ".json", "/rss", "/feed", "/atom", "/index"}
|
||||
for _, suffix := range suffixesToRemove {
|
||||
path = strings.TrimSuffix(path, suffix)
|
||||
}
|
||||
|
||||
// Split path into segments and filter noise
|
||||
segments := strings.Split(strings.Trim(path, "/"), "/")
|
||||
skipPathWords := map[string]bool{
|
||||
"rss": true, "feed": true, "feeds": true, "atom": true,
|
||||
"xml": true, "default": true, "index": true, "services": true,
|
||||
"nyt": true,
|
||||
}
|
||||
|
||||
var pathParts []string
|
||||
for _, seg := range segments {
|
||||
seg = cleanHandleSegment(seg)
|
||||
if seg != "" && !skipPathWords[seg] {
|
||||
pathParts = append(pathParts, seg)
|
||||
}
|
||||
}
|
||||
|
||||
// Split hostname and extract the meaningful domain
|
||||
hostParts := strings.Split(hostname, ".")
|
||||
|
||||
// Two-part TLDs to handle specially
|
||||
twoPartTLDs := map[string]bool{
|
||||
"co.uk": true, "com.au": true, "co.nz": true, "co.jp": true,
|
||||
"com.br": true, "co.in": true, "org.uk": true, "ac.uk": true,
|
||||
}
|
||||
|
||||
// Check for two-part TLD
|
||||
if len(hostParts) >= 2 {
|
||||
possibleTwoPartTLD := hostParts[len(hostParts)-2] + "." + hostParts[len(hostParts)-1]
|
||||
if twoPartTLDs[possibleTwoPartTLD] {
|
||||
hostParts = hostParts[:len(hostParts)-2]
|
||||
} else {
|
||||
// Single TLD - remove it
|
||||
singleTLDs := map[string]bool{
|
||||
"com": true, "org": true, "net": true, "io": true,
|
||||
"edu": true, "gov": true, "uk": true, "de": true, "fr": true,
|
||||
}
|
||||
if singleTLDs[hostParts[len(hostParts)-1]] {
|
||||
hostParts = hostParts[:len(hostParts)-1]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Skip noise subdomains
|
||||
skipHostWords := map[string]bool{
|
||||
"www": true, "feeds": true, "rss": true, "feed": true,
|
||||
"api": true, "cdn": true, "static": true, "news": true,
|
||||
}
|
||||
|
||||
var meaningfulHostParts []string
|
||||
for _, part := range hostParts {
|
||||
if !skipHostWords[part] && part != "" {
|
||||
meaningfulHostParts = append(meaningfulHostParts, part)
|
||||
}
|
||||
}
|
||||
|
||||
// Get the main domain (e.g., "bbci", "ycombinator", "nytimes")
|
||||
var mainDomain string
|
||||
if len(meaningfulHostParts) > 0 {
|
||||
mainDomain = meaningfulHostParts[len(meaningfulHostParts)-1]
|
||||
} else if len(hostParts) > 0 {
|
||||
mainDomain = hostParts[len(hostParts)-1]
|
||||
}
|
||||
|
||||
// Special case: "bbci" should become "bbc"
|
||||
if mainDomain == "bbci" {
|
||||
mainDomain = "bbc"
|
||||
}
|
||||
|
||||
// Abbreviations for long category names to fit 18-char limit
|
||||
categoryAbbrevs := map[string]string{
|
||||
"science-and-environment": "sci-env",
|
||||
"entertainment-and-arts": "ent-arts",
|
||||
"science-environment": "sci-env",
|
||||
"entertainment-arts": "ent-arts",
|
||||
"technology": "tech",
|
||||
"business": "biz",
|
||||
"international": "intl",
|
||||
"environment": "env",
|
||||
"entertainment": "ent",
|
||||
"politics": "pol",
|
||||
}
|
||||
|
||||
// Build subdomain: domain + category (from path)
|
||||
var subdomain string
|
||||
if len(pathParts) > 0 {
|
||||
// Use last meaningful path part as category (e.g., "technology" from /news/technology/)
|
||||
category := pathParts[len(pathParts)-1]
|
||||
// Skip generic categories
|
||||
if category == "news" && len(pathParts) == 1 {
|
||||
subdomain = mainDomain
|
||||
} else {
|
||||
// Try to abbreviate if the full subdomain would be too long
|
||||
fullSubdomain := mainDomain + "-" + category
|
||||
if len(fullSubdomain) > maxSubdomainLen {
|
||||
if abbrev, ok := categoryAbbrevs[category]; ok {
|
||||
category = abbrev
|
||||
}
|
||||
}
|
||||
subdomain = mainDomain + "-" + category
|
||||
}
|
||||
} else {
|
||||
subdomain = mainDomain
|
||||
}
|
||||
|
||||
// If still too long, just use main hostname
|
||||
if len(subdomain) > maxSubdomainLen {
|
||||
subdomain = mainDomain
|
||||
}
|
||||
|
||||
// Final safety: truncate if still too long
|
||||
if len(subdomain) > maxSubdomainLen {
|
||||
subdomain = subdomain[:maxSubdomainLen]
|
||||
}
|
||||
|
||||
subdomain = strings.Trim(subdomain, "-")
|
||||
|
||||
// Collapse multiple hyphens
|
||||
for strings.Contains(subdomain, "--") {
|
||||
subdomain = strings.ReplaceAll(subdomain, "--", "-")
|
||||
}
|
||||
|
||||
return subdomain + ".1440.news"
|
||||
}
|
||||
|
||||
// cleanHandleSegment sanitizes a string for use in an AT Protocol handle segment
|
||||
// Handle segments must be alphanumeric with hyphens, no leading/trailing hyphens
|
||||
func cleanHandleSegment(s string) string {
|
||||
// Remove file extensions
|
||||
if idx := strings.LastIndex(s, "."); idx > 0 {
|
||||
s = s[:idx]
|
||||
}
|
||||
|
||||
// Convert to lowercase
|
||||
s = strings.ToLower(s)
|
||||
|
||||
// Strip common feed prefixes/suffixes from the segment itself
|
||||
// e.g., "showrss" → "show", "rssworld" → "world"
|
||||
feedAffixes := []string{"rss", "feed", "atom", "xml"}
|
||||
for _, affix := range feedAffixes {
|
||||
// Strip suffix (e.g., "showrss" → "show")
|
||||
if strings.HasSuffix(s, affix) && len(s) > len(affix) {
|
||||
s = strings.TrimSuffix(s, affix)
|
||||
break
|
||||
}
|
||||
// Strip prefix (e.g., "rssworld" → "world")
|
||||
if strings.HasPrefix(s, affix) && len(s) > len(affix) {
|
||||
s = strings.TrimPrefix(s, affix)
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Replace underscores and other separators with hyphens
|
||||
s = strings.ReplaceAll(s, "_", "-")
|
||||
s = strings.ReplaceAll(s, " ", "-")
|
||||
|
||||
// Remove any characters that aren't alphanumeric or hyphens
|
||||
reg := regexp.MustCompile(`[^a-z0-9-]`)
|
||||
s = reg.ReplaceAllString(s, "")
|
||||
|
||||
// Collapse multiple hyphens
|
||||
for strings.Contains(s, "--") {
|
||||
s = strings.ReplaceAll(s, "--", "-")
|
||||
}
|
||||
|
||||
// Trim leading/trailing hyphens
|
||||
s = strings.Trim(s, "-")
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
// SplitHandle extracts the path prefix and hostname from a derived handle
|
||||
// Example: show.news.ycombinator.com.1440.news → ("show", "news.ycombinator.com")
|
||||
func SplitHandle(handle string) (prefix string, hostname string) {
|
||||
// Remove .1440.news suffix
|
||||
handle = strings.TrimSuffix(handle, ".1440.news")
|
||||
|
||||
parts := strings.Split(handle, ".")
|
||||
|
||||
// Try to find where hostname starts by looking for valid hostname patterns
|
||||
if len(parts) >= 2 {
|
||||
for i := 0; i < len(parts)-1; i++ {
|
||||
remaining := strings.Join(parts[i:], ".")
|
||||
if looksLikeHostname(remaining) {
|
||||
if i > 0 {
|
||||
prefix = strings.Join(parts[:i], ".")
|
||||
}
|
||||
hostname = remaining
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: no prefix, entire thing is hostname
|
||||
hostname = handle
|
||||
return "", hostname
|
||||
}
|
||||
|
||||
func isLikelyTLDPart(s string) bool {
|
||||
tlds := map[string]bool{
|
||||
"com": true, "org": true, "net": true, "edu": true, "gov": true,
|
||||
"io": true, "co": true, "uk": true, "de": true, "fr": true,
|
||||
"jp": true, "au": true, "ca": true, "nl": true, "se": true,
|
||||
"news": true, "blog": true, "tech": true, "dev": true,
|
||||
}
|
||||
return tlds[s]
|
||||
}
|
||||
|
||||
func isTwoPartTLD(first, second string) bool {
|
||||
twoPartTLDs := map[string]bool{
|
||||
"co.uk": true, "com.au": true, "co.jp": true, "co.nz": true,
|
||||
"org.uk": true, "net.au": true, "com.br": true,
|
||||
}
|
||||
return twoPartTLDs[first+"."+second]
|
||||
}
|
||||
|
||||
func looksLikeHostname(s string) bool {
|
||||
// A hostname typically has at least one dot and ends with a TLD-like part
|
||||
parts := strings.Split(s, ".")
|
||||
if len(parts) < 2 {
|
||||
return false
|
||||
}
|
||||
lastPart := parts[len(parts)-1]
|
||||
return isLikelyTLDPart(lastPart)
|
||||
}
|
||||
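Note: the examples in the DeriveHandleFromFeed comment can be pinned down as a small table-driven test (a sketch for a _test.go file in this package; the expected values simply restate the documented examples):

package main

import "testing"

func TestDeriveHandleFromFeed(t *testing.T) {
	cases := map[string]string{
		"feeds.bbci.co.uk/news/technology/rss.xml": "bbc-technology.1440.news",
		"news.ycombinator.com/rss":                 "ycombinator.1440.news",
	}
	for feedURL, want := range cases {
		if got := DeriveHandleFromFeed(feedURL); got != want {
			t.Errorf("DeriveHandleFromFeed(%q) = %q, want %q", feedURL, got, want)
		}
	}
}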
@@ -0,0 +1,381 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"image"
|
||||
_ "image/gif"
|
||||
"image/jpeg"
|
||||
_ "image/png"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"go.deanishe.net/favicon"
|
||||
"golang.org/x/image/draw"
|
||||
_ "golang.org/x/image/webp"
|
||||
)
|
||||
|
||||
// ImageUploadResult contains the uploaded blob and image dimensions
|
||||
type ImageUploadResult struct {
|
||||
Blob *BlobRef
|
||||
Width int
|
||||
Height int
|
||||
}
|
||||
|
||||
// uploadImages fetches and uploads up to 4 images, returning BskyImage structs
|
||||
func (p *Publisher) uploadImages(session *PDSSession, imageURLs []string, altText string) []BskyImage {
|
||||
var images []BskyImage
|
||||
maxImages := 4
|
||||
if len(imageURLs) < maxImages {
|
||||
maxImages = len(imageURLs)
|
||||
}
|
||||
|
||||
for i := 0; i < maxImages; i++ {
|
||||
result := p.fetchAndUploadImageWithDimensions(session, imageURLs[i])
|
||||
if result != nil && result.Blob != nil {
|
||||
img := BskyImage{
|
||||
Alt: altText,
|
||||
Image: result.Blob,
|
||||
}
|
||||
if result.Width > 0 && result.Height > 0 {
|
||||
img.AspectRatio = &BskyAspectRatio{
|
||||
Width: result.Width,
|
||||
Height: result.Height,
|
||||
}
|
||||
}
|
||||
images = append(images, img)
|
||||
}
|
||||
}
|
||||
|
||||
return images
|
||||
}
|
||||
|
||||
// FetchFavicon tries to get a favicon URL for a site
|
||||
// Uses go.deanishe.net/favicon library which parses HTML, manifests, and checks common paths
|
||||
// Returns the best icon URL (falling back to Google's favicon service), or "" if the site URL is invalid
|
||||
func (p *Publisher) FetchFavicon(siteURL string) string {
|
||||
if siteURL == "" {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Ensure URL has scheme
|
||||
if !strings.Contains(siteURL, "://") {
|
||||
siteURL = "https://" + siteURL
|
||||
}
|
||||
u, err := url.Parse(siteURL)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Create finder with custom HTTP client
|
||||
// Note: Don't use IgnoreNoSize as it filters out valid favicon.ico files that don't have size metadata
|
||||
finder := favicon.New(
|
||||
favicon.WithClient(p.httpClient),
|
||||
)
|
||||
|
||||
// Find icons - library checks HTML <link> tags, manifests, OG images, common paths
|
||||
icons, err := finder.Find(siteURL)
|
||||
if err == nil && len(icons) > 0 {
|
||||
// Filter and score icons for avatar use
|
||||
// Prefer: square icons, reasonable size, PNG format, actual favicons over OG images
|
||||
var bestIcon string
|
||||
var bestScore int
|
||||
|
||||
for _, icon := range icons {
|
||||
// Skip tiny icons (likely tracking pixels)
|
||||
if icon.Width > 0 && icon.Width < 32 {
|
||||
continue
|
||||
}
|
||||
|
||||
// Skip Open Graph images (meant for link previews, usually wide banners)
|
||||
lowerURL := strings.ToLower(icon.URL)
|
||||
if strings.Contains(lowerURL, "og-image") || strings.Contains(lowerURL, "og_image") ||
|
||||
strings.Contains(lowerURL, "opengraph") || strings.Contains(lowerURL, "twitter") {
|
||||
continue
|
||||
}
|
||||
|
||||
// Skip wide images (aspect ratio > 1.5 means it's a banner, not an icon)
|
||||
if icon.Width > 0 && icon.Height > 0 {
|
||||
ratio := float64(icon.Width) / float64(icon.Height)
|
||||
if ratio > 1.5 || ratio < 0.67 {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Score the icon
|
||||
score := 0
|
||||
|
||||
// Prefer actual favicon paths
|
||||
if strings.Contains(lowerURL, "favicon") || strings.Contains(lowerURL, "icon") ||
|
||||
strings.Contains(lowerURL, "apple-touch") {
|
||||
score += 100
|
||||
}
|
||||
|
||||
// Prefer PNG over other formats
|
||||
if icon.MimeType == "image/png" {
|
||||
score += 50
|
||||
} else if icon.MimeType == "image/x-icon" || strings.HasSuffix(lowerURL, ".ico") {
|
||||
score += 40
|
||||
} else if icon.MimeType == "image/jpeg" {
|
||||
score += 10 // JPEG less preferred for icons
|
||||
}
|
||||
|
||||
// Prefer larger icons (but not too large)
|
||||
if icon.Width >= 64 && icon.Width <= 512 {
|
||||
score += 30
|
||||
} else if icon.Width > 0 {
|
||||
score += 10
|
||||
}
|
||||
|
||||
if score > bestScore {
|
||||
bestScore = score
|
||||
bestIcon = icon.URL
|
||||
}
|
||||
}
|
||||
|
||||
if bestIcon != "" {
|
||||
return bestIcon
|
||||
}
|
||||
|
||||
// Fall back to first non-OG icon
|
||||
for _, icon := range icons {
|
||||
lowerURL := strings.ToLower(icon.URL)
|
||||
if !strings.Contains(lowerURL, "og-image") && !strings.Contains(lowerURL, "og_image") {
|
||||
return icon.URL
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to Google's favicon service (reliable, returns PNG)
|
||||
return fmt.Sprintf("https://www.google.com/s2/favicons?domain=%s&sz=128", u.Host)
|
||||
}
|
||||
|
||||
func (p *Publisher) fetchAndUploadImage(session *PDSSession, imageURL string) *BlobRef {
|
||||
result := p.fetchAndUploadImageWithDimensions(session, imageURL)
|
||||
if result == nil {
|
||||
return nil
|
||||
}
|
||||
return result.Blob
|
||||
}
|
||||
|
||||
// upgradeImageURL attempts to get a larger version of known CDN image URLs
|
||||
func upgradeImageURL(imageURL string) string {
|
||||
// BBC images: /standard/240/ -> /standard/800/
|
||||
if strings.Contains(imageURL, "ichef.bbci.co.uk") {
|
||||
imageURL = strings.Replace(imageURL, "/standard/240/", "/standard/800/", 1)
|
||||
imageURL = strings.Replace(imageURL, "/standard/480/", "/standard/800/", 1)
|
||||
}
|
||||
return imageURL
|
||||
}
|
||||
|
||||
func (p *Publisher) fetchAndUploadImageWithDimensions(session *PDSSession, imageURL string) *ImageUploadResult {
|
||||
// Upgrade image URL to larger size if possible
|
||||
imageURL = upgradeImageURL(imageURL)
|
||||
|
||||
// Fetch the image
|
||||
resp, err := p.httpClient.Get(imageURL)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Check content type
|
||||
contentType := resp.Header.Get("Content-Type")
|
||||
if contentType == "" {
|
||||
// Try to guess from URL
|
||||
if strings.HasSuffix(strings.ToLower(imageURL), ".png") {
|
||||
contentType = "image/png"
|
||||
} else if strings.HasSuffix(strings.ToLower(imageURL), ".gif") {
|
||||
contentType = "image/gif"
|
||||
} else if strings.HasSuffix(strings.ToLower(imageURL), ".webp") {
|
||||
contentType = "image/webp"
|
||||
} else {
|
||||
contentType = "image/jpeg" // Default
|
||||
}
|
||||
}
|
||||
|
||||
// Only accept image types
|
||||
if !strings.HasPrefix(contentType, "image/") {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Read image data (limit to 2MB to allow for resize headroom)
|
||||
data, err := io.ReadAll(io.LimitReader(resp.Body, 2*1024*1024))
|
||||
if err != nil || len(data) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Decode image to get dimensions
|
||||
imgConfig, _, err := image.DecodeConfig(bytes.NewReader(data))
|
||||
width, height := 1, 1 // Default if decode fails
|
||||
if err == nil {
|
||||
width, height = imgConfig.Width, imgConfig.Height
|
||||
}
|
||||
|
||||
// Bluesky blob limit is ~976KB, use 900KB as safe threshold
|
||||
const maxBlobSize = 900 * 1024
|
||||
|
||||
// If image is too large, resize it
|
||||
if len(data) > maxBlobSize {
|
||||
// Decode the full image for resizing
|
||||
img, _, err := image.Decode(bytes.NewReader(data))
|
||||
if err != nil {
|
||||
return nil // Can't decode, can't resize
|
||||
}
|
||||
|
||||
// Scale down iteratively until under limit
|
||||
scaleFactor := 0.9 // Start with 90% and iterate if needed
|
||||
|
||||
for attempt := 0; attempt < 5; attempt++ {
|
||||
newWidth := int(float64(width) * scaleFactor)
|
||||
newHeight := int(float64(height) * scaleFactor)
|
||||
|
||||
// Minimum dimensions
|
||||
if newWidth < 100 {
|
||||
newWidth = 100
|
||||
}
|
||||
if newHeight < 100 {
|
||||
newHeight = 100
|
||||
}
|
||||
|
||||
// Create resized image
|
||||
resized := image.NewRGBA(image.Rect(0, 0, newWidth, newHeight))
|
||||
draw.CatmullRom.Scale(resized, resized.Bounds(), img, img.Bounds(), draw.Over, nil)
|
||||
|
||||
// Encode as JPEG
|
||||
var buf bytes.Buffer
|
||||
if err := jpeg.Encode(&buf, resized, &jpeg.Options{Quality: 85}); err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
if buf.Len() <= maxBlobSize {
|
||||
data = buf.Bytes()
|
||||
width = newWidth
|
||||
height = newHeight
|
||||
contentType = "image/jpeg"
|
||||
break
|
||||
}
|
||||
|
||||
// Still too large, reduce scale further
|
||||
scaleFactor *= 0.8
|
||||
}
|
||||
|
||||
// If still too large after 5 attempts, give up
|
||||
if len(data) > maxBlobSize {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// Upload to PDS
|
||||
blob, err := p.UploadBlob(session, data, contentType)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return &ImageUploadResult{
|
||||
Blob: blob,
|
||||
Width: width,
|
||||
Height: height,
|
||||
}
|
||||
}
|
||||
|
||||
// FetchFaviconBytes downloads a favicon/icon for a site URL
|
||||
// Uses go.deanishe.net/favicon library to find the best icon
|
||||
// Returns the icon bytes and MIME type, or an error if nothing could be fetched
|
||||
func FetchFaviconBytes(siteURL string) ([]byte, string, error) {
|
||||
if !strings.HasPrefix(siteURL, "http") {
|
||||
siteURL = "https://" + siteURL
|
||||
}
|
||||
|
||||
u, err := url.Parse(siteURL)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
|
||||
client := &http.Client{Timeout: 10 * time.Second}
|
||||
|
||||
// Use favicon library to find icons
|
||||
finder := favicon.New(
|
||||
favicon.WithClient(client),
|
||||
favicon.IgnoreNoSize,
|
||||
)
|
||||
|
||||
icons, err := finder.Find(siteURL)
|
||||
if err != nil || len(icons) == 0 {
|
||||
// Fallback to Google's favicon service
|
||||
googleURL := fmt.Sprintf("https://www.google.com/s2/favicons?domain=%s&sz=128", u.Host)
|
||||
return fetchIconBytes(client, googleURL)
|
||||
}
|
||||
|
||||
// Try icons in order (sorted by size, largest first)
|
||||
// Prefer PNG/JPEG over ICO
|
||||
var iconURLs []string
|
||||
for _, icon := range icons {
|
||||
if icon.Width > 0 && icon.Width < 32 {
|
||||
continue // Skip tiny icons
|
||||
}
|
||||
if icon.MimeType == "image/png" || icon.MimeType == "image/jpeg" {
|
||||
iconURLs = append([]string{icon.URL}, iconURLs...) // Prepend PNG/JPEG
|
||||
} else {
|
||||
iconURLs = append(iconURLs, icon.URL)
|
||||
}
|
||||
}
|
||||
|
||||
// If no good icons, use all of them
|
||||
if len(iconURLs) == 0 {
|
||||
for _, icon := range icons {
|
||||
iconURLs = append(iconURLs, icon.URL)
|
||||
}
|
||||
}
|
||||
|
||||
// Try to download each icon
|
||||
for _, iconURL := range iconURLs {
|
||||
data, mimeType, err := fetchIconBytes(client, iconURL)
|
||||
if err == nil && len(data) > 0 {
|
||||
return data, mimeType, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Final fallback to Google
|
||||
googleURL := fmt.Sprintf("https://www.google.com/s2/favicons?domain=%s&sz=128", u.Host)
|
||||
return fetchIconBytes(client, googleURL)
|
||||
}
|
||||
|
||||
// fetchIconBytes downloads an icon and returns its bytes and mime type
|
||||
func fetchIconBytes(client *http.Client, iconURL string) ([]byte, string, error) {
|
||||
resp, err := client.Get(iconURL)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
data, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
|
||||
// Determine mime type
|
||||
contentType := resp.Header.Get("Content-Type")
|
||||
if contentType == "" {
|
||||
if strings.HasSuffix(iconURL, ".png") {
|
||||
contentType = "image/png"
|
||||
} else if strings.HasSuffix(iconURL, ".ico") {
|
||||
contentType = "image/x-icon"
|
||||
} else {
|
||||
contentType = "image/png"
|
||||
}
|
||||
}
|
||||
|
||||
return data, contentType, nil
|
||||
}
|
||||
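Note: the resize loop in fetchAndUploadImageWithDimensions shrinks oversized images geometrically, starting at 90% of the original dimensions and multiplying the scale by 0.8 after each attempt that still encodes above the ~900KB limit. A standalone sketch of the scale sequence:

package main

import "fmt"

// Prints the scale factor used on each of the five resize attempts:
// 0.900, 0.720, 0.576, 0.461, 0.369 (relative to the original dimensions).
func main() {
	scale := 0.9
	for attempt := 0; attempt < 5; attempt++ {
		fmt.Printf("attempt %d: scale %.3f\n", attempt, scale)
		scale *= 0.8
	}
}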
@@ -0,0 +1,328 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"time"
|
||||
|
||||
"github.com/jackc/pgx/v5"
|
||||
)
|
||||
|
||||
// Enclosure represents a media attachment (audio, video, image)
|
||||
type Enclosure struct {
|
||||
URL string `json:"url"`
|
||||
Type string `json:"type"` // MIME type (audio/mpeg, image/jpeg, etc.)
|
||||
Length int64 `json:"length"` // Size in bytes
|
||||
}
|
||||
|
||||
// Item represents an individual entry/article from a feed
|
||||
type Item struct {
|
||||
ID int64 `json:"id,omitempty"`
|
||||
FeedURL string `json:"feed_url"`
|
||||
GUID string `json:"guid,omitempty"`
|
||||
Title string `json:"title,omitempty"`
|
||||
Link string `json:"link,omitempty"`
|
||||
Description string `json:"description,omitempty"`
|
||||
Content string `json:"content,omitempty"`
|
||||
Author string `json:"author,omitempty"`
|
||||
PubDate time.Time `json:"pub_date,omitempty"`
|
||||
DiscoveredAt time.Time `json:"discovered_at"`
|
||||
UpdatedAt time.Time `json:"updated_at,omitempty"`
|
||||
|
||||
// Media attachments
|
||||
Enclosure *Enclosure `json:"enclosure,omitempty"` // Primary enclosure (podcast audio, etc.)
|
||||
ImageURLs []string `json:"image_urls,omitempty"` // Image URLs extracted from content
|
||||
Tags []string `json:"tags,omitempty"` // Category/tag strings from feed
|
||||
|
||||
// Publishing to PDS
|
||||
PublishedAt time.Time `json:"published_at,omitempty"`
|
||||
PublishedUri string `json:"published_uri,omitempty"`
|
||||
}
|
||||
|
||||
// saveItem stores an item in PostgreSQL (upsert by feed_url + guid)
|
||||
func (c *Crawler) saveItem(item *Item) error {
|
||||
// Serialize enclosure fields
|
||||
var enclosureUrl, enclosureType *string
|
||||
var enclosureLength *int64
|
||||
if item.Enclosure != nil {
|
||||
enclosureUrl = NullableString(item.Enclosure.URL)
|
||||
enclosureType = NullableString(item.Enclosure.Type)
|
||||
if item.Enclosure.Length > 0 {
|
||||
enclosureLength = &item.Enclosure.Length
|
||||
}
|
||||
}
|
||||
|
||||
// Serialize imageUrls as JSON
|
||||
var imageUrlsJSON *string
|
||||
if len(item.ImageURLs) > 0 {
|
||||
if data, err := json.Marshal(item.ImageURLs); err == nil {
|
||||
s := string(data)
|
||||
imageUrlsJSON = &s
|
||||
}
|
||||
}
|
||||
|
||||
// Serialize tags as JSON
|
||||
var tagsJSON *string
|
||||
if len(item.Tags) > 0 {
|
||||
if data, err := json.Marshal(item.Tags); err == nil {
|
||||
s := string(data)
|
||||
tagsJSON = &s
|
||||
}
|
||||
}
|
||||
|
||||
_, err := c.db.Exec(`
|
||||
INSERT INTO items (feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
|
||||
enclosure_url, enclosure_type, enclosure_length, image_urls, tags)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
|
||||
ON CONFLICT(feed_url, guid) DO UPDATE SET
|
||||
title = EXCLUDED.title,
|
||||
link = EXCLUDED.link,
|
||||
description = EXCLUDED.description,
|
||||
content = EXCLUDED.content,
|
||||
author = EXCLUDED.author,
|
||||
pub_date = EXCLUDED.pub_date,
|
||||
updated_at = EXCLUDED.updated_at,
|
||||
enclosure_url = EXCLUDED.enclosure_url,
|
||||
enclosure_type = EXCLUDED.enclosure_type,
|
||||
enclosure_length = EXCLUDED.enclosure_length,
|
||||
image_urls = EXCLUDED.image_urls,
|
||||
tags = EXCLUDED.tags
|
||||
`,
|
||||
item.FeedURL, item.GUID, NullableString(item.Title), NullableString(item.Link),
|
||||
NullableString(item.Description), NullableString(item.Content), NullableString(item.Author),
|
||||
NullableTime(item.PubDate), item.DiscoveredAt, NullableTime(item.UpdatedAt),
|
||||
enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON, tagsJSON,
|
||||
)
|
||||
return err
|
||||
}
|
||||
|
||||
// saveItems stores multiple items efficiently
|
||||
func (c *Crawler) saveItems(items []*Item) error {
|
||||
if len(items) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
tx, err := c.db.Begin()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer tx.Rollback(context.Background())
|
||||
|
||||
for _, item := range items {
|
||||
if item == nil || item.GUID == "" {
|
||||
continue // Skip nil items or items without GUID
|
||||
}
|
||||
|
||||
// Serialize enclosure fields
|
||||
var enclosureUrl, enclosureType *string
|
||||
var enclosureLength *int64
|
||||
if item.Enclosure != nil {
|
||||
enclosureUrl = NullableString(item.Enclosure.URL)
|
||||
enclosureType = NullableString(item.Enclosure.Type)
|
||||
if item.Enclosure.Length > 0 {
|
||||
enclosureLength = &item.Enclosure.Length
|
||||
}
|
||||
}
|
||||
|
||||
// Serialize imageUrls as JSON
|
||||
var imageUrlsJSON *string
|
||||
if len(item.ImageURLs) > 0 {
|
||||
if data, err := json.Marshal(item.ImageURLs); err == nil {
|
||||
s := string(data)
|
||||
imageUrlsJSON = &s
|
||||
}
|
||||
}
|
||||
|
||||
// Serialize tags as JSON
|
||||
var tagsJSON *string
|
||||
if len(item.Tags) > 0 {
|
||||
if data, err := json.Marshal(item.Tags); err == nil {
|
||||
s := string(data)
|
||||
tagsJSON = &s
|
||||
}
|
||||
}
|
||||
|
||||
_, err := tx.Exec(context.Background(), `
|
||||
INSERT INTO items (feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
|
||||
enclosure_url, enclosure_type, enclosure_length, image_urls, tags)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
|
||||
ON CONFLICT(feed_url, guid) DO UPDATE SET
|
||||
title = EXCLUDED.title,
|
||||
link = EXCLUDED.link,
|
||||
description = EXCLUDED.description,
|
||||
content = EXCLUDED.content,
|
||||
author = EXCLUDED.author,
|
||||
pub_date = EXCLUDED.pub_date,
|
||||
updated_at = EXCLUDED.updated_at,
|
||||
enclosure_url = EXCLUDED.enclosure_url,
|
||||
enclosure_type = EXCLUDED.enclosure_type,
|
||||
enclosure_length = EXCLUDED.enclosure_length,
|
||||
image_urls = EXCLUDED.image_urls,
|
||||
tags = EXCLUDED.tags
|
||||
`,
|
||||
item.FeedURL, item.GUID, NullableString(item.Title), NullableString(item.Link),
|
||||
NullableString(item.Description), NullableString(item.Content), NullableString(item.Author),
|
||||
NullableTime(item.PubDate), item.DiscoveredAt, NullableTime(item.UpdatedAt),
|
||||
enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON, tagsJSON,
|
||||
)
|
||||
if err != nil {
|
||||
continue // Skip failed items
|
||||
}
|
||||
}
|
||||
|
||||
return tx.Commit(context.Background())
|
||||
}
|
||||
|
||||
// GetItemsByFeed returns all items for a specific feed
|
||||
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
|
||||
enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
|
||||
published_at, published_uri
|
||||
FROM items
|
||||
WHERE feed_url = $1
|
||||
ORDER BY pub_date DESC
|
||||
LIMIT $2
|
||||
`, feedURL, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
return scanItems(rows)
|
||||
}
|
||||
|
||||
// SearchItems performs a full-text search on items
|
||||
func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
	tsquery := ToSearchQuery(query)
	rows, err := c.db.Query(`
		SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
			enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
			published_at, published_uri
		FROM items
		WHERE search_vector @@ to_tsquery('english', $1)
		ORDER BY ts_rank(search_vector, to_tsquery('english', $1)) DESC, pub_date DESC
		LIMIT $2
	`, tsquery, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanItems(rows)
}

// scanItems is a helper to scan multiple item rows
func scanItems(rows pgx.Rows) ([]*Item, error) {
	var items []*Item
	for rows.Next() {
		item := &Item{}
		var guid, title, link, description, content, author *string
		var pubDate, updatedAt, publishedAt *time.Time
		var enclosureUrl, enclosureType *string
		var enclosureLength *int64
		var imageUrlsJSON, tagsJSON *string
		var publishedUri *string

		if err := rows.Scan(
			&item.ID, &item.FeedURL, &guid, &title, &link,
			&description, &content, &author, &pubDate,
			&item.DiscoveredAt, &updatedAt,
			&enclosureUrl, &enclosureType, &enclosureLength, &imageUrlsJSON, &tagsJSON,
			&publishedAt, &publishedUri,
		); err != nil {
			continue
		}

		item.GUID = StringValue(guid)
		item.Title = StringValue(title)
		item.Link = StringValue(link)
		item.Description = StringValue(description)
		item.Content = StringValue(content)
		item.Author = StringValue(author)
		item.PubDate = TimeValue(pubDate)
		item.UpdatedAt = TimeValue(updatedAt)

		// Parse enclosure
		if enclosureUrl != nil && *enclosureUrl != "" {
			item.Enclosure = &Enclosure{
				URL:  *enclosureUrl,
				Type: StringValue(enclosureType),
			}
			if enclosureLength != nil {
				item.Enclosure.Length = *enclosureLength
			}
		}

		// Parse imageUrls JSON
		if imageUrlsJSON != nil && *imageUrlsJSON != "" {
			var urls []string
			if err := json.Unmarshal([]byte(*imageUrlsJSON), &urls); err == nil {
				item.ImageURLs = urls
			}
		}

		// Parse tags JSON
		if tagsJSON != nil && *tagsJSON != "" {
			var tags []string
			if err := json.Unmarshal([]byte(*tagsJSON), &tags); err == nil {
				item.Tags = tags
			}
		}

		item.PublishedAt = TimeValue(publishedAt)
		item.PublishedUri = StringValue(publishedUri)

		items = append(items, item)
	}

	return items, rows.Err()
}

// CleanupOldItems removes items older than 12 months
func (c *Crawler) CleanupOldItems() (int64, error) {
	cutoff := time.Now().AddDate(-1, 0, 0) // 12 months ago
	result, err := c.db.Exec(`
		DELETE FROM items WHERE pub_date < $1 AND pub_date IS NOT NULL
	`, cutoff)
	if err != nil {
		return 0, err
	}
	return result, nil
}

// GetUnpublishedItems returns items for a feed that haven't been published yet
func (c *Crawler) GetUnpublishedItems(feedURL string, limit int) ([]*Item, error) {
	rows, err := c.db.Query(`
		SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
			enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
			published_at, published_uri
		FROM items
		WHERE feed_url = $1 AND published_at IS NULL
		ORDER BY pub_date ASC
		LIMIT $2
	`, feedURL, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanItems(rows)
}

// MarkItemPublished marks an item as published with the given URI
func (c *Crawler) MarkItemPublished(itemID int64, uri string) error {
	_, err := c.db.Exec(`
		UPDATE items SET published_at = NOW(), published_uri = $1 WHERE id = $2
	`, uri, itemID)
	return err
}

// GetUnpublishedItemCount returns the count of unpublished items for a feed
func (c *Crawler) GetUnpublishedItemCount(feedURL string) (int, error) {
	var count int
	err := c.db.QueryRow(`
		SELECT COUNT(*) FROM items WHERE feed_url = $1 AND published_at IS NULL
	`, feedURL).Scan(&count)
	return count, err
}
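Together, GetUnpublishedItems, MarkItemPublished, and GetUnpublishedItemCount give the publisher an incremental publish loop. The following is a sketch only, assuming a c.publisher field and a PublishItem method that returns the created record URI; neither is defined in this diff:

func (c *Crawler) publishPendingItems(feedURL string, session *PDSSession, batch int) error {
	items, err := c.GetUnpublishedItems(feedURL, batch)
	if err != nil {
		return err
	}
	for _, item := range items {
		// PublishItem is assumed to create the post record and return its URI.
		uri, err := c.publisher.PublishItem(session, item)
		if err != nil {
			return fmt.Errorf("publish item %d: %w", item.ID, err)
		}
		if err := c.MarkItemPublished(item.ID, uri); err != nil {
			return err
		}
	}
	return nil
}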
+187
@@ -0,0 +1,187 @@
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"os"
	"time"
)

// CreateSession authenticates with the PDS and returns a session
func (p *Publisher) CreateSession(handle, password string) (*PDSSession, error) {
	payload := map[string]string{
		"identifier": handle,
		"password":   password,
	}
	body, err := json.Marshal(payload)
	if err != nil {
		return nil, err
	}

	resp, err := p.httpClient.Post(
		p.pdsHost+"/xrpc/com.atproto.server.createSession",
		"application/json",
		bytes.NewReader(body),
	)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		respBody, _ := io.ReadAll(resp.Body)
		return nil, fmt.Errorf("auth failed: %s - %s", resp.Status, string(respBody))
	}

	var session PDSSession
	if err := json.NewDecoder(resp.Body).Decode(&session); err != nil {
		return nil, err
	}

	return &session, nil
}

// CreateAccount creates a new account on the PDS
// Requires an invite code if the PDS has invites enabled
func (p *Publisher) CreateAccount(handle, email, password, inviteCode string) (*PDSSession, error) {
	payload := map[string]interface{}{
		"handle":   handle,
		"email":    email,
		"password": password,
	}
	if inviteCode != "" {
		payload["inviteCode"] = inviteCode
	}

	body, err := json.Marshal(payload)
	if err != nil {
		return nil, err
	}

	resp, err := p.httpClient.Post(
		p.pdsHost+"/xrpc/com.atproto.server.createAccount",
		"application/json",
		bytes.NewReader(body),
	)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	respBody, _ := io.ReadAll(resp.Body)

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("create account failed: %s - %s", resp.Status, string(respBody))
	}

	var session PDSSession
	if err := json.Unmarshal(respBody, &session); err != nil {
		return nil, err
	}

	return &session, nil
}

// CreateInviteCode creates an invite code using PDS admin password (Basic Auth)
func (p *Publisher) CreateInviteCode(adminPassword string, useCount int) (string, error) {
	payload := map[string]interface{}{
		"useCount": useCount,
	}

	body, err := json.Marshal(payload)
	if err != nil {
		return "", err
	}

	req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.server.createInviteCode", bytes.NewReader(body))
	if err != nil {
		return "", err
	}
	req.Header.Set("Content-Type", "application/json")
	// PDS admin APIs use Basic Auth with "admin" as username
	req.SetBasicAuth("admin", adminPassword)

	resp, err := p.httpClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	respBody, _ := io.ReadAll(resp.Body)

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("create invite failed: %s - %s", resp.Status, string(respBody))
	}

	var result struct {
		Code string `json:"code"`
	}
	if err := json.Unmarshal(respBody, &result); err != nil {
		return "", err
	}

	return result.Code, nil
}

// FollowAccount creates a follow record from the authenticated session to the target DID
func (p *Publisher) FollowAccount(session *PDSSession, targetDID string) error {
	// Create follow record
	now := time.Now().UTC().Format(time.RFC3339)
	record := map[string]interface{}{
		"$type":     "app.bsky.graph.follow",
		"subject":   targetDID,
		"createdAt": now,
	}

	payload := map[string]interface{}{
		"repo":       session.DID,
		"collection": "app.bsky.graph.follow",
		"record":     record,
	}

	body, err := json.Marshal(payload)
	if err != nil {
		return err
	}

	req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.createRecord", bytes.NewReader(body))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+session.AccessJwt)

	resp, err := p.httpClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		respBody, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("follow failed: %s - %s", resp.Status, string(respBody))
	}

	return nil
}

// FollowAsDirectory logs in as the directory account and follows the target DID
func (p *Publisher) FollowAsDirectory(targetDID string) error {
	dirHandle := os.Getenv("DIRECTORY_HANDLE")
	dirPassword := os.Getenv("DIRECTORY_PASSWORD")

	if dirHandle == "" || dirPassword == "" {
		// Silently skip if directory account not configured
		return nil
	}

	session, err := p.CreateSession(dirHandle, dirPassword)
	if err != nil {
		return fmt.Errorf("directory login failed: %w", err)
	}

	return p.FollowAccount(session, targetDID)
}
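CreateInviteCode, CreateAccount, and FollowAsDirectory compose into the per-feed account provisioning flow. A sketch under the assumption that the admin password is supplied via a PDS_ADMIN_PASSWORD environment variable; the variable name and helper are not part of this diff:

func (p *Publisher) provisionFeedAccount(handle, email, password string) (*PDSSession, error) {
	// One single-use invite for the new account.
	invite, err := p.CreateInviteCode(os.Getenv("PDS_ADMIN_PASSWORD"), 1)
	if err != nil {
		return nil, err
	}
	session, err := p.CreateAccount(handle, email, password, invite)
	if err != nil {
		return nil, err
	}
	// Best effort: the directory account follows the new feed account.
	if err := p.FollowAsDirectory(session.DID); err != nil {
		fmt.Printf("directory follow failed for %s: %v\n", handle, err)
	}
	return session, nil
}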
+272
@@ -0,0 +1,272 @@
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
)

// BlobRef represents a blob reference for profile images
type BlobRef struct {
	Type     string `json:"$type"`
	Ref      Link   `json:"ref"`
	MimeType string `json:"mimeType"`
	Size     int64  `json:"size"`
}

type Link struct {
	Link string `json:"$link"`
}

// UploadBlob uploads an image to the PDS and returns a blob reference
func (p *Publisher) UploadBlob(session *PDSSession, data []byte, mimeType string) (*BlobRef, error) {
	req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.uploadBlob", bytes.NewReader(data))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", mimeType)
	req.Header.Set("Authorization", "Bearer "+session.AccessJwt)

	resp, err := p.httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	respBody, _ := io.ReadAll(resp.Body)

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("upload blob failed: %s - %s", resp.Status, string(respBody))
	}

	var result struct {
		Blob BlobRef `json:"blob"`
	}
	if err := json.Unmarshal(respBody, &result); err != nil {
		return nil, err
	}

	return &result.Blob, nil
}

// UpdateProfile updates the profile for an account
func (p *Publisher) UpdateProfile(session *PDSSession, displayName, description string, avatar *BlobRef) error {
	// First, get the current profile to preserve any existing fields
	getReq, err := http.NewRequest("GET",
		p.pdsHost+"/xrpc/com.atproto.repo.getRecord?repo="+session.DID+"&collection=app.bsky.actor.profile&rkey=self",
		nil)
	if err != nil {
		return err
	}
	getReq.Header.Set("Authorization", "Bearer "+session.AccessJwt)

	getResp, err := p.httpClient.Do(getReq)

	var existingCID string
	profile := map[string]interface{}{
		"$type": "app.bsky.actor.profile",
	}

	if err == nil && getResp.StatusCode == http.StatusOK {
		defer getResp.Body.Close()
		var existing struct {
			CID   string                 `json:"cid"`
			Value map[string]interface{} `json:"value"`
		}
		if json.NewDecoder(getResp.Body).Decode(&existing) == nil {
			existingCID = existing.CID
			profile = existing.Value
		}
	} else if getResp != nil {
		getResp.Body.Close()
	}

	// Update fields
	if displayName != "" {
		profile["displayName"] = displayName
	}
	if description != "" {
		profile["description"] = description
	}
	if avatar != nil {
		profile["avatar"] = avatar
	}

	// Put the record
	payload := map[string]interface{}{
		"repo":       session.DID,
		"collection": "app.bsky.actor.profile",
		"rkey":       "self",
		"record":     profile,
	}
	if existingCID != "" {
		payload["swapRecord"] = existingCID
	}

	body, err := json.Marshal(payload)
	if err != nil {
		return err
	}

	req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.putRecord", bytes.NewReader(body))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+session.AccessJwt)

	resp, err := p.httpClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	respBody, _ := io.ReadAll(resp.Body)

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("update profile failed: %s - %s", resp.Status, string(respBody))
	}

	return nil
}
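UploadBlob and UpdateProfile are used together when setting a feed account's avatar. A sketch only; fetching and processing the image itself lives in image.go:

func (p *Publisher) setAvatar(session *PDSSession, displayName, bio string, img []byte, mimeType string) error {
	blob, err := p.UploadBlob(session, img, mimeType)
	if err != nil {
		return err
	}
	// Pass the returned blob reference straight into the profile record.
	return p.UpdateProfile(session, displayName, bio, blob)
}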

// DeleteAllPosts deletes all posts from an account
func (p *Publisher) DeleteAllPosts(session *PDSSession) (int, error) {
	deleted := 0
	cursor := ""

	for {
		// List records
		listURL := fmt.Sprintf("%s/xrpc/com.atproto.repo.listRecords?repo=%s&collection=app.bsky.feed.post&limit=100",
			p.pdsHost, session.DID)
		if cursor != "" {
			listURL += "&cursor=" + url.QueryEscape(cursor)
		}

		req, err := http.NewRequest("GET", listURL, nil)
		if err != nil {
			return deleted, err
		}
		req.Header.Set("Authorization", "Bearer "+session.AccessJwt)

		resp, err := p.httpClient.Do(req)
		if err != nil {
			return deleted, err
		}

		var result struct {
			Records []struct {
				URI string `json:"uri"`
			} `json:"records"`
			Cursor string `json:"cursor"`
		}

		respBody, _ := io.ReadAll(resp.Body)
		resp.Body.Close()

		if resp.StatusCode != http.StatusOK {
			return deleted, fmt.Errorf("list records failed: %s - %s", resp.Status, string(respBody))
		}

		if err := json.Unmarshal(respBody, &result); err != nil {
			return deleted, err
		}

		if len(result.Records) == 0 {
			break
		}

		// Delete each record
		for _, record := range result.Records {
			// Extract rkey from URI: at://did:plc:xxx/app.bsky.feed.post/rkey
			parts := strings.Split(record.URI, "/")
			if len(parts) < 2 {
				continue
			}
			rkey := parts[len(parts)-1]

			if err := p.DeleteRecord(session, "app.bsky.feed.post", rkey); err != nil {
				// Continue deleting other records even if one fails
				continue
			}
			deleted++
		}

		cursor = result.Cursor
		if cursor == "" {
			break
		}
	}

	return deleted, nil
}

// DeleteRecord deletes a single record from an account
func (p *Publisher) DeleteRecord(session *PDSSession, collection, rkey string) error {
	payload := map[string]interface{}{
		"repo":       session.DID,
		"collection": collection,
		"rkey":       rkey,
	}

	body, err := json.Marshal(payload)
	if err != nil {
		return err
	}

	req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.deleteRecord", bytes.NewReader(body))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+session.AccessJwt)

	resp, err := p.httpClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		respBody, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("delete record failed: %s - %s", resp.Status, string(respBody))
	}

	return nil
}

// DeleteAccount deletes an account using PDS admin API
func (p *Publisher) DeleteAccount(adminPassword, did string) error {
	payload := map[string]interface{}{
		"did": did,
	}

	body, err := json.Marshal(payload)
	if err != nil {
		return err
	}

	req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.admin.deleteAccount", bytes.NewReader(body))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")
	req.SetBasicAuth("admin", adminPassword)

	resp, err := p.httpClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		respBody, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("delete account failed: %s - %s", resp.Status, string(respBody))
	}

	return nil
}
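DeleteAllPosts and DeleteRecord support resetting an account before a full republish (see the resetAllPublishing route below). A sketch of such a caller; the helper and its logging style are illustrative, not part of this diff:

func (p *Publisher) resetFeedAccount(handle, password string) error {
	session, err := p.CreateSession(handle, password)
	if err != nil {
		return err
	}
	deleted, err := p.DeleteAllPosts(session)
	if err != nil {
		return fmt.Errorf("deleted %d posts before failing: %w", deleted, err)
	}
	fmt.Printf("deleted %d posts for %s\n", deleted, handle)
	return nil
}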
-1063
File diff suppressed because it is too large
@@ -0,0 +1,160 @@
package main

import (
	"fmt"
	"net/http"
	"strings"
)

func (c *Crawler) StartDashboard(addr string) error {
	http.HandleFunc("/dashboard", func(w http.ResponseWriter, r *http.Request) {
		c.handleDashboard(w, r)
	})

	// Root handler for url.1440.news short URLs and 1440.news accounts directory
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		host := r.Host
		// Strip port if present
		if idx := strings.Index(host, ":"); idx != -1 {
			host = host[:idx]
		}

		// If this is url.1440.news, treat path as short code
		if host == "url.1440.news" || host == "url.1440.localhost" {
			c.handleRedirect(w, r)
			return
		}

		// If this is 1440.news (apex), serve accounts directory
		if host == "1440.news" || host == "1440.localhost" {
			if r.URL.Path == "/" || r.URL.Path == "" {
				c.handleAccountsDirectory(w, r)
				return
			}
		}

		// Otherwise, redirect to dashboard for root path
		if r.URL.Path == "/" {
			http.Redirect(w, r, "/dashboard", http.StatusFound)
			return
		}

		// Unknown path
		http.NotFound(w, r)
	})

	http.HandleFunc("/api/stats", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIStats(w, r)
	})
	http.HandleFunc("/api/allDomains", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIAllDomains(w, r)
	})
	http.HandleFunc("/api/domainFeeds", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIDomainFeeds(w, r)
	})
	http.HandleFunc("/api/feedInfo", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIFeedInfo(w, r)
	})
	http.HandleFunc("/api/feedItems", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIFeedItems(w, r)
	})
	http.HandleFunc("/api/search", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPISearch(w, r)
	})
	http.HandleFunc("/api/tlds", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPITLDs(w, r)
	})
	http.HandleFunc("/api/tldDomains", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPITLDDomains(w, r)
	})
	http.HandleFunc("/api/revisitDomain", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIRevisitDomain(w, r)
	})
	http.HandleFunc("/api/priorityCrawl", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIPriorityCrawl(w, r)
	})
	http.HandleFunc("/api/checkFeed", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPICheckFeed(w, r)
	})
	http.HandleFunc("/api/domainsByStatus", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIDomainsByStatus(w, r)
	})
	http.HandleFunc("/api/feedsByStatus", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIFeedsByStatus(w, r)
	})
	http.HandleFunc("/api/domains", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIDomains(w, r)
	})
	http.HandleFunc("/api/feeds", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIFeeds(w, r)
	})
	http.HandleFunc("/api/setDomainStatus", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPISetDomainStatus(w, r)
	})
	http.HandleFunc("/api/filter", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIFilter(w, r)
	})
	http.HandleFunc("/api/enablePublish", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIEnablePublish(w, r)
	})
	http.HandleFunc("/api/disablePublish", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIDisablePublish(w, r)
	})
	http.HandleFunc("/api/publishEnabled", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIPublishEnabled(w, r)
	})
	http.HandleFunc("/api/publishDenied", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIPublishDenied(w, r)
	})
	http.HandleFunc("/api/publishCandidates", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIPublishCandidates(w, r)
	})
	http.HandleFunc("/api/setPublishStatus", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPISetPublishStatus(w, r)
	})
	http.HandleFunc("/api/unpublishedItems", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIUnpublishedItems(w, r)
	})
	http.HandleFunc("/api/testPublish", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPITestPublish(w, r)
	})
	http.HandleFunc("/api/deriveHandle", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIDeriveHandle(w, r)
	})
	http.HandleFunc("/api/publishFeed", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIPublishFeed(w, r)
	})
	http.HandleFunc("/api/createAccount", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPICreateAccount(w, r)
	})
	http.HandleFunc("/api/publishFeedFull", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIPublishFeedFull(w, r)
	})
	http.HandleFunc("/api/updateProfile", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIUpdateProfile(w, r)
	})
	http.HandleFunc("/api/languages", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPILanguages(w, r)
	})
	http.HandleFunc("/api/denyDomain", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIDenyDomain(w, r)
	})
	http.HandleFunc("/api/undenyDomain", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIUndenyDomain(w, r)
	})
	http.HandleFunc("/api/tldStats", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPITLDStats(w, r)
	})
	http.HandleFunc("/api/resetAllPublishing", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIResetAllPublishing(w, r)
	})
	http.HandleFunc("/api/refreshProfiles", func(w http.ResponseWriter, r *http.Request) {
		c.handleAPIRefreshProfiles(w, r)
	})
	http.HandleFunc("/static/", func(w http.ResponseWriter, r *http.Request) {
		http.StripPrefix("/static/", http.FileServer(http.Dir("static"))).ServeHTTP(w, r)
	})

	fmt.Printf("Dashboard running at http://%s\n", addr)
	return http.ListenAndServe(addr, nil)
}
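StartDashboard registers everything on the global DefaultServeMux. A sketch of the same wiring on a dedicated mux, which keeps the routes testable in isolation; this is an alternative, not what the commit does:

func (c *Crawler) newDashboardMux() *http.ServeMux {
	mux := http.NewServeMux()
	mux.HandleFunc("/dashboard", c.handleDashboard)
	mux.HandleFunc("/api/stats", c.handleAPIStats)
	// ...remaining /api/* routes registered the same way...
	mux.Handle("/static/", http.StripPrefix("/static/", http.FileServer(http.Dir("static"))))
	return mux
}

// Serving then becomes: http.ListenAndServe(addr, c.newDashboardMux())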
+75
-29
@@ -435,7 +435,7 @@ function initDashboard() {
|
||||
// Insert TLD header if TLD changed
|
||||
if (d.tld && d.tld !== currentTLD) {
|
||||
currentTLD = d.tld;
|
||||
container.insertAdjacentHTML('beforeend', `<div class="tld-header" style="padding: 12px 10px 6px; color: #666; font-size: 0.9em; border-bottom: 1px solid #333; margin-top: ${offset === 0 && !container.querySelector('.domain-block') ? '0' : '15px'};">.${escapeHtml(currentTLD)}</div>`);
|
||||
container.insertAdjacentHTML('beforeend', `<div class="tld-header" data-tld="${escapeHtml(currentTLD)}" style="padding: 12px 10px 6px; color: #888; font-size: 0.9em; border-bottom: 1px solid #333; margin-top: ${offset === 0 && !container.querySelector('.domain-block') ? '0' : '15px'}; cursor: pointer;"><span style="margin-right: 6px; font-size: 0.8em;">▼</span>.${escapeHtml(currentTLD)}</div>`);
|
||||
}
|
||||
container.insertAdjacentHTML('beforeend', renderDomainRow(d));
|
||||
});
|
||||
@@ -556,7 +556,7 @@ function initDashboard() {
|
||||
// Insert TLD header if TLD changed
|
||||
if (d.tld && d.tld !== currentTLD) {
|
||||
currentTLD = d.tld;
|
||||
container.insertAdjacentHTML('beforeend', `<div class="tld-header" style="padding: 12px 10px 6px; color: #666; font-size: 0.9em; border-bottom: 1px solid #333; margin-top: ${offset === 0 && !container.querySelector('.domain-block') ? '0' : '15px'};">.${escapeHtml(currentTLD)}</div>`);
|
||||
container.insertAdjacentHTML('beforeend', `<div class="tld-header" data-tld="${escapeHtml(currentTLD)}" style="padding: 12px 10px 6px; color: #888; font-size: 0.9em; border-bottom: 1px solid #333; margin-top: ${offset === 0 && !container.querySelector('.domain-block') ? '0' : '15px'}; cursor: pointer;"><span style="margin-right: 6px; font-size: 0.8em;">▼</span>.${escapeHtml(currentTLD)}</div>`);
|
||||
}
|
||||
container.insertAdjacentHTML('beforeend', renderDomainRow(d));
|
||||
});
|
||||
@@ -883,11 +883,11 @@ function initDashboard() {
|
||||
const isSelected = selectedTLDs.has(t.tld);
|
||||
const bg = isSelected ? '#036' : '#1a1a1a';
|
||||
const border = isSelected ? '#06c' : '#333';
|
||||
const color = isSelected ? '#0af' : '#666';
|
||||
const color = isSelected ? '#0af' : '#fff';
|
||||
html += `<button class="tld-btn" data-tld="${escapeHtml(t.tld)}"
|
||||
style="padding: 4px 8px; font-size: 11px; font-family: monospace;
|
||||
background: ${bg}; border: 1px solid ${border}; border-radius: 3px;
|
||||
color: ${color}; cursor: pointer;">.${escapeHtml(t.tld)} <span style="color: #555;">(${t.domain_count})</span></button>`;
|
||||
color: ${color}; cursor: pointer;">.${escapeHtml(t.tld)} <span style="color: #888;">(${t.domain_count})</span></button>`;
|
||||
});
|
||||
|
||||
// Add clear button if any selected
|
||||
@@ -923,6 +923,9 @@ function initDashboard() {
|
||||
}
|
||||
}
|
||||
|
||||
// Track manually collapsed TLDs (collapsed by clicking header)
|
||||
let collapsedTLDs = new Set();
|
||||
|
||||
function applyTLDFilter() {
|
||||
const container = document.querySelector('.domain-list');
|
||||
if (!container) return;
|
||||
@@ -930,44 +933,87 @@ function initDashboard() {
|
||||
const showAll = selectedTLDs.size === 0;
|
||||
|
||||
// Handle TLD headers and their domain blocks
|
||||
container.querySelectorAll('.tld-header:not(.tld-handled)').forEach(header => {
|
||||
header.classList.add('tld-handled');
|
||||
header.style.cursor = 'pointer';
|
||||
|
||||
header.addEventListener('click', () => {
|
||||
const tldText = header.dataset.tld;
|
||||
|
||||
// If TLD filter is active, clicking adds/removes from filter
|
||||
if (selectedTLDs.size > 0) {
|
||||
if (selectedTLDs.has(tldText)) {
|
||||
selectedTLDs.delete(tldText);
|
||||
} else {
|
||||
selectedTLDs.add(tldText);
|
||||
}
|
||||
renderTLDButtons();
|
||||
} else {
|
||||
// No filter active - toggle manual collapse
|
||||
if (collapsedTLDs.has(tldText)) {
|
||||
collapsedTLDs.delete(tldText);
|
||||
} else {
|
||||
collapsedTLDs.add(tldText);
|
||||
}
|
||||
}
|
||||
applyTLDVisibility();
|
||||
});
|
||||
});
|
||||
|
||||
// Store TLD in data attribute for headers that don't have it
|
||||
container.querySelectorAll('.tld-header').forEach(header => {
|
||||
const tldText = header.textContent.trim().substring(1); // Remove leading dot
|
||||
const isSelected = showAll || selectedTLDs.has(tldText);
|
||||
if (!header.dataset.tld) {
|
||||
const match = header.textContent.trim().match(/^\.(.+?)(?:\s|$)/);
|
||||
if (match) {
|
||||
header.dataset.tld = match[1];
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
applyTLDVisibility();
|
||||
}
|
||||
|
||||
function applyTLDVisibility() {
|
||||
const container = document.querySelector('.domain-list');
|
||||
if (!container) return;
|
||||
|
||||
const showAll = selectedTLDs.size === 0;
|
||||
|
||||
container.querySelectorAll('.tld-header').forEach(header => {
|
||||
const tldText = header.dataset.tld;
|
||||
if (!tldText) return;
|
||||
|
||||
// Determine if this TLD should be expanded
|
||||
let isExpanded;
|
||||
if (selectedTLDs.size > 0) {
|
||||
// Filter mode: show only selected TLDs
|
||||
isExpanded = selectedTLDs.has(tldText);
|
||||
} else {
|
||||
// No filter: show all except manually collapsed
|
||||
isExpanded = !collapsedTLDs.has(tldText);
|
||||
}
|
||||
|
||||
// Find all domain blocks until next TLD header
|
||||
let nextEl = header.nextElementSibling;
|
||||
while (nextEl && !nextEl.classList.contains('tld-header')) {
|
||||
if (nextEl.classList.contains('domain-block')) {
|
||||
nextEl.style.display = isSelected ? 'block' : 'none';
|
||||
nextEl.style.display = isExpanded ? 'block' : 'none';
|
||||
}
|
||||
nextEl = nextEl.nextElementSibling;
|
||||
}
|
||||
|
||||
// Style header based on selection
|
||||
if (isSelected) {
|
||||
header.style.color = '#888';
|
||||
header.style.cursor = 'default';
|
||||
// Style header based on expanded state
|
||||
header.style.color = isExpanded ? '#888' : '#555';
|
||||
|
||||
// Add expand/collapse indicator
|
||||
const indicator = isExpanded ? '▼' : '▶';
|
||||
const tldDisplay = '.' + tldText;
|
||||
if (!header.textContent.includes('▼') && !header.textContent.includes('▶')) {
|
||||
header.innerHTML = `<span style="margin-right: 6px; font-size: 0.8em;">${indicator}</span>${tldDisplay}`;
|
||||
} else {
|
||||
header.style.color = '#444';
|
||||
header.style.cursor = 'pointer';
|
||||
header.innerHTML = header.innerHTML.replace(/[▼▶]/, indicator);
|
||||
}
|
||||
});
|
||||
|
||||
// Make collapsed headers clickable to expand
|
||||
container.querySelectorAll('.tld-header').forEach(header => {
|
||||
// Remove old listener by cloning
|
||||
const newHeader = header.cloneNode(true);
|
||||
header.parentNode.replaceChild(newHeader, header);
|
||||
|
||||
newHeader.addEventListener('click', () => {
|
||||
const tldText = newHeader.textContent.trim().substring(1);
|
||||
if (!selectedTLDs.has(tldText) && selectedTLDs.size > 0) {
|
||||
selectedTLDs.add(tldText);
|
||||
renderTLDButtons();
|
||||
applyTLDFilter();
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Stats update
|
||||
|
||||
+540
@@ -0,0 +1,540 @@
package main

import (
	"encoding/json"
	"fmt"
	"html/template"
	"io"
	"net/http"
	"os"
	"strings"
	"time"
)

// PDSAccount represents a Bluesky account on the PDS
type PDSAccount struct {
	DID         string `json:"did"`
	Handle      string `json:"handle"`
	DisplayName string `json:"displayName"`
	Description string `json:"description"`
	Avatar      string `json:"avatar"`
}

// handleAccountsDirectory serves the 1440.news accounts directory page
func (c *Crawler) handleAccountsDirectory(w http.ResponseWriter, r *http.Request) {
	pdsHost := os.Getenv("PDS_HOST")
	if pdsHost == "" {
		pdsHost = "https://pds.1440.news"
	}

	// Fetch all repos from PDS
	listReposURL := pdsHost + "/xrpc/com.atproto.sync.listRepos?limit=1000"
	resp, err := http.Get(listReposURL)
	if err != nil {
		http.Error(w, "Failed to fetch accounts: "+err.Error(), http.StatusInternalServerError)
		return
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	var reposResp struct {
		Repos []struct {
			DID    string `json:"did"`
			Head   string `json:"head"`
			Active bool   `json:"active"`
		} `json:"repos"`
	}
	if err := json.Unmarshal(body, &reposResp); err != nil {
		http.Error(w, "Failed to parse repos: "+err.Error(), http.StatusInternalServerError)
		return
	}

	// Fetch profile for each account using unauthenticated endpoints
	var accounts []PDSAccount
	client := &http.Client{Timeout: 5 * time.Second}

	for _, repo := range reposResp.Repos {
		if !repo.Active {
			continue
		}

		// Get handle using describeRepo
		describeURL := pdsHost + "/xrpc/com.atproto.repo.describeRepo?repo=" + repo.DID
		describeResp, err := client.Get(describeURL)
		if err != nil {
			continue
		}
		describeBody, _ := io.ReadAll(describeResp.Body)
		describeResp.Body.Close()

		var repoInfo struct {
			Handle string `json:"handle"`
			DID    string `json:"did"`
		}
		if err := json.Unmarshal(describeBody, &repoInfo); err != nil {
			continue
		}

		// Skip the main 1440.news account (directory account itself)
		if repoInfo.Handle == "1440.news" {
			continue
		}

		account := PDSAccount{
			DID:    repoInfo.DID,
			Handle: repoInfo.Handle,
		}

		// Get profile record for display name, description, avatar
		recordURL := pdsHost + "/xrpc/com.atproto.repo.getRecord?repo=" + repo.DID + "&collection=app.bsky.actor.profile&rkey=self"
		recordResp, err := client.Get(recordURL)
		if err == nil {
			recordBody, _ := io.ReadAll(recordResp.Body)
			recordResp.Body.Close()

			var record struct {
				Value struct {
					DisplayName string `json:"displayName"`
					Description string `json:"description"`
					Avatar      struct {
						Ref struct {
							Link string `json:"$link"`
						} `json:"ref"`
					} `json:"avatar"`
				} `json:"value"`
			}
			if json.Unmarshal(recordBody, &record) == nil {
				account.DisplayName = record.Value.DisplayName
				account.Description = record.Value.Description
				if record.Value.Avatar.Ref.Link != "" {
					account.Avatar = pdsHost + "/xrpc/com.atproto.sync.getBlob?did=" + repo.DID + "&cid=" + record.Value.Avatar.Ref.Link
				}
			}
		}

		accounts = append(accounts, account)
	}

	// Render the page
	tmpl := template.Must(template.New("accounts").Parse(accountsDirectoryHTML))
	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	tmpl.Execute(w, map[string]interface{}{
		"Accounts": accounts,
		"Count":    len(accounts),
	})
}
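handleAccountsDirectory issues one listRepos call plus two further requests per repo on every page view. A sketch of a small time-based cache in the spirit of the cached domain stats; the variable names, refresh window, and the extra sync import are assumptions:

var (
	accountsMu      sync.Mutex
	accountsCache   []PDSAccount
	accountsFetched time.Time
)

// cachedAccounts returns the memoized directory, refetching at most every five minutes.
func cachedAccounts(fetch func() []PDSAccount) []PDSAccount {
	accountsMu.Lock()
	defer accountsMu.Unlock()
	if accountsCache != nil && time.Since(accountsFetched) < 5*time.Minute {
		return accountsCache
	}
	accountsCache = fetch()
	accountsFetched = time.Now()
	return accountsCache
}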

func (c *Crawler) handleDashboard(w http.ResponseWriter, r *http.Request) {
	stats, err := c.GetDashboardStats()
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	funcMap := template.FuncMap{
		"pct": func(a, b int) float64 {
			if b == 0 {
				return 0
			}
			return float64(a) * 100.0 / float64(b)
		},
		"comma": func(n interface{}) string {
			var val int
			switch v := n.(type) {
			case int:
				val = v
			case int32:
				val = int(v)
			case int64:
				val = int(v)
			default:
				return "0"
			}
			if val < 0 {
				return "-" + commaFormat(-val)
			}
			return commaFormat(val)
		},
	}

	tmpl, err := template.New("dashboard").Funcs(funcMap).Parse(dashboardHTML)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "text/html")
	tmpl.Execute(w, stats)
}
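The comma template function above delegates to commaFormat, which is defined elsewhere in the package. A minimal sketch of such a helper, assuming it only has to group the digits of a non-negative value (the sign is handled by the caller):

func commaFormat(n int) string {
	s := fmt.Sprintf("%d", n)
	var b strings.Builder
	for i, digit := range s {
		// Insert a separator before every remaining group of three digits.
		if i > 0 && (len(s)-i)%3 == 0 {
			b.WriteByte(',')
		}
		b.WriteRune(digit)
	}
	return b.String()
}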

func (c *Crawler) handleAPIStats(w http.ResponseWriter, r *http.Request) {
	stats, err := c.GetDashboardStats()
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(stats)
}

// handleRedirect handles short URL redirects for url.1440.news
func (c *Crawler) handleRedirect(w http.ResponseWriter, r *http.Request) {
	code := strings.TrimPrefix(r.URL.Path, "/")
	if code == "" {
		http.NotFound(w, r)
		return
	}

	// Look up the short URL
	shortURL, err := c.GetShortURL(code)
	if err != nil {
		http.NotFound(w, r)
		return
	}

	// Record the click asynchronously
	go func() {
		if err := c.RecordClick(code, r); err != nil {
			fmt.Printf("Failed to record click for %s: %v\n", code, err)
		}
	}()

	// Redirect to original URL
	http.Redirect(w, r, shortURL.OriginalURL, http.StatusFound)
}
const accountsDirectoryHTML = `<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>1440.news - News Feed Directory</title>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<style>
|
||||
* { box-sizing: border-box; margin: 0; padding: 0; }
|
||||
body {
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
||||
background: #0a0a0a;
|
||||
color: #e0e0e0;
|
||||
min-height: 100vh;
|
||||
padding: 20px;
|
||||
}
|
||||
.container {
|
||||
max-width: 800px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
header {
|
||||
text-align: center;
|
||||
margin-bottom: 40px;
|
||||
padding-bottom: 20px;
|
||||
border-bottom: 1px solid #333;
|
||||
}
|
||||
h1 {
|
||||
font-size: 2.5em;
|
||||
color: #fff;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
.tagline {
|
||||
color: #888;
|
||||
font-size: 1.1em;
|
||||
}
|
||||
.count {
|
||||
color: #0af;
|
||||
margin-top: 10px;
|
||||
}
|
||||
.accounts {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 15px;
|
||||
}
|
||||
.account {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 15px;
|
||||
padding: 15px;
|
||||
background: #151515;
|
||||
border: 1px solid #252525;
|
||||
border-radius: 12px;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
.account:hover {
|
||||
border-color: #0af;
|
||||
background: #1a1a1a;
|
||||
}
|
||||
.avatar {
|
||||
width: 60px;
|
||||
height: 60px;
|
||||
border-radius: 50%;
|
||||
background: #333;
|
||||
flex-shrink: 0;
|
||||
object-fit: cover;
|
||||
}
|
||||
.avatar-placeholder {
|
||||
width: 60px;
|
||||
height: 60px;
|
||||
border-radius: 50%;
|
||||
background: linear-gradient(135deg, #0af, #08f);
|
||||
flex-shrink: 0;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
font-size: 24px;
|
||||
color: #fff;
|
||||
font-weight: bold;
|
||||
}
|
||||
.info {
|
||||
flex: 1;
|
||||
min-width: 0;
|
||||
}
|
||||
.display-name {
|
||||
font-size: 1.1em;
|
||||
font-weight: 600;
|
||||
color: #fff;
|
||||
margin-bottom: 2px;
|
||||
}
|
||||
.handle {
|
||||
color: #0af;
|
||||
font-size: 0.9em;
|
||||
margin-bottom: 5px;
|
||||
}
|
||||
.description {
|
||||
color: #888;
|
||||
font-size: 0.85em;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
white-space: nowrap;
|
||||
}
|
||||
.follow-btn {
|
||||
padding: 8px 20px;
|
||||
background: #0af;
|
||||
color: #000;
|
||||
border: none;
|
||||
border-radius: 20px;
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
text-decoration: none;
|
||||
flex-shrink: 0;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
.follow-btn:hover {
|
||||
background: #0cf;
|
||||
transform: scale(1.05);
|
||||
}
|
||||
footer {
|
||||
margin-top: 40px;
|
||||
padding-top: 20px;
|
||||
border-top: 1px solid #333;
|
||||
text-align: center;
|
||||
color: #666;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
footer a {
|
||||
color: #0af;
|
||||
text-decoration: none;
|
||||
}
|
||||
.search-box {
|
||||
margin-top: 20px;
|
||||
}
|
||||
.search-input {
|
||||
width: 100%;
|
||||
padding: 12px 20px;
|
||||
font-size: 1em;
|
||||
background: #151515;
|
||||
border: 1px solid #333;
|
||||
border-radius: 25px;
|
||||
color: #fff;
|
||||
outline: none;
|
||||
transition: border-color 0.2s;
|
||||
}
|
||||
.search-input:focus {
|
||||
border-color: #0af;
|
||||
}
|
||||
.search-input::placeholder {
|
||||
color: #666;
|
||||
}
|
||||
.no-results {
|
||||
text-align: center;
|
||||
color: #666;
|
||||
padding: 40px;
|
||||
display: none;
|
||||
}
|
||||
.account.hidden {
|
||||
display: none;
|
||||
}
|
||||
@media (max-width: 600px) {
|
||||
.account {
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
.follow-btn {
|
||||
width: 100%;
|
||||
text-align: center;
|
||||
margin-top: 10px;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<header>
|
||||
<h1>1440.news</h1>
|
||||
<p class="tagline">Curated news feeds on Bluesky</p>
|
||||
<p class="count"><span id="visibleCount">{{.Count}}</span> feeds available</p>
|
||||
<div class="search-box">
|
||||
<input type="text" class="search-input" id="searchInput" placeholder="Search feeds..." autocomplete="off">
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<div class="accounts" id="accountsList">
|
||||
{{range .Accounts}}
|
||||
<div class="account">
|
||||
{{if .Avatar}}
|
||||
<img class="avatar" src="{{.Avatar}}" alt="{{.DisplayName}}">
|
||||
{{else}}
|
||||
<div class="avatar-placeholder">{{slice .Handle 0 1}}</div>
|
||||
{{end}}
|
||||
<div class="info">
|
||||
<div class="display-name">{{if .DisplayName}}{{.DisplayName}}{{else}}{{.Handle}}{{end}}</div>
|
||||
<div class="handle">@{{.Handle}}</div>
|
||||
{{if .Description}}<div class="description">{{.Description}}</div>{{end}}
|
||||
</div>
|
||||
<a class="follow-btn" href="https://bsky.app/profile/{{.Handle}}" target="_blank">View</a>
|
||||
</div>
|
||||
{{else}}
|
||||
<p style="text-align: center; color: #666;">No feeds available yet.</p>
|
||||
{{end}}
|
||||
</div>
|
||||
<p class="no-results" id="noResults">No feeds match your search.</p>
|
||||
|
||||
<footer>
|
||||
<p>Follow <a href="https://bsky.app/profile/1440.news" target="_blank">@1440.news</a> for updates</p>
|
||||
</footer>
|
||||
</div>
|
||||
<script>
|
||||
document.getElementById('searchInput').addEventListener('input', function() {
|
||||
const query = this.value.toLowerCase().trim();
|
||||
const accounts = document.querySelectorAll('.account');
|
||||
let visibleCount = 0;
|
||||
|
||||
accounts.forEach(function(account) {
|
||||
const text = account.textContent.toLowerCase();
|
||||
if (query === '' || text.includes(query)) {
|
||||
account.classList.remove('hidden');
|
||||
visibleCount++;
|
||||
} else {
|
||||
account.classList.add('hidden');
|
||||
}
|
||||
});
|
||||
|
||||
document.getElementById('visibleCount').textContent = visibleCount;
|
||||
document.getElementById('noResults').style.display = visibleCount === 0 && query !== '' ? 'block' : 'none';
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
`
|
||||
|
||||
const dashboardHTML = `<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>1440.news Feed Crawler</title>
|
||||
<meta charset="utf-8">
|
||||
<link rel="stylesheet" href="/static/dashboard.css">
|
||||
<script src="/static/dashboard.js?v=37"></script>
|
||||
</head>
|
||||
<body>
|
||||
<h1>1440.news Feed Crawler</h1>
|
||||
|
||||
<h2>Domain Status</h2>
|
||||
<div class="grid">
|
||||
<div class="card">
|
||||
<div class="stat-value" id="totalDomains">{{comma .TotalDomains}}</div>
|
||||
<div class="stat-label">Total</div>
|
||||
</div>
|
||||
<div class="card">
|
||||
<div class="stat-value" id="holdDomains" style="color: #f90;">{{comma .HoldDomains}}</div>
|
||||
<div class="stat-label">Hold</div>
|
||||
</div>
|
||||
<div class="card">
|
||||
<div class="stat-value" id="passDomains" style="color: #0f0;">{{comma .PassDomains}}</div>
|
||||
<div class="stat-label">Pass</div>
|
||||
</div>
|
||||
<div class="card">
|
||||
<div class="stat-value" id="skipDomains" style="color: #f66;">{{comma .SkipDomains}}</div>
|
||||
<div class="stat-label">Skip</div>
|
||||
</div>
|
||||
<div class="card">
|
||||
<div class="stat-value" id="failDomains" style="color: #f00;">{{comma .FailDomains}}</div>
|
||||
<div class="stat-label">Fail</div>
|
||||
</div>
|
||||
<div class="card">
|
||||
<div class="stat-value" id="crawlRate">{{comma .CrawlRate}}</div>
|
||||
<div class="stat-label">crawls/min</div>
|
||||
</div>
|
||||
<div class="card">
|
||||
<div class="stat-value" id="checkRate">{{comma .CheckRate}}</div>
|
||||
<div class="stat-label">checks/min</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h2>Feeds Discovered</h2>
|
||||
<div class="grid">
|
||||
<div class="card">
|
||||
<div class="stat-value" id="totalFeeds">{{comma .TotalFeeds}}</div>
|
||||
<div class="stat-label">Total Feeds</div>
|
||||
</div>
|
||||
<div class="card">
|
||||
<div class="stat-value" style="color: #f90" id="rssFeeds">{{comma .RSSFeeds}}</div>
|
||||
<div class="stat-label">RSS Feeds</div>
|
||||
</div>
|
||||
<div class="card">
|
||||
<div class="stat-value" style="color: #09f" id="atomFeeds">{{comma .AtomFeeds}}</div>
|
||||
<div class="stat-label">Atom Feeds</div>
|
||||
</div>
|
||||
<div class="card">
|
||||
<div class="stat-value" style="color: #666" id="unknownFeeds">{{comma .UnknownFeeds}}</div>
|
||||
<div class="stat-label">Unknown Type</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card" id="inputCard">
|
||||
<div id="commandButtons" style="margin-bottom: 10px;">
|
||||
<button class="cmd-btn" data-cmd="/domains">domains</button>
|
||||
<button class="cmd-btn" data-cmd="/feeds">feeds</button>
|
||||
<button class="cmd-btn" id="tldToggleBtn">tlds</button>
|
||||
<span style="color: #333; margin: 0 4px;">|</span>
|
||||
<button class="cmd-btn" data-cmd="domains:hold">d:hold</button>
|
||||
<button class="cmd-btn" data-cmd="domains:pass">d:pass</button>
|
||||
<button class="cmd-btn" data-cmd="domains:skip">d:skip</button>
|
||||
<button class="cmd-btn" data-cmd="domains:fail">d:fail</button>
|
||||
<span style="color: #333; margin: 0 4px;">|</span>
|
||||
<button class="cmd-btn" data-cmd="feeds:hold">f:hold</button>
|
||||
<button class="cmd-btn" data-cmd="feeds:pass">f:pass</button>
|
||||
<button class="cmd-btn" data-cmd="feeds:skip">f:skip</button>
|
||||
<button class="cmd-btn" data-cmd="feeds:fail">f:fail</button>
|
||||
</div>
|
||||
<div id="langDropdown" style="display: none; margin-bottom: 10px; padding: 10px; background: #0a0a0a; border: 1px solid #333; border-radius: 4px; max-height: 200px; overflow-y: auto;">
|
||||
<div id="langList"></div>
|
||||
</div>
|
||||
<div id="tldDropdown" style="display: none; margin-bottom: 10px; padding: 10px; background: #0a0a0a; border: 1px solid #333; border-radius: 4px;">
|
||||
<div id="tldList" style="display: flex; flex-wrap: wrap; gap: 6px;"></div>
|
||||
</div>
|
||||
<input type="text" id="commandInput" value="/help"
|
||||
style="width: 100%; padding: 12px; background: #0a0a0a; border: 1px solid #333; border-radius: 4px; color: #fff; font-size: 14px; font-family: monospace;">
|
||||
</div>
|
||||
|
||||
<div class="card" id="outputCard">
|
||||
<div id="breadcrumb" style="margin-bottom: 10px; display: none;"></div>
|
||||
<div id="output"></div>
|
||||
</div>
|
||||
|
||||
<div style="color: #333; font-size: 11px; margin-top: 10px;">v18</div>
|
||||
|
||||
<div class="updated" id="updatedAt">Last updated: {{.UpdatedAt.Format "2006-01-02 15:04:05"}}</div>
|
||||
</body>
|
||||
</html>`
|
||||