package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"net/http"
	"os"
	"strings"

	"github.com/jackc/pgx/v5"
)

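// handleAPIAllDomains serves a page of the cached per-domain stats.
// Pagination comes from the offset/limit query parameters (limit capped
// at 100); results are read from cachedAllDomains, which a background
// task refreshes once per minute, so they may be slightly stale.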
func (c *Crawler) handleAPIAllDomains(w http.ResponseWriter, r *http.Request) {
	offset := 0
	limit := 100
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 100 {
			limit = 100
		}
	}

	// Serve from cache (updated once per minute in background)
	c.statsMu.RLock()
	cached := c.cachedAllDomains
	c.statsMu.RUnlock()

	var domains []DomainStat
	if cached != nil && offset < len(cached) {
		end := offset + limit
		if end > len(cached) {
			end = len(cached)
		}
		domains = cached[offset:end]
	}
	if domains == nil {
		domains = []DomainStat{}
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}

// handleAPIDomains lists domains with optional status filter, including their feeds
func (c *Crawler) handleAPIDomains(w http.ResponseWriter, r *http.Request) {
	status := r.URL.Query().Get("status")
	hasFeeds := r.URL.Query().Get("has_feeds") == "true"
	search := r.URL.Query().Get("search")
	tldFilter := r.URL.Query().Get("tld")
	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}

	// First get domains
	var rows pgx.Rows
	var err error
	if hasFeeds {
		// Only domains with feeds
		searchPattern := "%" + strings.ToLower(search) + "%"
		if tldFilter != "" {
			// Filter by specific TLD
			rows, err = c.db.Query(`
				SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
				FROM domains d
				INNER JOIN (
					SELECT source_host, COUNT(*) as feed_count
					FROM feeds
					WHERE item_count > 0
					GROUP BY source_host
				) f ON d.host = f.source_host
				WHERE d.status != 'skip' AND d.tld = $1
				ORDER BY d.host ASC
				LIMIT $2 OFFSET $3
			`, tldFilter, limit, offset)
		} else if search != "" {
			// Search in domain host or feed title/url
			rows, err = c.db.Query(`
				SELECT DISTINCT d.host, d.tld, d.status, d.last_error, f.feed_count
				FROM domains d
				INNER JOIN (
					SELECT source_host, COUNT(*) as feed_count
					FROM feeds
					WHERE item_count > 0
					GROUP BY source_host
				) f ON d.host = f.source_host
				LEFT JOIN feeds fe ON d.host = fe.source_host
				WHERE d.status != 'skip'
				  AND (LOWER(d.host) LIKE $1 OR LOWER(fe.title) LIKE $1 OR LOWER(fe.url) LIKE $1)
				ORDER BY d.tld ASC, d.host ASC
				LIMIT $2 OFFSET $3
			`, searchPattern, limit, offset)
		} else if status != "" {
			rows, err = c.db.Query(`
				SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
				FROM domains d
				INNER JOIN (
					SELECT source_host, COUNT(*) as feed_count
					FROM feeds
					WHERE item_count > 0
					GROUP BY source_host
				) f ON d.host = f.source_host
				WHERE d.status = $1
				ORDER BY d.tld ASC, d.host ASC
				LIMIT $2 OFFSET $3
			`, status, limit, offset)
		} else {
			// Default: exclude 'skip' status domains
			rows, err = c.db.Query(`
				SELECT d.host, d.tld, d.status, d.last_error, f.feed_count
				FROM domains d
				INNER JOIN (
					SELECT source_host, COUNT(*) as feed_count
					FROM feeds
					WHERE item_count > 0
					GROUP BY source_host
				) f ON d.host = f.source_host
				WHERE d.status != 'skip'
				ORDER BY d.tld ASC, d.host ASC
				LIMIT $1 OFFSET $2
			`, limit, offset)
		}
	} else if status != "" {
		rows, err = c.db.Query(`
			SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
			FROM domains d
			LEFT JOIN (
				SELECT source_host, COUNT(*) as feed_count
				FROM feeds
				GROUP BY source_host
			) f ON d.host = f.source_host
			WHERE d.status = $1
			ORDER BY d.tld ASC, d.host ASC
			LIMIT $2 OFFSET $3
		`, status, limit, offset)
	} else {
		// Default: exclude 'skip' status domains
		rows, err = c.db.Query(`
			SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
			FROM domains d
			LEFT JOIN (
				SELECT source_host, COUNT(*) as feed_count
				FROM feeds
				GROUP BY source_host
			) f ON d.host = f.source_host
			WHERE d.status != 'skip'
			ORDER BY d.tld ASC, d.host ASC
			LIMIT $1 OFFSET $2
		`, limit, offset)
	}
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type FeedInfo struct {
		URL           string `json:"url"`
		Title         string `json:"title,omitempty"`
		Type          string `json:"type,omitempty"`
		Status        string `json:"status,omitempty"`
		PublishStatus string `json:"publish_status,omitempty"`
		Language      string `json:"language,omitempty"`
		ItemCount     int    `json:"item_count,omitempty"`
	}

	type DomainInfo struct {
		Host      string     `json:"host"`
		TLD       string     `json:"tld"`
		Status    string     `json:"status"`
		LastError string     `json:"last_error,omitempty"`
		FeedCount int        `json:"feed_count"`
		Feeds     []FeedInfo `json:"feeds,omitempty"`
	}

	var domains []DomainInfo
	var hosts []string
	for rows.Next() {
		var d DomainInfo
		var tld, lastError *string
		if err := rows.Scan(&d.Host, &tld, &d.Status, &lastError, &d.FeedCount); err != nil {
			continue
		}
		d.TLD = StringValue(tld)
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
		hosts = append(hosts, d.Host)
	}

	// Now get feeds for these domains (with actual item count from items table)
	if len(hosts) > 0 {
		feedRows, err := c.db.Query(`
			SELECT f.source_host, f.url, f.title, f.type, f.status, f.publish_status, f.language,
				(SELECT COUNT(*) FROM items WHERE feed_url = f.url) as item_count
			FROM feeds f
			WHERE f.source_host = ANY($1)
			ORDER BY f.source_host, f.url
		`, hosts)
		if err == nil {
			defer feedRows.Close()
			feedsByHost := make(map[string][]FeedInfo)
			for feedRows.Next() {
				var host string
				var f FeedInfo
				var title, feedType, status, publishStatus, language *string
				var itemCount *int
				if err := feedRows.Scan(&host, &f.URL, &title, &feedType, &status, &publishStatus, &language, &itemCount); err != nil {
					continue
				}
				f.Title = StringValue(title)
				f.Type = StringValue(feedType)
				f.Status = StringValue(status)
				f.PublishStatus = StringValue(publishStatus)
				f.Language = StringValue(language)
				if itemCount != nil {
					f.ItemCount = *itemCount
				}
				feedsByHost[host] = append(feedsByHost[host], f)
			}
			// Attach feeds to domains
			for i := range domains {
				if feeds, ok := feedsByHost[domains[i].Host]; ok {
					domains[i].Feeds = feeds
				}
			}
		}
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}

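// handleAPIDomainsByStatus lists domains that have the given status
// (required "status" query parameter), with per-domain feed counts.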
func (c *Crawler) handleAPIDomainsByStatus(w http.ResponseWriter, r *http.Request) {
	status := r.URL.Query().Get("status")
	if status == "" {
		http.Error(w, "status parameter required", http.StatusBadRequest)
		return
	}

	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}

	rows, err := c.db.Query(`
		SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
		FROM domains d
		LEFT JOIN (
			SELECT source_host, COUNT(*) as feed_count
			FROM feeds
			GROUP BY source_host
		) f ON d.host = f.source_host
		WHERE d.status = $1
		ORDER BY d.tld ASC, d.host ASC
		LIMIT $2 OFFSET $3
	`, status, limit, offset)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type DomainInfo struct {
		Host      string `json:"host"`
		TLD       string `json:"tld"`
		Status    string `json:"status"`
		LastError string `json:"last_error,omitempty"`
		FeedCount int    `json:"feed_count"`
	}

	var domains []DomainInfo
	for rows.Next() {
		var d DomainInfo
		var tld, lastError *string
		if err := rows.Scan(&d.Host, &tld, &d.Status, &lastError, &d.FeedCount); err != nil {
			continue
		}
		d.TLD = StringValue(tld)
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}

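// handleAPIDomainFeeds lists every feed recorded for one domain (required
// "host" query parameter), paginated via limit/offset. Example call,
// assuming the handler is mounted at /api/domainFeeds (the route is
// registered elsewhere):
//
//	GET /api/domainFeeds?host=example.com&limit=50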
func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}

	rows, err := c.db.Query(`
		SELECT url, title, type, status, last_error, item_count, publish_status, language
		FROM feeds
		WHERE source_host = $1
		ORDER BY url ASC
		LIMIT $2 OFFSET $3
	`, host, limit, offset)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type FeedInfo struct {
		URL           string `json:"url"`
		Title         string `json:"title"`
		Type          string `json:"type"`
		Status        string `json:"status,omitempty"`
		LastError     string `json:"last_error,omitempty"`
		ItemCount     int    `json:"item_count,omitempty"`
		PublishStatus string `json:"publish_status,omitempty"`
		Language      string `json:"language,omitempty"`
	}

	var feeds []FeedInfo
	for rows.Next() {
		var f FeedInfo
		var title, status, lastError, publishStatus, language *string
		var itemCount *int
		if err := rows.Scan(&f.URL, &title, &f.Type, &status, &lastError, &itemCount, &publishStatus, &language); err != nil {
			continue
		}
		f.Title = StringValue(title)
		f.Status = StringValue(status)
		f.LastError = StringValue(lastError)
		f.PublishStatus = StringValue(publishStatus)
		f.Language = StringValue(language)
		if itemCount != nil {
			f.ItemCount = *itemCount
		}
		feeds = append(feeds, f)
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(feeds)
}

// handleAPISetDomainStatus sets the status for a domain
// status must be 'hold', 'pass', or 'skip' (use /api/dropDomain for 'drop')
func (c *Crawler) handleAPISetDomainStatus(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	status := r.URL.Query().Get("status")

	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}
	if status != "hold" && status != "pass" && status != "skip" {
		http.Error(w, "status must be 'hold', 'pass', or 'skip' (use /api/dropDomain for permanent deletion)", http.StatusBadRequest)
		return
	}

	host = normalizeHost(host)

	// Setting to 'skip' triggers takedown (hide content but preserve data)
	if status == "skip" {
		result := c.skipDomain(host)
		if result.Error != "" {
			http.Error(w, result.Error, http.StatusInternalServerError)
			return
		}
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(result)
		return
	}

	// When setting to pass, clear any last_error
	var err error
	if status == "pass" {
		_, err = c.db.Exec(`
			UPDATE domains SET status = $1, last_error = NULL
			WHERE host = $2
		`, status, host)
	} else {
		_, err = c.db.Exec(`
			UPDATE domains SET status = $1
			WHERE host = $2
		`, status, host)
	}

	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]string{
		"host":   host,
		"status": status,
	})
}

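// handleAPIRevisitDomain resets a domain to 'pass' and clears its crawl
// timestamps and last error, so the background crawler revisits it.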
func (c *Crawler) handleAPIRevisitDomain(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	_, err := c.db.Exec(`
		UPDATE domains SET status = 'pass', last_checked_at = NULL, last_crawled_at = NULL, last_error = NULL
		WHERE host = $1
	`, host)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]string{"status": "queued", "host": host})
}

// handleAPIPriorityCrawl immediately crawls a domain (adding it if it doesn't exist)
func (c *Crawler) handleAPIPriorityCrawl(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	host = normalizeHost(host)

	// Add domain if it doesn't exist, or reset to pass for crawling
	_, err := c.db.Exec(`
		INSERT INTO domains (host, status, discovered_at, tld)
		VALUES ($1, 'pass', NOW(), $2)
		ON CONFLICT(host) DO UPDATE SET status = 'pass', last_checked_at = NULL, last_crawled_at = NULL, last_error = NULL
	`, host, getTLD(host))
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// Crawl synchronously
	fmt.Printf("Priority crawl: %s\n", host)
	feedsFound, crawlErr := c.crawlHost(host)

	errStr := ""
	if crawlErr != nil {
		errStr = crawlErr.Error()
	}

	// Mark as crawled
	c.markDomainCrawled(host, feedsFound, errStr)

	// Get the feeds we found
	feeds, _ := c.GetFeedsByHost(host)

	type FeedSummary struct {
		URL      string `json:"url"`
		Title    string `json:"title"`
		Type     string `json:"type"`
		Category string `json:"category"`
		Status   string `json:"status"`
	}
	var feedSummaries []FeedSummary
	for _, f := range feeds {
		feedSummaries = append(feedSummaries, FeedSummary{
			URL:      f.URL,
			Title:    f.Title,
			Type:     f.Type,
			Category: f.Category,
			Status:   f.Status,
		})
	}

	result := map[string]interface{}{
		"host":        host,
		"feeds_found": feedsFound,
		"feeds":       feedSummaries,
	}
	if crawlErr != nil {
		result["error"] = crawlErr.Error()
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(result)
}

// handleAPIFilter handles flexible filtering with stackable parameters
func (c *Crawler) handleAPIFilter(w http.ResponseWriter, r *http.Request) {
	tld := r.URL.Query().Get("tld")
	domain := r.URL.Query().Get("domain")
	feedStatus := r.URL.Query().Get("feedStatus")
	domainStatus := r.URL.Query().Get("domainStatus")
	languages := r.URL.Query().Get("languages") // comma-separated list
	show := r.URL.Query().Get("show")           // "feeds" or "domains"
	sort := r.URL.Query().Get("sort")           // "alpha" or "feeds"

	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}

	// Parse languages into slice
	var langList []string
	if languages != "" {
		for _, lang := range strings.Split(languages, ",") {
			lang = strings.TrimSpace(lang)
			if lang != "" {
				langList = append(langList, lang)
			}
		}
	}

	// Determine what to show based on filters
	if show == "" {
		if feedStatus != "" || domain != "" || len(langList) > 0 {
			show = "feeds"
		} else {
			show = "domains"
		}
	}

	if show == "feeds" {
		c.filterFeeds(w, tld, domain, feedStatus, langList, limit, offset)
	} else {
		c.filterDomains(w, tld, domainStatus, sort, limit, offset)
	}
}

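// filterDomains serves the "domains" branch of handleAPIFilter. It builds
// the WHERE clause incrementally, numbering placeholders with argNum so
// the optional tld/status filters stay parameterized, then sorts by feed
// count (sort=feeds) or by TLD and host (default).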
func (c *Crawler) filterDomains(w http.ResponseWriter, tld, status, sort string, limit, offset int) {
	var args []interface{}
	argNum := 1
	query := `
		SELECT d.host, d.tld, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
		FROM domains d
		LEFT JOIN (
			SELECT source_host, COUNT(*) as feed_count
			FROM feeds
			GROUP BY source_host
		) f ON d.host = f.source_host
		WHERE 1=1`

	if tld != "" {
		query += fmt.Sprintf(" AND d.tld = $%d", argNum)
		args = append(args, tld)
		argNum++
	}
	if status != "" {
		query += fmt.Sprintf(" AND d.status = $%d", argNum)
		args = append(args, status)
		argNum++
	}

	// Sort by feed count descending or alphabetically
	if sort == "feeds" {
		query += fmt.Sprintf(" ORDER BY feed_count DESC, d.host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1)
	} else {
		query += fmt.Sprintf(" ORDER BY d.tld ASC, d.host ASC LIMIT $%d OFFSET $%d", argNum, argNum+1)
	}
	args = append(args, limit, offset)

	rows, err := c.db.Query(query, args...)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type DomainInfo struct {
		Host      string `json:"host"`
		TLD       string `json:"tld"`
		Status    string `json:"status"`
		LastError string `json:"last_error,omitempty"`
		FeedCount int    `json:"feed_count"`
	}

	var domains []DomainInfo
	for rows.Next() {
		var d DomainInfo
		var tldVal, lastError *string
		if err := rows.Scan(&d.Host, &tldVal, &d.Status, &lastError, &d.FeedCount); err != nil {
			continue
		}
		d.TLD = StringValue(tldVal)
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"type": "domains",
		"data": domains,
	})
}

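// handleAPITLDDomains lists the domains under one TLD (required "tld"
// query parameter) together with their feed counts.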
func (c *Crawler) handleAPITLDDomains(w http.ResponseWriter, r *http.Request) {
	tld := r.URL.Query().Get("tld")
	if tld == "" {
		http.Error(w, "tld parameter required", http.StatusBadRequest)
		return
	}

	limit := 100
	offset := 0
	if l := r.URL.Query().Get("limit"); l != "" {
		fmt.Sscanf(l, "%d", &limit)
		if limit > 500 {
			limit = 500
		}
	}
	if o := r.URL.Query().Get("offset"); o != "" {
		fmt.Sscanf(o, "%d", &offset)
	}

	rows, err := c.db.Query(`
		SELECT d.host, d.status, d.last_error, COALESCE(f.feed_count, 0) as feed_count
		FROM domains d
		LEFT JOIN (
			SELECT source_host, COUNT(*) as feed_count
			FROM feeds
			GROUP BY source_host
		) f ON d.host = f.source_host
		WHERE d.tld = $1
		ORDER BY d.tld ASC, d.host ASC
		LIMIT $2 OFFSET $3
	`, tld, limit, offset)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type DomainInfo struct {
		Host      string `json:"host"`
		Status    string `json:"status"`
		LastError string `json:"last_error,omitempty"`
		FeedCount int    `json:"feed_count"`
	}

	var domains []DomainInfo
	for rows.Next() {
		var d DomainInfo
		var lastError *string
		if err := rows.Scan(&d.Host, &d.Status, &lastError, &d.FeedCount); err != nil {
			continue
		}
		d.LastError = StringValue(lastError)
		domains = append(domains, d)
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(domains)
}

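// handleAPITLDs lists known TLDs with their domain counts; with
// has_feeds=true it only counts domains that have at least one feed.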
func (c *Crawler) handleAPITLDs(w http.ResponseWriter, r *http.Request) {
	hasFeeds := r.URL.Query().Get("has_feeds") == "true"

	var rows pgx.Rows
	var err error

	if hasFeeds {
		// Only TLDs that have domains with feeds
		rows, err = c.db.Query(`
			SELECT DISTINCT d.tld, COUNT(DISTINCT d.host) as domain_count
			FROM domains d
			INNER JOIN feeds f ON d.host = f.source_host
			WHERE d.tld IS NOT NULL AND d.tld != ''
			GROUP BY d.tld
			ORDER BY d.tld ASC
		`)
	} else {
		// All TLDs
		rows, err = c.db.Query(`
			SELECT tld, COUNT(*) as domain_count
			FROM domains
			WHERE tld IS NOT NULL AND tld != ''
			GROUP BY tld
			ORDER BY tld ASC
		`)
	}
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer rows.Close()

	type TLDInfo struct {
		TLD         string `json:"tld"`
		DomainCount int    `json:"domain_count"`
	}

	var tlds []TLDInfo
	for rows.Next() {
		var t TLDInfo
		if err := rows.Scan(&t.TLD, &t.DomainCount); err != nil {
			continue
		}
		tlds = append(tlds, t)
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(tlds)
}

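// handleAPITLDStats returns the domain and feed totals for a single TLD
// (required "tld" query parameter).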
func (c *Crawler) handleAPITLDStats(w http.ResponseWriter, r *http.Request) {
	tld := r.URL.Query().Get("tld")
	if tld == "" {
		http.Error(w, "tld parameter required", http.StatusBadRequest)
		return
	}

	var domainCount, feedCount int
	err := c.db.QueryRow(`SELECT COUNT(*) FROM domains WHERE tld = $1`, tld).Scan(&domainCount)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	err = c.db.QueryRow(`SELECT COUNT(*) FROM feeds WHERE tld = $1`, tld).Scan(&feedCount)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"tld":          tld,
		"domain_count": domainCount,
		"feed_count":   feedCount,
	})
}

// handleAPIDenyDomain skips a domain (takes down its accounts, preserves data)
func (c *Crawler) handleAPIDenyDomain(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	result := c.skipDomain(host)
	if result.Error != "" {
		http.Error(w, result.Error, http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(result)
}

// DomainActionResult contains the results of a domain action
type DomainActionResult struct {
	Success          bool     `json:"success"`
	Host             string   `json:"host"`
	Action           string   `json:"action"`
	FeedsAffected    int64    `json:"feeds_affected,omitempty"`
	ItemsDeleted     int64    `json:"items_deleted,omitempty"`
	AccountsAffected int      `json:"accounts_affected,omitempty"`
	AccountErrors    []string `json:"account_errors,omitempty"`
	Error            string   `json:"error,omitempty"`
}

// getPDSCredentials loads PDS credentials from environment or pds.env file
func getPDSCredentials() (pdsHost, pdsAdminPassword string) {
	pdsHost = os.Getenv("PDS_HOST")
	pdsAdminPassword = os.Getenv("PDS_ADMIN_PASSWORD")
	if pdsHost == "" || pdsAdminPassword == "" {
		if file, err := os.Open("pds.env"); err == nil {
			scanner := bufio.NewScanner(file)
			for scanner.Scan() {
				line := scanner.Text()
				if strings.HasPrefix(line, "PDS_HOST=") {
					pdsHost = strings.TrimPrefix(line, "PDS_HOST=")
				} else if strings.HasPrefix(line, "PDS_ADMIN_PASSWORD=") {
					pdsAdminPassword = strings.TrimPrefix(line, "PDS_ADMIN_PASSWORD=")
				}
			}
			file.Close()
		}
	}
	return
}

// getDomainDIDs returns all unique publish_account DIDs for a domain's feeds
func (c *Crawler) getDomainDIDs(host string) []string {
	var dids []string
	rows, err := c.db.Query(`
		SELECT DISTINCT publish_account FROM feeds
		WHERE source_host = $1 AND publish_account IS NOT NULL AND publish_account != ''
	`, host)
	if err == nil {
		defer rows.Close()
		for rows.Next() {
			var did string
			if err := rows.Scan(&did); err == nil && did != "" {
				dids = append(dids, did)
			}
		}
	}
	return dids
}

// skipDomain sets a domain to skip, takes down PDS accounts but preserves all data
func (c *Crawler) skipDomain(host string) DomainActionResult {
	result := DomainActionResult{Host: host, Action: "skip"}

	pdsHost, pdsAdminPassword := getPDSCredentials()
	dids := c.getDomainDIDs(host)

	// Takedown PDS accounts (hide content but preserve data)
	if pdsHost != "" && pdsAdminPassword != "" && len(dids) > 0 {
		publisher := NewPublisher(pdsHost)
		for _, did := range dids {
			if err := publisher.TakedownAccount(pdsAdminPassword, did, "domain-skip"); err != nil {
				result.AccountErrors = append(result.AccountErrors, fmt.Sprintf("%s: %v", did, err))
			} else {
				result.AccountsAffected++
			}
		}
	}

	// Mark feeds as skipped (but don't delete)
	feedsAffected, err := c.db.Exec(`
		UPDATE feeds SET status = 'skip', publish_status = 'skip'
		WHERE source_host = $1
	`, host)
	if err != nil {
		result.Error = fmt.Sprintf("failed to update feeds: %v", err)
		return result
	}
	result.FeedsAffected = feedsAffected

	// Update domain status to skip
	_, err = c.db.Exec(`UPDATE domains SET status = 'skip' WHERE host = $1`, host)
	if err != nil {
		result.Error = fmt.Sprintf("failed to update domain status: %v", err)
		return result
	}

	result.Success = true
	return result
}

// handleAPIDropDomain permanently deletes all data for a skipped domain
func (c *Crawler) handleAPIDropDomain(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	// Verify domain is currently skipped
	var status string
	err := c.db.QueryRow(`SELECT status FROM domains WHERE host = $1`, host).Scan(&status)
	if err != nil {
		http.Error(w, "domain not found", http.StatusNotFound)
		return
	}
	if status != "skip" {
		http.Error(w, "domain must be skipped before dropping", http.StatusBadRequest)
		return
	}

	result := c.dropDomain(host)
	if result.Error != "" {
		http.Error(w, result.Error, http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(result)
}

// dropDomain permanently deletes all data for a domain (feeds, items, PDS accounts)
func (c *Crawler) dropDomain(host string) DomainActionResult {
	result := DomainActionResult{Host: host, Action: "drop"}

	pdsHost, pdsAdminPassword := getPDSCredentials()
	dids := c.getDomainDIDs(host)

	// Delete PDS accounts
	if pdsHost != "" && pdsAdminPassword != "" && len(dids) > 0 {
		publisher := NewPublisher(pdsHost)
		for _, did := range dids {
			if err := publisher.DeleteAccount(pdsAdminPassword, did); err != nil {
				result.AccountErrors = append(result.AccountErrors, fmt.Sprintf("%s: %v", did, err))
			} else {
				result.AccountsAffected++
			}
		}
	}

	// Get feed URLs for this domain (needed to delete items)
	var feedURLs []string
	feedRows, err := c.db.Query(`SELECT url FROM feeds WHERE source_host = $1`, host)
	if err == nil {
		defer feedRows.Close()
		for feedRows.Next() {
			var url string
			if err := feedRows.Scan(&url); err == nil {
				feedURLs = append(feedURLs, url)
			}
		}
	}

	// Delete items for all feeds from this domain
	for _, feedURL := range feedURLs {
		deleted, err := c.db.Exec(`DELETE FROM items WHERE feed_url = $1`, feedURL)
		if err == nil {
			result.ItemsDeleted += deleted
		}
	}

	// Delete all feeds from this domain
	feedsDeleted, err := c.db.Exec(`DELETE FROM feeds WHERE source_host = $1`, host)
	if err != nil {
		result.Error = fmt.Sprintf("failed to delete feeds: %v", err)
		return result
	}
	result.FeedsAffected = feedsDeleted

	// Update domain status to drop
	_, err = c.db.Exec(`UPDATE domains SET status = 'drop' WHERE host = $1`, host)
	if err != nil {
		result.Error = fmt.Sprintf("failed to update domain status: %v", err)
		return result
	}

	result.Success = true
	return result
}

// handleAPIUndenyDomain removes skip status from a domain (restores accounts)
func (c *Crawler) handleAPIUndenyDomain(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	if host == "" {
		http.Error(w, "host parameter required", http.StatusBadRequest)
		return
	}

	// Verify domain is currently skipped
	var status string
	err := c.db.QueryRow(`SELECT status FROM domains WHERE host = $1`, host).Scan(&status)
	if err != nil {
		http.Error(w, "domain not found", http.StatusNotFound)
		return
	}
	if status != "skip" {
		http.Error(w, "domain is not skipped", http.StatusBadRequest)
		return
	}

	result := c.restoreDomain(host)
	if result.Error != "" {
		http.Error(w, result.Error, http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(result)
}

// restoreDomain removes skip status and restores PDS accounts
func (c *Crawler) restoreDomain(host string) DomainActionResult {
	result := DomainActionResult{Host: host, Action: "restore"}

	pdsHost, pdsAdminPassword := getPDSCredentials()
	dids := c.getDomainDIDs(host)

	// Restore PDS accounts (remove takedown)
	if pdsHost != "" && pdsAdminPassword != "" && len(dids) > 0 {
		publisher := NewPublisher(pdsHost)
		for _, did := range dids {
			if err := publisher.RestoreAccount(pdsAdminPassword, did); err != nil {
				result.AccountErrors = append(result.AccountErrors, fmt.Sprintf("%s: %v", did, err))
			} else {
				result.AccountsAffected++
			}
		}
	}

	// Restore feeds to pass status
	feedsAffected, err := c.db.Exec(`
		UPDATE feeds SET status = 'pass', publish_status = 'pass'
		WHERE source_host = $1
	`, host)
	if err != nil {
		result.Error = fmt.Sprintf("failed to update feeds: %v", err)
		return result
	}
	result.FeedsAffected = feedsAffected

	// Update domain status back to pass
	_, err = c.db.Exec(`
		UPDATE domains SET status = 'pass', last_error = NULL
		WHERE host = $1
	`, host)
	if err != nil {
		result.Error = fmt.Sprintf("failed to update domain status: %v", err)
		return result
	}

	result.Success = true
	return result
}