diff --git a/CLAUDE.md b/CLAUDE.md index c35196e..5f788cb 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -80,7 +80,7 @@ Column naming: snake_case (e.g., `source_host`, `pub_date`, `item_count`) ### Crawl Logic -1. Domain manually approved (status set to 'pass') +1. Domains are imported as `pass` by default (auto-crawled) 2. Check stage: HEAD request verifies domain is reachable, sets last_checked_at 3. Crawl stage: Full recursive crawl (HTTPS, fallback HTTP) 4. Recursive crawl up to MaxDepth=10, MaxPagesPerHost=10 @@ -101,8 +101,17 @@ Status values: `hold` (default/pending review), `pass` (approved), `skip` (rejec 1. **Check stage** - HEAD request to verify domain is reachable 2. **Crawl stage** - Full recursive crawl for feed discovery -Domain status values: `hold` (pending), `pass` (approved), `skip` (rejected), `fail` (error). -Domains starting with a digit (except 1440.news) are auto-skipped. +Domain status values: +- `pass` (default on import) - Domain is crawled and checked automatically +- `hold` (manual) - Pauses crawling, keeps existing feeds and items +- `skip` (manual) - Takes down PDS accounts (hides posts), marks feeds inactive, preserves all data +- `drop` (manual, via button) - Permanently **deletes** all feeds, items, and PDS accounts (requires skip first) +- `fail` (automatic) - Set when check/crawl fails, keeps existing feeds and items + +Skip vs Drop: +- `skip` is reversible - use "un-skip" to restore accounts and resume publishing +- `drop` is permanent - all data is deleted, cannot be recovered +Auto-skip patterns (imported as `skip`): bare TLDs, domains starting with a digit, and domains starting with a letter-dash. Separately, non-English feeds are auto-skipped. 
## AT Protocol Integration diff --git a/api_domains.go b/api_domains.go index dba0eb6..f2f0b8c 100644 --- a/api_domains.go +++ b/api_domains.go @@ -1,9 +1,11 @@ package main import ( + "bufio" "encoding/json" "fmt" "net/http" + "os" "strings" "github.com/jackc/pgx/v5" @@ -326,7 +328,7 @@ func (c *Crawler) handleAPIDomainFeeds(w http.ResponseWriter, r *http.Request) { } // handleAPISetDomainStatus sets the status for a domain -// status must be 'hold', 'pass', 'skip', or 'fail' +// status must be 'hold', 'pass', 'skip', or 'fail' (use /api/dropDomain for 'drop') func (c *Crawler) handleAPISetDomainStatus(w http.ResponseWriter, r *http.Request) { host := r.URL.Query().Get("host") status := r.URL.Query().Get("status") @@ -336,12 +338,24 @@ func (c *Crawler) handleAPISetDomainStatus(w http.ResponseWriter, r *http.Reques return } if status != "hold" && status != "pass" && status != "skip" && status != "fail" { - http.Error(w, "status must be 'hold', 'pass', 'skip', or 'fail'", http.StatusBadRequest) + http.Error(w, "status must be 'hold', 'pass', 'skip', or 'fail' (use /api/dropDomain for permanent deletion)", http.StatusBadRequest) return } host = normalizeHost(host) + // Setting to 'skip' triggers takedown (hide content but preserve data) + if status == "skip" { + result := c.skipDomain(host) + if result.Error != "" { + http.Error(w, result.Error, http.StatusInternalServerError) + return + } + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(result) + return + } + // When setting to pass, clear any last_error var err error if status == "pass" { @@ -707,7 +721,7 @@ func (c *Crawler) handleAPITLDStats(w http.ResponseWriter, r *http.Request) { }) } -// handleAPIDenyDomain skips a domain and all its feeds +// handleAPIDenyDomain skips a domain (takedown accounts, preserve data) func (c *Crawler) handleAPIDenyDomain(w http.ResponseWriter, r *http.Request) { host := r.URL.Query().Get("host") if host == "" { @@ -715,29 +729,199 @@ func (c 
*Crawler) handleAPIDenyDomain(w http.ResponseWriter, r *http.Request) { return } - // Update domain status to skip - _, err := c.db.Exec(`UPDATE domains SET status = 'skip' WHERE host = $1`, host) - if err != nil { - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - - // Skip all feeds from this domain - feedsAffected, err := c.db.Exec(`UPDATE feeds SET publish_status = 'skip', status = 'dead' WHERE source_host = $1`, host) - if err != nil { - http.Error(w, err.Error(), http.StatusInternalServerError) + result := c.skipDomain(host) + if result.Error != "" { + http.Error(w, result.Error, http.StatusInternalServerError) return } w.Header().Set("Content-Type", "application/json") - json.NewEncoder(w).Encode(map[string]interface{}{ - "success": true, - "host": host, - "feeds_skipped": feedsAffected, - }) + json.NewEncoder(w).Encode(result) } -// handleAPIUndenyDomain removes skip status from a domain +// DomainActionResult contains the results of a domain action +type DomainActionResult struct { + Success bool `json:"success"` + Host string `json:"host"` + Action string `json:"action"` + FeedsAffected int64 `json:"feeds_affected,omitempty"` + ItemsDeleted int64 `json:"items_deleted,omitempty"` + AccountsAffected int `json:"accounts_affected,omitempty"` + AccountErrors []string `json:"account_errors,omitempty"` + Error string `json:"error,omitempty"` +} + +// getPDSCredentials loads PDS credentials from environment or pds.env file +func getPDSCredentials() (pdsHost, pdsAdminPassword string) { + pdsHost = os.Getenv("PDS_HOST") + pdsAdminPassword = os.Getenv("PDS_ADMIN_PASSWORD") + if pdsHost == "" || pdsAdminPassword == "" { + if file, err := os.Open("pds.env"); err == nil { + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + if strings.HasPrefix(line, "PDS_HOST=") { + pdsHost = strings.TrimPrefix(line, "PDS_HOST=") + } else if strings.HasPrefix(line, "PDS_ADMIN_PASSWORD=") { + pdsAdminPassword = 
strings.TrimPrefix(line, "PDS_ADMIN_PASSWORD=") + } + } + file.Close() + } + } + return +} + +// getDomainDIDs returns all unique publish_account DIDs for a domain's feeds +func (c *Crawler) getDomainDIDs(host string) []string { + var dids []string + rows, err := c.db.Query(` + SELECT DISTINCT publish_account FROM feeds + WHERE source_host = $1 AND publish_account IS NOT NULL AND publish_account != '' + `, host) + if err == nil { + defer rows.Close() + for rows.Next() { + var did string + if err := rows.Scan(&did); err == nil && did != "" { + dids = append(dids, did) + } + } + } + return dids +} + +// skipDomain sets a domain to skip, takes down PDS accounts but preserves all data +func (c *Crawler) skipDomain(host string) DomainActionResult { + result := DomainActionResult{Host: host, Action: "skip"} + + pdsHost, pdsAdminPassword := getPDSCredentials() + dids := c.getDomainDIDs(host) + + // Takedown PDS accounts (hide content but preserve data) + if pdsHost != "" && pdsAdminPassword != "" && len(dids) > 0 { + publisher := NewPublisher(pdsHost) + for _, did := range dids { + if err := publisher.TakedownAccount(pdsAdminPassword, did, "domain-skip"); err != nil { + result.AccountErrors = append(result.AccountErrors, fmt.Sprintf("%s: %v", did, err)) + } else { + result.AccountsAffected++ + } + } + } + + // Mark feeds as inactive (but don't delete) + feedsAffected, err := c.db.Exec(` + UPDATE feeds SET status = 'inactive', publish_status = 'skip' + WHERE source_host = $1 + `, host) + if err != nil { + result.Error = fmt.Sprintf("failed to update feeds: %v", err) + return result + } + result.FeedsAffected = feedsAffected + + // Update domain status to skip + _, err = c.db.Exec(`UPDATE domains SET status = 'skip' WHERE host = $1`, host) + if err != nil { + result.Error = fmt.Sprintf("failed to update domain status: %v", err) + return result + } + + result.Success = true + return result +} + +// handleAPIDropDomain permanently deletes all data for a skipped domain +func 
(c *Crawler) handleAPIDropDomain(w http.ResponseWriter, r *http.Request) { + host := r.URL.Query().Get("host") + if host == "" { + http.Error(w, "host parameter required", http.StatusBadRequest) + return + } + + // Verify domain is currently skipped + var status string + err := c.db.QueryRow(`SELECT status FROM domains WHERE host = $1`, host).Scan(&status) + if err != nil { + http.Error(w, "domain not found", http.StatusNotFound) + return + } + if status != "skip" { + http.Error(w, "domain must be skipped before dropping", http.StatusBadRequest) + return + } + + result := c.dropDomain(host) + if result.Error != "" { + http.Error(w, result.Error, http.StatusInternalServerError) + return + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(result) +} + +// dropDomain permanently deletes all data for a domain (feeds, items, PDS accounts) +func (c *Crawler) dropDomain(host string) DomainActionResult { + result := DomainActionResult{Host: host, Action: "drop"} + + pdsHost, pdsAdminPassword := getPDSCredentials() + dids := c.getDomainDIDs(host) + + // Delete PDS accounts + if pdsHost != "" && pdsAdminPassword != "" && len(dids) > 0 { + publisher := NewPublisher(pdsHost) + for _, did := range dids { + if err := publisher.DeleteAccount(pdsAdminPassword, did); err != nil { + result.AccountErrors = append(result.AccountErrors, fmt.Sprintf("%s: %v", did, err)) + } else { + result.AccountsAffected++ + } + } + } + + // Get feed URLs for this domain (needed to delete items) + var feedURLs []string + feedRows, err := c.db.Query(`SELECT url FROM feeds WHERE source_host = $1`, host) + if err == nil { + defer feedRows.Close() + for feedRows.Next() { + var url string + if err := feedRows.Scan(&url); err == nil { + feedURLs = append(feedURLs, url) + } + } + } + + // Delete items for all feeds from this domain + for _, feedURL := range feedURLs { + deleted, err := c.db.Exec(`DELETE FROM items WHERE feed_url = $1`, feedURL) + if err == nil { + 
result.ItemsDeleted += deleted + } + } + + // Delete all feeds from this domain + feedsDeleted, err := c.db.Exec(`DELETE FROM feeds WHERE source_host = $1`, host) + if err != nil { + result.Error = fmt.Sprintf("failed to delete feeds: %v", err) + return result + } + result.FeedsAffected = feedsDeleted + + // Update domain status to drop + _, err = c.db.Exec(`UPDATE domains SET status = 'drop' WHERE host = $1`, host) + if err != nil { + result.Error = fmt.Sprintf("failed to update domain status: %v", err) + return result + } + + result.Success = true + return result +} + +// handleAPIUndenyDomain removes skip status from a domain (restores accounts) func (c *Crawler) handleAPIUndenyDomain(w http.ResponseWriter, r *http.Request) { host := r.URL.Query().Get("host") if host == "" { @@ -745,24 +929,68 @@ func (c *Crawler) handleAPIUndenyDomain(w http.ResponseWriter, r *http.Request) return } - // Update domain status back to pass - _, err := c.db.Exec(`UPDATE domains SET status = 'pass' WHERE host = $1 AND status = 'skip'`, host) + // Verify domain is currently skipped + var status string + err := c.db.QueryRow(`SELECT status FROM domains WHERE host = $1`, host).Scan(&status) if err != nil { - http.Error(w, err.Error(), http.StatusInternalServerError) + http.Error(w, "domain not found", http.StatusNotFound) + return + } + if status != "skip" { + http.Error(w, "domain is not skipped", http.StatusBadRequest) return } - // Restore feeds to hold status and active - feedsRestored, err := c.db.Exec(`UPDATE feeds SET publish_status = 'hold', status = 'active' WHERE source_host = $1 AND status = 'dead'`, host) - if err != nil { - http.Error(w, err.Error(), http.StatusInternalServerError) + result := c.restoreDomain(host) + if result.Error != "" { + http.Error(w, result.Error, http.StatusInternalServerError) return } w.Header().Set("Content-Type", "application/json") - json.NewEncoder(w).Encode(map[string]interface{}{ - "success": true, - "host": host, - "feeds_restored": 
feedsRestored, - }) + json.NewEncoder(w).Encode(result) +} + +// restoreDomain removes skip status and restores PDS accounts +func (c *Crawler) restoreDomain(host string) DomainActionResult { + result := DomainActionResult{Host: host, Action: "restore"} + + pdsHost, pdsAdminPassword := getPDSCredentials() + dids := c.getDomainDIDs(host) + + // Restore PDS accounts (remove takedown) + if pdsHost != "" && pdsAdminPassword != "" && len(dids) > 0 { + publisher := NewPublisher(pdsHost) + for _, did := range dids { + if err := publisher.RestoreAccount(pdsAdminPassword, did); err != nil { + result.AccountErrors = append(result.AccountErrors, fmt.Sprintf("%s: %v", did, err)) + } else { + result.AccountsAffected++ + } + } + } + + // Restore feeds to active status + feedsAffected, err := c.db.Exec(` + UPDATE feeds SET status = 'active', publish_status = 'pass' + WHERE source_host = $1 + `, host) + if err != nil { + result.Error = fmt.Sprintf("failed to update feeds: %v", err) + return result + } + result.FeedsAffected = feedsAffected + + // Update domain status back to pass + _, err = c.db.Exec(` + UPDATE domains SET status = 'pass', last_error = NULL + WHERE host = $1 + `, host) + if err != nil { + result.Error = fmt.Sprintf("failed to update domain status: %v", err) + return result + } + + result.Success = true + return result } diff --git a/domain.go b/domain.go index 0c044d2..a15a907 100644 --- a/domain.go +++ b/domain.go @@ -230,7 +230,7 @@ func (c *Crawler) ImportTestDomains(domains []string) { for _, host := range domains { _, err := c.db.Exec(` INSERT INTO domains (host, status, discovered_at, tld) - VALUES ($1, 'hold', $2, $3) + VALUES ($1, 'pass', $2, $3) ON CONFLICT(host) DO NOTHING `, host, now, getTLD(host)) if err != nil { @@ -241,7 +241,7 @@ func (c *Crawler) ImportTestDomains(domains []string) { } } -// ImportDomainsFromFile reads a vertices file and stores new domains as "hold" +// ImportDomainsFromFile reads a vertices file and stores new domains as "pass" 
func (c *Crawler) ImportDomainsFromFile(filename string, limit int) (imported int, skipped int, err error) { file, err := os.Open(filename) if err != nil { @@ -328,7 +328,7 @@ func (c *Crawler) ImportDomainsInBackground(filename string) { // Build rows for copy, applying auto-skip for spam patterns rows := make([][]interface{}, len(domains)) for i, d := range domains { - status := "hold" + status := "pass" if shouldAutoSkipDomain(d.host) { status = "skip" } @@ -347,7 +347,7 @@ func (c *Crawler) ImportDomainsInBackground(filename string) { if err != nil { // Fall back to individual inserts with ON CONFLICT for _, d := range domains { - status := "hold" + status := "pass" if shouldAutoSkipDomain(d.host) { status = "skip" } @@ -436,7 +436,7 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in // Insert with ON CONFLICT, applying auto-skip for spam patterns for _, d := range domains { - status := "hold" + status := "pass" if shouldAutoSkipDomain(d.host) { status = "skip" } diff --git a/pds_records.go b/pds_records.go index 5a4f62d..97adae7 100644 --- a/pds_records.go +++ b/pds_records.go @@ -270,3 +270,80 @@ func (p *Publisher) DeleteAccount(adminPassword, did string) error { return nil } + +// TakedownAccount applies a takedown to an account (hides content, preserves data) +func (p *Publisher) TakedownAccount(adminPassword, did, reason string) error { + payload := map[string]interface{}{ + "subject": map[string]interface{}{ + "$type": "com.atproto.admin.defs#repoRef", + "did": did, + }, + "takedown": map[string]interface{}{ + "applied": true, + "ref": reason, + }, + } + + body, err := json.Marshal(payload) + if err != nil { + return err + } + + req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.admin.updateSubjectStatus", bytes.NewReader(body)) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + req.SetBasicAuth("admin", adminPassword) + + resp, err := p.httpClient.Do(req) + if err != 
nil { + return err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + respBody, _ := io.ReadAll(resp.Body) + return fmt.Errorf("takedown account failed: %s - %s", resp.Status, string(respBody)) + } + + return nil +} + +// RestoreAccount removes a takedown from an account (makes content visible again) +func (p *Publisher) RestoreAccount(adminPassword, did string) error { + payload := map[string]interface{}{ + "subject": map[string]interface{}{ + "$type": "com.atproto.admin.defs#repoRef", + "did": did, + }, + "takedown": map[string]interface{}{ + "applied": false, + }, + } + + body, err := json.Marshal(payload) + if err != nil { + return err + } + + req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.admin.updateSubjectStatus", bytes.NewReader(body)) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + req.SetBasicAuth("admin", adminPassword) + + resp, err := p.httpClient.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + respBody, _ := io.ReadAll(resp.Body) + return fmt.Errorf("restore account failed: %s - %s", resp.Status, string(respBody)) + } + + return nil +} diff --git a/routes.go b/routes.go index bb63414..ee4beb3 100644 --- a/routes.go +++ b/routes.go @@ -142,6 +142,9 @@ func (c *Crawler) StartDashboard(addr string) error { http.HandleFunc("/api/undenyDomain", func(w http.ResponseWriter, r *http.Request) { c.handleAPIUndenyDomain(w, r) }) + http.HandleFunc("/api/dropDomain", func(w http.ResponseWriter, r *http.Request) { + c.handleAPIDropDomain(w, r) + }) http.HandleFunc("/api/tldStats", func(w http.ResponseWriter, r *http.Request) { c.handleAPITLDStats(w, r) }) diff --git a/static/dashboard.js b/static/dashboard.js index 5e18491..14b0205 100644 --- a/static/dashboard.js +++ b/static/dashboard.js @@ -108,6 +108,14 @@ function initDashboard() { // External link html += ``; + // Drop button (only for skipped domains) + if 
(status === 'skip') { + html += ``; + } + html += ''; // Feeds under this domain @@ -270,6 +278,43 @@ function initDashboard() { row.addEventListener('mouseenter', () => row.style.background = '#1a1a1a'); row.addEventListener('mouseleave', () => row.style.background = 'transparent'); + // Drop button handler (for skipped domains) + const dropBtn = row.querySelector('.drop-btn'); + if (dropBtn) { + dropBtn.addEventListener('click', async (e) => { + e.stopPropagation(); + const host = dropBtn.dataset.host; + if (!confirm(`Permanently delete all data for ${host}?\n\nThis will:\n- Delete all PDS accounts\n- Delete all feed items\n- Delete all feeds\n\nThis cannot be undone.`)) { + return; + } + dropBtn.disabled = true; + dropBtn.textContent = '...'; + try { + const resp = await fetch(`/api/dropDomain?host=${encodeURIComponent(host)}`); + if (resp.ok) { + const result = await resp.json(); + // Update status to "drop" visually + block.dataset.status = 'drop'; + const statusGroup = row.querySelector('.status-btn-group'); + if (statusGroup) { + statusGroup.innerHTML = 'dropped'; + } + dropBtn.remove(); + console.log('Drop result:', result); + } else { + alert('Drop failed: ' + await resp.text()); + dropBtn.disabled = false; + dropBtn.textContent = 'drop'; + } + } catch (err) { + console.error('Drop failed:', err); + alert('Drop failed: ' + err.message); + dropBtn.disabled = false; + dropBtn.textContent = 'drop'; + } + }); + } + // Handle inline feed clicks - toggle detail block.querySelectorAll('.inline-feed-block').forEach(feedBlock => { const title = feedBlock.querySelector('.feed-title');