- Replace StartCDXImportLoop with StartCDXMonthlyLoop (runs on 1st of month)
- Enable StartFeedCheckLoop, StartCleanupLoop, StartMaintenanceLoop
- Remove domain check/crawl loops (CDX provides verified feeds)
- Remove vertices.txt import functions (CDX is now sole feed source)
- Remove HTML extraction functions (extractFeedLinks, extractAnchorFeeds, etc.)
- Remove unused helpers (shouldCrawl, makeAbsoluteURL, GetFeedCountByHost)
- Simplify Crawler struct (remove MaxDepth, visited, domain counters)
- Add CDX progress tracking to database and dashboard
- Net removal: ~720 lines of unused code

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
32 lines
688 B
YAML
32 lines
688 B
YAML
# Service definitions for this Compose project.
services:
  crawler:
    # Build the image from the Dockerfile in this directory and tag it with
    # the name given in `image`.
    build: .
    image: atproto-1440news-crawler
    container_name: atproto-1440news-crawler
    restart: unless-stopped
    # Give the process up to 30s to exit after the stop signal before it is
    # killed.
    stop_grace_period: 30s
    # Custom DNS server for the container.
    # NOTE(review): presumably a resolver on the atproto network - confirm.
    dns: 172.20.0.53
    # Additional environment variables are loaded from this file.
    env_file:
      - pds.env
    environment:
      DB_HOST: infra-postgres
      # Quoted so the value stays a string instead of a YAML integer.
      DB_PORT: "5432"
      DB_USER: dba_1440_news
      # Path at which Compose mounts the `db_password` secret in the container.
      DB_PASSWORD_FILE: /run/secrets/db_password
      DB_NAME: db_1440_news
    secrets:
      - db_password
    volumes:
      # Read-only bind mount.
      # NOTE(review): the accompanying change description says the
      # vertices.txt import was removed; this mount may be obsolete - confirm
      # before deleting it.
      - ./vertices.txt.gz:/app/vertices.txt.gz:ro
      # Read-write bind mount of the local cdx-data directory.
      - ./cdx-data:/app/cdx-data
    networks:
      - atproto

# Secrets made available to services that list them under `secrets`.
secrets:
  db_password:
    # Contents of this host file become the secret value; the relative path
    # is resolved from the directory containing this Compose file.
    file: ../../../infra/postgres/secrets/dba_1440_news_password.txt

# Networks referenced by the services in this file.
networks:
  atproto:
    # The network is managed outside this Compose project: Compose does not
    # create it and expects it to exist already.
    external: true