Restore working codebase with all methods
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
+91 -41
@@ -1,9 +1,11 @@
 package main
 
 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"io"
+	"net"
 	"net/http"
 	"os"
 	"strings"
@@ -15,23 +17,22 @@ import (
 )
 
 type Crawler struct {
-	MaxDepth           int
-	MaxPagesPerHost    int
-	Timeout            time.Duration
-	UserAgent          string
-	visited            sync.Map
-	feedsMu            sync.Mutex
-	client             *http.Client
-	hostsProcessed     int32
-	feedsChecked       int32
-	startTime          time.Time
-	db                 *DB
-	displayedCrawlRate int
-	displayedCheckRate int
-	domainsImported    int32
-	cachedStats        *DashboardStats
-	cachedAllDomains   []DomainStat
-	statsMu            sync.RWMutex
+	MaxDepth         int
+	MaxPagesPerHost  int
+	Timeout          time.Duration
+	UserAgent        string
+	visited          sync.Map
+	feedsMu          sync.Mutex
+	client           *http.Client
+	domainsCrawled   int32 // feed_crawl: domains crawled for feed discovery
+	domainsChecked   int32 // domain_check: domains checked for liveness
+	feedsChecked     int32 // feed_check: feeds checked for new items
+	startTime        time.Time
+	db               *DB
+	domainsImported  int32
+	cachedStats      *DashboardStats
+	cachedAllDomains []DomainStat
+	statsMu          sync.RWMutex
 }
 
 func NewCrawler(connString string) (*Crawler, error) {
@@ -467,43 +468,92 @@ func (c *Crawler) GetAllUnpublishedItems(limit int) ([]Item, error) {
 	return items, nil
 }
 
-// StartCrawlLoop runs the domain crawling loop independently
-func (c *Crawler) StartCrawlLoop() {
-	numWorkers := 100
+// dnsResolver uses local caching DNS (infra-dns) with fallback to system
+var dnsResolver = &net.Resolver{
+	PreferGo: true,
+	Dial: func(ctx context.Context, network, address string) (net.Conn, error) {
+		d := net.Dialer{Timeout: 2 * time.Second}
+		// Try local caching DNS first (CoreDNS on proxy network)
+		conn, err := d.DialContext(ctx, "udp", "infra-dns:53")
+		if err == nil {
+			return conn, nil
+		}
+		// Fallback to system DNS
+		return d.DialContext(ctx, network, address)
+	},
+}
+
+// domainCheck performs a DNS lookup to check if a domain resolves
+func (c *Crawler) domainCheck(host string) error {
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer cancel()
+	_, err := dnsResolver.LookupHost(ctx, host)
+	return err
+}
+
+// StartDomainLoop runs the domain processing loop (domain_check + feed_crawl)
+func (c *Crawler) StartDomainLoop() {
+	numWorkers := 1000
 
 	// Buffered channel for domain work
-	workChan := make(chan *Domain, 100)
+	workChan := make(chan *Domain, 1000)
 
 	// Start workers
 	for i := 0; i < numWorkers; i++ {
 		go func() {
 			for domain := range workChan {
-				feedsFound, crawlErr := c.crawlHost(domain.Host)
-				errStr := ""
-				if crawlErr != nil {
-					errStr = crawlErr.Error()
-				}
-				if err := c.markDomainCrawled(domain.Host, feedsFound, errStr); err != nil {
-					fmt.Printf("Error marking domain %s as crawled: %v\n", domain.Host, err)
+				fh := domain.FullHost()
+				if domain.CrawledAt.Equal(DomainStateUnchecked) {
+					// domain_check: DNS lookup for liveness
+					err := c.domainCheck(fh)
+					errStr := ""
+					if err != nil {
+						errStr = err.Error()
+					}
+					if err := c.markDomainChecked(domain.Host, domain.TLD, errStr); err != nil {
+						fmt.Printf("Error marking domain %s as checked: %v\n", fh, err)
+					}
+					atomic.AddInt32(&c.domainsChecked, 1)
+				} else {
+					// feed_crawl: crawl domain to discover feeds
+					feedsFound, crawlErr := c.feedCrawl(fh)
+					errStr := ""
+					if crawlErr != nil {
+						errStr = crawlErr.Error()
+					}
+					if err := c.markDomainCrawled(domain.Host, domain.TLD, feedsFound, errStr); err != nil {
+						fmt.Printf("Error marking domain %s as crawled: %v\n", fh, err)
+					}
+					atomic.AddInt32(&c.domainsCrawled, 1)
 				}
 			}
 		}()
 	}
 
-	const fetchSize = 100
+	const fetchSize = 1000
 	for {
-		domains, err := c.GetDomainsToCrawl(fetchSize)
+		domains, err := c.GetDomainsToProcess(fetchSize)
 		if err != nil {
-			fmt.Printf("Error fetching domains to crawl: %v\n", err)
+			fmt.Printf("Error fetching domains to process: %v\n", err)
 		}
 
 		if len(domains) == 0 {
-			c.displayedCrawlRate = 0
 			time.Sleep(1 * time.Second)
 			continue
 		}
 
-		fmt.Printf("%s crawl: %d domains to crawl\n", time.Now().Format("15:04:05"), len(domains))
+		// Count unchecked vs checked for logging
+		unchecked := 0
+		for _, d := range domains {
+			if d.CrawledAt.Equal(DomainStateUnchecked) {
+				unchecked++
+			}
+		}
+		checked := len(domains) - unchecked
+
+		if unchecked > 0 || checked > 0 {
+			fmt.Printf("%s domain: %d domain_check, %d feed_crawl\n", time.Now().Format("15:04:05"), unchecked, checked)
+		}
 
 		for _, domain := range domains {
 			workChan <- domain
@@ -513,12 +563,12 @@ func (c *Crawler) StartCrawlLoop() {
 	}
 }
 
-// StartCheckLoop runs the feed checking loop independently
-func (c *Crawler) StartCheckLoop() {
-	numWorkers := 100
+// StartFeedCheckLoop runs the feed_check loop (checking feeds for new items)
+func (c *Crawler) StartFeedCheckLoop() {
+	numWorkers := 1000
 
 	// Buffered channel for feed work
-	workChan := make(chan *Feed, 100)
+	workChan := make(chan *Feed, 1000)
 
 	// Start workers
 	for i := 0; i < numWorkers; i++ {
@@ -537,12 +587,11 @@ func (c *Crawler) StartCheckLoop() {
 		}
 
 		if len(feeds) == 0 {
-			c.displayedCheckRate = 0
 			time.Sleep(1 * time.Second)
 			continue
 		}
 
-		fmt.Printf("%s check: %d feeds to check\n", time.Now().Format("15:04:05"), len(feeds))
+		fmt.Printf("%s feed_check: %d feeds\n", time.Now().Format("15:04:05"), len(feeds))
 
 		for _, feed := range feeds {
 			workChan <- feed
@@ -552,8 +601,9 @@ func (c *Crawler) StartCheckLoop() {
 	}
 }
 
-func (c *Crawler) crawlHost(host string) (feedsFound int, err error) {
-	atomic.AddInt32(&c.hostsProcessed, 1)
+// feedCrawl crawls a domain to discover RSS/Atom feeds
+func (c *Crawler) feedCrawl(host string) (feedsFound int, err error) {
+	atomic.AddInt32(&c.domainsCrawled, 1)
 
 	localVisited := make(map[string]bool)
 	pagesVisited := 0
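
For context, a minimal sketch (not part of this commit) of how the two loops introduced above might be wired together. NewCrawler, StartDomainLoop, and StartFeedCheckLoop come from the diff; the DATABASE_URL environment variable and the log-based error handling are assumptions for illustration.

package main

import (
	"log"
	"os"
)

func main() {
	// Assumption: the connection string arrives via an environment
	// variable; NewCrawler(connString) is the constructor shown above.
	crawler, err := NewCrawler(os.Getenv("DATABASE_URL"))
	if err != nil {
		log.Fatalf("failed to initialize crawler: %v", err)
	}

	// domain_check + feed_crawl run on one loop, feed_check on another;
	// both loop forever, so the second call keeps main alive.
	go crawler.StartDomainLoop()
	crawler.StartFeedCheckLoop()
}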
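
A design note on the dnsResolver added here: a UDP DialContext succeeds without sending any packets, so the fallback branch is taken only when the name infra-dns itself cannot be resolved (e.g. when running off the proxy network), not when the CoreDNS instance is merely unresponsive. Below is a self-contained sketch of the same pattern; 1.1.1.1:53 stands in for the commit's infra-dns:53 (an assumption, chosen so the example runs anywhere).

package main

import (
	"context"
	"fmt"
	"net"
	"time"
)

// Same custom-resolver-with-fallback pattern as the commit's dnsResolver.
var resolver = &net.Resolver{
	PreferGo: true,
	Dial: func(ctx context.Context, network, address string) (net.Conn, error) {
		d := net.Dialer{Timeout: 2 * time.Second}
		// Preferred DNS server first (1.1.1.1 substituted for infra-dns)
		conn, err := d.DialContext(ctx, "udp", "1.1.1.1:53")
		if err == nil {
			return conn, nil
		}
		// Fallback: dial the address the Go resolver chose itself
		return d.DialContext(ctx, network, address)
	},
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	addrs, err := resolver.LookupHost(ctx, "example.com")
	fmt.Println(addrs, err)
}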