Add PebbleDB storage, domain tracking, and web dashboard
- Split main.go into separate files for better organization: crawler.go, domain.go, feed.go, parser.go, html.go, util.go
- Add PebbleDB for persistent storage of feeds and domains
- Store feeds with metadata: title, TTL, update frequency, ETag, etc.
- Track domains with crawl status (uncrawled/crawled/error)
- Normalize URLs by stripping scheme and www. prefix
- Add web dashboard on port 4321 with real-time stats:
  - Crawl progress with completion percentage
  - Feed counts by type (RSS/Atom)
  - Top TLDs and domains by feed count
  - Recent feeds table
- Filter out comment feeds from results

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
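The diff below only covers the URL helpers, so as a rough illustration of the storage side described above, here is a minimal sketch of serializing a feed record and writing it to PebbleDB under its normalized URL. This is not the committed code: the Feed fields, the "feed:" key prefix, and the "data" directory are assumptions.

package main

import (
    "encoding/json"
    "log"

    "github.com/cockroachdb/pebble"
)

// Feed is a hypothetical record; the real struct in feed.go may differ.
type Feed struct {
    URL   string `json:"url"`   // normalized: scheme and www. stripped
    Title string `json:"title"`
    Type  string `json:"type"`  // "rss" or "atom"
    ETag  string `json:"etag"`
}

func main() {
    db, err := pebble.Open("data", &pebble.Options{})
    if err != nil {
        log.Fatal(err)
    }
    defer db.Close()

    f := Feed{URL: "example.com/feed.xml", Title: "Example", Type: "rss"}
    val, err := json.Marshal(f)
    if err != nil {
        log.Fatal(err)
    }
    // Key by normalized URL so http/https and www. variants collapse to one entry.
    if err := db.Set([]byte("feed:"+f.URL), val, pebble.Sync); err != nil {
        log.Fatal(err)
    }
}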
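The dashboard on port 4321 is likewise not part of this file. One minimal way such a stats endpoint could look is sketched below; the /stats route, the counter fields, and the sample values are made up for illustration.

package main

import (
    "encoding/json"
    "log"
    "net/http"
)

// crawlStats mirrors the kind of numbers the dashboard reports; names are hypothetical.
type crawlStats struct {
    DomainsCrawled int `json:"domains_crawled"`
    DomainsTotal   int `json:"domains_total"`
    RSSFeeds       int `json:"rss_feeds"`
    AtomFeeds      int `json:"atom_feeds"`
}

func main() {
    http.HandleFunc("/stats", func(w http.ResponseWriter, r *http.Request) {
        // In the real crawler these counters would come from PebbleDB or in-memory state.
        s := crawlStats{DomainsCrawled: 42, DomainsTotal: 100, RSSFeeds: 7, AtomFeeds: 3}
        w.Header().Set("Content-Type", "application/json")
        json.NewEncoder(w).Encode(s)
    })
    log.Fatal(http.ListenAndServe(":4321", nil))
}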
@@ -0,0 +1,82 @@
package main

import (
    "net/url"
    "strings"
)

// normalizeURL strips scheme (http/https) and www. prefix to save storage space.
// The normalized URL can be reconstructed with https:// for fetching.
func normalizeURL(rawURL string) string {
    // Remove scheme
    u := rawURL
    if strings.HasPrefix(u, "https://") {
        u = u[8:]
    } else if strings.HasPrefix(u, "http://") {
        u = u[7:]
    }

    // Remove www. prefix
    if strings.HasPrefix(u, "www.") {
        u = u[4:]
    }

    return u
}

// normalizeHost strips www. prefix from a hostname for canonical storage
func normalizeHost(host string) string {
    if strings.HasPrefix(host, "www.") {
        return host[4:]
    }
    return host
}

// reverseHost converts a reverse domain notation back to normal
// e.g., "com.example.www" -> "www.example.com"
func reverseHost(reverseHost string) string {
    parts := strings.Split(reverseHost, ".")
    for i, j := 0, len(parts)-1; i < j; i, j = i+1, j-1 {
        parts[i], parts[j] = parts[j], parts[i]
    }
    return strings.Join(parts, ".")
}

// getTLD extracts the TLD from a hostname
func getTLD(host string) string {
    parts := strings.Split(host, ".")
    if len(parts) > 0 {
        return parts[len(parts)-1]
    }
    return ""
}

// makeAbsoluteURL resolves a relative URL against a base URL
func makeAbsoluteURL(href, baseURL string) string {
    base, err := url.Parse(baseURL)
    if err != nil {
        return href
    }

    link, err := url.Parse(href)
    if err != nil {
        return href
    }

    return base.ResolveReference(link).String()
}

// shouldCrawl checks if a link should be crawled (same host as base)
func shouldCrawl(link, baseURL string) bool {
    linkURL, err := url.Parse(link)
    if err != nil {
        return false
    }

    baseURLParsed, err := url.Parse(baseURL)
    if err != nil {
        return false
    }

    return linkURL.Host == baseURLParsed.Host
}
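For reference, a quick illustration of how these helpers compose during a crawl. The URLs are made up and this snippet is not part of the commit; it assumes it lives in the same package as util.go and that "fmt" is imported.

// demoHelpers exercises the util.go functions above with sample values.
func demoHelpers() {
    raw := "https://www.example.com/blog/"
    fmt.Println(normalizeURL(raw))                 // example.com/blog/
    fmt.Println(normalizeHost("www.example.com"))  // example.com
    fmt.Println(reverseHost("com.example.www"))    // www.example.com
    fmt.Println(getTLD("example.com"))             // com
    fmt.Println(makeAbsoluteURL("/feed.xml", raw)) // https://www.example.com/feed.xml
    fmt.Println(shouldCrawl("https://www.example.com/about", raw)) // true (same host)
}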