- Split main.go into separate files for better organization: crawler.go, domain.go, feed.go, parser.go, html.go, util.go
- Add PebbleDB for persistent storage of feeds and domains
- Store feeds with metadata: title, TTL, update frequency, ETag, etc.
- Track domains with crawl status (uncrawled/crawled/error)
- Normalize URLs by stripping scheme and www. prefix
- Add web dashboard on port 4321 with real-time stats:
  - Crawl progress with completion percentage
  - Feed counts by type (RSS/Atom)
  - Top TLDs and domains by feed count
  - Recent feeds table
- Filter out comment feeds from results

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
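The URL normalization mentioned above is implemented in util.go, shown below. As a minimal sketch of the intended round trip (storage form vs. fetch form, per the normalizeURL doc comment), with made-up example values:

// Illustrative sketch only; normalizeURL is defined in util.go below.
func exampleNormalize() {
	stored := normalizeURL("https://www.example.com/feed.xml") // stored form: "example.com/feed.xml"
	_ = "https://" + stored                                    // fetch form:  "https://example.com/feed.xml"
}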
package main

import (
	"net/url"
	"strings"
)

// normalizeURL strips scheme (http/https) and www. prefix to save storage space.
// The normalized URL can be reconstructed with https:// for fetching.
func normalizeURL(rawURL string) string {
	// Remove scheme
	u := rawURL
	if strings.HasPrefix(u, "https://") {
		u = u[8:]
	} else if strings.HasPrefix(u, "http://") {
		u = u[7:]
	}

	// Remove www. prefix
	if strings.HasPrefix(u, "www.") {
		u = u[4:]
	}

	return u
}

// normalizeHost strips www. prefix from a hostname for canonical storage
func normalizeHost(host string) string {
	if strings.HasPrefix(host, "www.") {
		return host[4:]
	}
	return host
}

// reverseHost converts a reverse domain notation back to normal
// e.g., "com.example.www" -> "www.example.com"
func reverseHost(reverseHost string) string {
	parts := strings.Split(reverseHost, ".")
	for i, j := 0, len(parts)-1; i < j; i, j = i+1, j-1 {
		parts[i], parts[j] = parts[j], parts[i]
	}
	return strings.Join(parts, ".")
}

// getTLD extracts the TLD from a hostname
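// Note: it returns only the last dot-separated label, so hosts under a
// multi-part public suffix (e.g. "bbc.co.uk") report "uk".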
func getTLD(host string) string {
	parts := strings.Split(host, ".")
	if len(parts) > 0 {
		return parts[len(parts)-1]
	}
	return ""
}

// makeAbsoluteURL resolves a relative URL against a base URL
func makeAbsoluteURL(href, baseURL string) string {
	base, err := url.Parse(baseURL)
	if err != nil {
		return href
	}

	link, err := url.Parse(href)
	if err != nil {
		return href
	}

	return base.ResolveReference(link).String()
}

// shouldCrawl checks if a link should be crawled (same host as base)
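// Hosts are compared verbatim, so a www. subdomain and its bare domain count as different hosts.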
func shouldCrawl(link, baseURL string) bool {
	linkURL, err := url.Parse(link)
	if err != nil {
		return false
	}

	baseURLParsed, err := url.Parse(baseURL)
	if err != nil {
		return false
	}

	return linkURL.Host == baseURLParsed.Host
}
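For quick reference, the sketch below illustrates what these helpers return for sample inputs, based on the doc comments and implementations above; it is an illustrative addition, not part of the original util.go.

// exampleUtil is an illustrative sketch only; the inputs are made-up sample values.
func exampleUtil() {
	_ = normalizeHost("www.example.com")                                 // "example.com"
	_ = reverseHost("com.example.www")                                   // "www.example.com"
	_ = getTLD("news.example.org")                                       // "org"
	_ = makeAbsoluteURL("/feed.xml", "https://example.com/blog/")        // "https://example.com/feed.xml"
	_ = shouldCrawl("https://example.com/about", "https://example.com/") // true: same host
	_ = shouldCrawl("https://blog.example.com/", "https://example.com/") // false: hosts compared verbatim
}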