crawler/util.go
primal 219b49352e Add PebbleDB storage, domain tracking, and web dashboard
- Split main.go into separate files for better organization:
  crawler.go, domain.go, feed.go, parser.go, html.go, util.go
- Add PebbleDB for persistent storage of feeds and domains
- Store feeds with metadata: title, TTL, update frequency, ETag, etc.
- Track domains with crawl status (uncrawled/crawled/error)
- Normalize URLs by stripping scheme and www. prefix
- Add web dashboard on port 4321 with real-time stats:
  - Crawl progress with completion percentage
  - Feed counts by type (RSS/Atom)
  - Top TLDs and domains by feed count
  - Recent feeds table
- Filter out comment feeds from results

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-22 16:29:00 -05:00
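
A minimal sketch of the stored records the commit message describes, in Go. The struct and constant names here are assumptions for illustration; the actual definitions live in feed.go and domain.go and may differ.

package main

import "time"

// Feed sketches the per-feed metadata record (hypothetical fields).
type Feed struct {
    URL        string        // normalized: scheme and www. prefix stripped
    Title      string
    Type       string        // "rss" or "atom"
    TTL        int           // refresh hint in minutes, e.g. from RSS <ttl>
    UpdateFreq time.Duration // observed update frequency
    ETag       string        // enables conditional GETs on refetch
}

// DomainStatus mirrors the uncrawled/crawled/error states (hypothetical names).
type DomainStatus int

const (
    DomainUncrawled DomainStatus = iota
    DomainCrawled
    DomainError
)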

package main

import (
    "net/url"
    "strings"
)

// normalizeURL strips scheme (http/https) and www. prefix to save storage space.
// The normalized URL can be reconstructed with https:// for fetching.
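// Example: normalizeURL("https://www.example.com/feed.xml") returns "example.com/feed.xml".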
func normalizeURL(rawURL string) string {
    // Remove scheme
    u := rawURL
    if strings.HasPrefix(u, "https://") {
        u = u[8:]
    } else if strings.HasPrefix(u, "http://") {
        u = u[7:]
    }
    // Remove www. prefix
    if strings.HasPrefix(u, "www.") {
        u = u[4:]
    }
    return u
}

// normalizeHost strips www. prefix from a hostname for canonical storage
func normalizeHost(host string) string {
    if strings.HasPrefix(host, "www.") {
        return host[4:]
    }
    return host
}

// reverseHost converts a host in reverse domain notation back to normal order,
// e.g., "com.example.www" -> "www.example.com"
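// Hosts are presumably stored reversed so that keys sharing a TLD sort
// together in PebbleDB's lexicographically ordered keyspace.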
func reverseHost(rev string) string {
    parts := strings.Split(rev, ".")
    for i, j := 0, len(parts)-1; i < j; i, j = i+1, j-1 {
        parts[i], parts[j] = parts[j], parts[i]
    }
    return strings.Join(parts, ".")
}

// getTLD extracts the TLD from a hostname
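// Note: this is the last dot-separated label only, so "blog.example.co.uk"
// yields "uk", and a host with no dots is returned unchanged.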
func getTLD(host string) string {
    // strings.Split never returns an empty slice, so the last element always exists.
    parts := strings.Split(host, ".")
    return parts[len(parts)-1]
}

// makeAbsoluteURL resolves a relative URL against a base URL
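// Example: makeAbsoluteURL("/feed", "https://example.com/blog/") returns "https://example.com/feed".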
func makeAbsoluteURL(href, baseURL string) string {
    base, err := url.Parse(baseURL)
    if err != nil {
        return href
    }
    link, err := url.Parse(href)
    if err != nil {
        return href
    }
    return base.ResolveReference(link).String()
}

// shouldCrawl checks if a link should be crawled (same host as base)
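// Note: the comparison is exact, so "www.example.com" and "example.com" count
// as different hosts, and a relative link (empty Host) always returns false;
// resolve it with makeAbsoluteURL first.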
func shouldCrawl(link, baseURL string) bool {
    linkURL, err := url.Parse(link)
    if err != nil {
        return false
    }
    baseURLParsed, err := url.Parse(baseURL)
    if err != nil {
        return false
    }
    return linkURL.Host == baseURLParsed.Host
}