package main

import (
	"net/url"
	"strings"
)

// normalizeURL strips the scheme (http/https) and a leading "www." prefix
// to save storage space. The normalized URL can be reconstructed with
// "https://" for fetching.
func normalizeURL(rawURL string) string {
	u := rawURL
	// Remove the scheme; only one of the two prefixes is stripped,
	// matching the original else-if semantics.
	switch {
	case strings.HasPrefix(u, "https://"):
		u = strings.TrimPrefix(u, "https://")
	case strings.HasPrefix(u, "http://"):
		u = strings.TrimPrefix(u, "http://")
	}
	// Remove the www. prefix.
	return strings.TrimPrefix(u, "www.")
}

// normalizeHost strips a leading "www." prefix from a hostname for
// canonical storage.
func normalizeHost(host string) string {
	return strings.TrimPrefix(host, "www.")
}

// reverseHost converts reverse domain notation back to normal order,
// e.g. "com.example.www" -> "www.example.com".
func reverseHost(rev string) string {
	parts := strings.Split(rev, ".")
	for i, j := 0, len(parts)-1; i < j; i, j = i+1, j-1 {
		parts[i], parts[j] = parts[j], parts[i]
	}
	return strings.Join(parts, ".")
}

// getTLD extracts the TLD (the label after the last dot) from a hostname.
// A hostname with no dot (e.g. "localhost") is returned unchanged, and an
// empty string yields "".
func getTLD(host string) string {
	if i := strings.LastIndex(host, "."); i >= 0 {
		return host[i+1:]
	}
	return host
}

// makeAbsoluteURL resolves a (possibly relative) href against a base URL.
// If either URL fails to parse, href is returned unmodified as a
// best-effort fallback.
func makeAbsoluteURL(href, baseURL string) string {
	base, err := url.Parse(baseURL)
	if err != nil {
		return href
	}
	link, err := url.Parse(href)
	if err != nil {
		return href
	}
	return base.ResolveReference(link).String()
}

// shouldCrawl reports whether link points at the same host as baseURL.
// Host names are compared case-insensitively, since DNS names are
// case-insensitive and url.Parse preserves the host's original case.
// Unparseable URLs are never crawled.
func shouldCrawl(link, baseURL string) bool {
	linkURL, err := url.Parse(link)
	if err != nil {
		return false
	}
	baseURLParsed, err := url.Parse(baseURL)
	if err != nil {
		return false
	}
	return strings.EqualFold(linkURL.Host, baseURLParsed.Host)
}