crawler/util.go
primal 219b49352e Add PebbleDB storage, domain tracking, and web dashboard
- Split main.go into separate files for better organization:
  crawler.go, domain.go, feed.go, parser.go, html.go, util.go
- Add PebbleDB for persistent storage of feeds and domains
- Store feeds with metadata: title, TTL, update frequency, ETag, etc.
- Track domains with crawl status (uncrawled/crawled/error)
- Normalize URLs by stripping scheme and www. prefix
- Add web dashboard on port 4321 with real-time stats:
  - Crawl progress with completion percentage
  - Feed counts by type (RSS/Atom)
  - Top TLDs and domains by feed count
  - Recent feeds table
- Filter out comment feeds from results

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-22 16:29:00 -05:00
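
A minimal sketch of the stored records the commit message describes, in Go. The struct and constant names here are assumptions for illustration; the actual definitions live in feed.go and domain.go and may differ.

package main

import "time"

// Feed sketches the per-feed metadata record (hypothetical fields).
type Feed struct {
    URL        string        // normalized: scheme and www. prefix stripped
    Title      string
    Type       string        // "rss" or "atom"
    TTL        int           // refresh hint in minutes, e.g. from RSS <ttl>
    UpdateFreq time.Duration // observed update frequency
    ETag       string        // enables conditional GETs on refetch
}

// DomainStatus mirrors the uncrawled/crawled/error states (hypothetical names).
type DomainStatus int

const (
    DomainUncrawled DomainStatus = iota
    DomainCrawled
    DomainError
)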

package main

import (
    "net/url"
    "strings"
)

// normalizeURL strips scheme (http/https) and www. prefix to save storage space.
// The normalized URL can be reconstructed with https:// for fetching.
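// Example: normalizeURL("https://www.example.com/feed.xml") returns "example.com/feed.xml".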
func normalizeURL(rawURL string) string {
    // Remove scheme
    u := rawURL
    if strings.HasPrefix(u, "https://") {
        u = u[8:]
    } else if strings.HasPrefix(u, "http://") {
        u = u[7:]
    }
    // Remove www. prefix
    if strings.HasPrefix(u, "www.") {
        u = u[4:]
    }
    return u
}

// normalizeHost strips www. prefix from a hostname for canonical storage
func normalizeHost(host string) string {
    if strings.HasPrefix(host, "www.") {
        return host[4:]
    }
    return host
}

// reverseHost converts a host in reverse domain notation back to normal order,
// e.g., "com.example.www" -> "www.example.com"
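// Hosts are presumably stored reversed so that keys sharing a TLD sort
// together in PebbleDB's lexicographically ordered keyspace.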
func reverseHost(rev string) string {
    parts := strings.Split(rev, ".")
    for i, j := 0, len(parts)-1; i < j; i, j = i+1, j-1 {
        parts[i], parts[j] = parts[j], parts[i]
    }
    return strings.Join(parts, ".")
}

// getTLD extracts the TLD from a hostname
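// Note: this is the last dot-separated label only, so "blog.example.co.uk"
// yields "uk", and a host with no dots is returned unchanged.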
func getTLD(host string) string {
    // strings.Split never returns an empty slice, so the last element always exists.
    parts := strings.Split(host, ".")
    return parts[len(parts)-1]
}

// makeAbsoluteURL resolves a relative URL against a base URL
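// Example: makeAbsoluteURL("/feed", "https://example.com/blog/") returns "https://example.com/feed".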
func makeAbsoluteURL(href, baseURL string) string {
    base, err := url.Parse(baseURL)
    if err != nil {
        return href
    }
    link, err := url.Parse(href)
    if err != nil {
        return href
    }
    return base.ResolveReference(link).String()
}

// shouldCrawl checks if a link should be crawled (same host as base)
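// Note: the comparison is exact, so "www.example.com" and "example.com" count
// as different hosts, and a relative link (empty Host) always returns false;
// resolve it with makeAbsoluteURL first.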
func shouldCrawl(link, baseURL string) bool {
    linkURL, err := url.Parse(link)
    if err != nil {
        return false
    }
    baseURLParsed, err := url.Parse(baseURL)
    if err != nil {
        return false
    }
    return linkURL.Host == baseURLParsed.Host
}