// crawler/util.go

package main

import (
	"regexp"
	"strings"
)

// normalizeURL returns rawURL in its compact storage form: a leading
// "https://" or "http://" is dropped (at most one of them), and then a leading
// "www." is dropped. Matching is case-sensitive, and anything that does not
// start with those exact prefixes is returned unchanged.
// Callers can rebuild a fetchable URL by prepending "https://".
func normalizeURL(rawURL string) string {
	rest := rawURL

	// Only the first matching scheme is removed, so "https://http://x"
	// becomes "http://x" rather than "x".
	for _, scheme := range []string{"https://", "http://"} {
		if strings.HasPrefix(rest, scheme) {
			rest = rest[len(scheme):]
			break
		}
	}

	return strings.TrimPrefix(rest, "www.")
}

// normalizeHost returns host without a single leading "www." so that
// "www.example.com" and "example.com" are stored under the same key.
// The match is case-sensitive; hosts without that prefix are returned as-is.
func normalizeHost(host string) string {
	return strings.TrimPrefix(host, "www.")
}

// reverseHost flips the order of the dot-separated labels of its argument,
// turning reverse domain notation back into a regular hostname.
// e.g., "com.example.www" -> "www.example.com"
// Empty labels are kept in place, so "a..b" becomes "b..a".
func reverseHost(reverseHost string) string {
	labels := strings.Split(reverseHost, ".")
	count := len(labels)

	// Write each label into its mirrored position in a fresh slice.
	flipped := make([]string, count)
	for pos, label := range labels {
		flipped[count-1-pos] = label
	}
	return strings.Join(flipped, ".")
}

// getTLD returns everything after the last "." in host.
// e.g., "example.com" -> "com", "bbc.co.uk" -> "uk"
// No public-suffix knowledge is applied: only the final label is returned.
// A host without any dot is returned whole, and a host ending in "." yields "".
func getTLD(host string) string {
	// LastIndex is -1 when there is no dot, which makes the slice start at 0
	// and hands back the entire host.
	return host[strings.LastIndex(host, ".")+1:]
}

// stripTLD drops the last dot-separated label (and its dot) from host.
// e.g., "example.com" -> "example", "sub.example.com" -> "sub.example"
// The host is returned unchanged when it contains no dot or when its only
// dot is the very first character (e.g. ".com").
func stripTLD(host string) string {
	dot := strings.LastIndex(host, ".")
	if dot <= 0 {
		return host
	}
	return host[:dot]
}

// getDomainHost returns the host portion of a full domain, i.e. the domain
// with its final label removed. It is a thin wrapper around stripTLD.
// e.g., "npr.org" -> "npr", "bbc.co.uk" -> "bbc.co"
func getDomainHost(domain string) string {
	hostPart := stripTLD(domain)
	return hostPart
}

// fullHost joins a host part and a TLD back into a complete hostname.
// e.g., ("example", "com") -> "example.com"
// When tld is empty the host is returned on its own, without a trailing dot.
func fullHost(host, tld string) string {
	if tld != "" {
		return host + "." + tld
	}
	return host
}

// SearchQuery is the parsed form of a user search string: which kind of
// search was requested and the pattern with any one-letter type prefix
// removed. Values are produced by parseSearchPrefix.
//
// NOTE(review): parseSearchPrefix never assigns ExactMatch, so it is always
// false on the values it returns; presumably a caller sets it after splitting
// a domain pattern into host and TLD — confirm against the call sites.
type SearchQuery struct {
	Type       string // "all", "domain", "url", "title", "description", "item"
	Pattern    string // the search pattern (without prefix), whitespace-trimmed
	ExactMatch bool   // for domain searches: true if TLD was specified (d:npr.org matches exactly)
	// Domain hints for additional exact matching. parseSearchPrefix fills them in when an unprefixed query looks like a domain and a TLD can be split off; the d:, f:, t:, s: and i: search types leave them empty.
	DomainHost string // e.g., "npr" from "npr.org"
	DomainTLD  string // e.g., "org" from "npr.org"
}

// parseSearchPrefix turns a raw search string into a SearchQuery.
//
// Surrounding whitespace is ignored. If the query starts with one of the
// ASCII letters below (either case) immediately followed by ':', that prefix
// selects the search type and the remainder, whitespace-trimmed, becomes the
// pattern:
//
//	a: -> "all"     d: -> "domain"        f: -> "url"
//	t: -> "title"   s: -> "description"   i: -> "item"
//
// Any other input, including the empty string, is an "all" search whose
// pattern is the whole trimmed query.
//
// For "all" searches (unprefixed or written with a:) whose pattern looks like
// a domain name, DomainHost and DomainTLD are additionally filled in so the
// caller can also try an exact domain match. For "domain" searches the
// pattern is returned as typed; it is not split into host and TLD here and
// ExactMatch is not set.
func parseSearchPrefix(query string) SearchQuery {
	query = strings.TrimSpace(query)
	if query == "" {
		return SearchQuery{Type: "all", Pattern: ""}
	}

	// Look at the raw bytes instead of a lowercased copy: strings.ToLower can
	// change the byte length of a string (e.g. "İ" lowercases to "i"), which
	// would both mis-detect a prefix and make the fixed [2:] slice cut in the
	// wrong place.
	prefixType := ""
	if len(query) >= 2 && query[1] == ':' {
		switch query[0] {
		case 'a', 'A':
			prefixType = "all"
		case 'd', 'D':
			prefixType = "domain"
		case 'f', 'F':
			prefixType = "url"
		case 't', 'T':
			prefixType = "title"
		case 's', 'S':
			prefixType = "description"
		case 'i', 'I':
			prefixType = "item"
		}
	}

	pattern := query
	if prefixType != "" {
		pattern = strings.TrimSpace(query[2:])
		if prefixType != "all" {
			return SearchQuery{Type: prefixType, Pattern: pattern}
		}
	}

	// "all" search: attach domain hints when the pattern looks like a domain,
	// regardless of whether "all" was implied or requested with a:.
	result := SearchQuery{Type: "all", Pattern: pattern}
	if looksLikeDomain(pattern) {
		if host, tld := parseSearchTerm(pattern); tld != "" {
			result.DomainHost = host
			result.DomainTLD = tld
		}
	}
	return result
}

// looksLikeDomain reports whether query has the rough shape of a domain name:
// no spaces, at least one dot that is neither the first nor the last
// character, and a final label made of 2-24 ASCII letters (either case).
// The length bound is the same 2-24 range parseSearchTerm uses for TLDs, so
// long TLDs such as "technology" are recognised. This is a syntactic check
// only; nothing is validated against a list of real TLDs.
func looksLikeDomain(query string) bool {
	const (
		minTLDLen = 2
		maxTLDLen = 24
	)

	if query == "" || strings.Contains(query, " ") {
		return false
	}

	// Need a dot with something on both sides of it.
	lastDot := strings.LastIndex(query, ".")
	if lastDot <= 0 || lastDot == len(query)-1 {
		return false
	}

	tld := query[lastDot+1:]
	if len(tld) < minTLDLen || len(tld) > maxTLDLen {
		return false
	}
	for _, c := range tld {
		isLower := c >= 'a' && c <= 'z'
		isUpper := c >= 'A' && c <= 'Z'
		if !isLower && !isUpper {
			return false
		}
	}
	return true
}

// parseSearchTerm splits a search term into a host pattern and an optional
// TLD filter. After trimming whitespace, the text following the last dot is
// treated as a TLD when, once lowercased, it consists of 2-24 letters a-z;
// in that case the part before the dot is returned as hostPattern and the
// lowercased suffix as tldFilter (e.g. "Example.COM" -> "Example", "com").
// Otherwise the whole trimmed term is returned as hostPattern and tldFilter
// is empty. A term that starts with its only dot, such as ".com", yields an
// empty hostPattern and "com" as the filter.
func parseSearchTerm(search string) (hostPattern, tldFilter string) {
	term := strings.TrimSpace(search)

	// Covers the empty term, a term without any dot, and a trailing dot.
	dot := strings.LastIndex(term, ".")
	if dot < 0 || dot+1 == len(term) {
		return term, ""
	}

	candidate := strings.ToLower(term[dot+1:])
	if n := len(candidate); n < 2 || n > 24 {
		return term, ""
	}

	// Any character outside a-z means the suffix is not a TLD.
	notLowerLetter := func(r rune) bool { return r < 'a' || r > 'z' }
	if strings.IndexFunc(candidate, notLowerLetter) != -1 {
		return term, ""
	}

	return term[:dot], candidate
}

// Patterns used by stripHTML, compiled once at package initialisation instead
// of on every call.
var (
	htmlTagPattern       = regexp.MustCompile(`<[^>]*>`)
	whitespaceRunPattern = regexp.MustCompile(`\s+`)
)

// htmlEntityDecoder decodes the handful of entities stripHTML understands.
// A Replacer makes a single left-to-right pass, so text produced by one
// replacement is never decoded a second time ("&amp;lt;" becomes "&lt;",
// not "<").
var htmlEntityDecoder = strings.NewReplacer(
	"&amp;", "&",
	"&lt;", "<",
	"&gt;", ">",
	"&quot;", "\"",
	"&#39;", "'",
	"&nbsp;", " ",
)

// stripHTML converts an HTML fragment to plain text. It removes every
// "<...>" tag (nothing is inserted in its place), decodes the entities
// &amp; &lt; &gt; &quot; &#39; and &nbsp;, collapses each run of whitespace
// into a single space, and trims leading and trailing whitespace. Other
// entities are left as written.
func stripHTML(s string) string {
	// Tags go first so that decoded "&lt;"/"&gt;" text is not mistaken for
	// markup and removed.
	s = htmlTagPattern.ReplaceAllString(s, "")
	s = htmlEntityDecoder.Replace(s)
	s = whitespaceRunPattern.ReplaceAllString(s, " ")
	return strings.TrimSpace(s)
}

// truncateString limits s to at most maxLen bytes. Strings that already fit
// are returned unchanged. Longer strings are cut and "..." is appended, with
// the ellipsis counted against maxLen. The cut never lands inside a
// multi-byte UTF-8 character: it moves back to the start of that character,
// so valid input stays valid (the result may then be slightly shorter than
// maxLen). When maxLen is below 3 there is no room for the ellipsis and the
// string is simply cut to maxLen bytes; a maxLen of zero or less yields "".
func truncateString(s string, maxLen int) string {
	if len(s) <= maxLen {
		return s
	}

	const ellipsis = "..."
	keep, suffix := maxLen-len(ellipsis), ellipsis
	if keep < 0 {
		// Too short for the ellipsis: plain cut, never a negative bound.
		keep, suffix = maxLen, ""
		if keep < 0 {
			keep = 0
		}
	}

	// keep < len(s) here, so s[keep] is in range. Bytes of the form 10xxxxxx
	// are UTF-8 continuation bytes; step back until the cut sits on the first
	// byte of a character.
	for keep > 0 && s[keep]&0xC0 == 0x80 {
		keep--
	}
	return s[:keep] + suffix
}

// cleanDescription strips HTML and truncates for storage
func cleanDescription(s string) string {
	s = stripHTML(s)
	return truncateString(s, 300)
}