package shared

import (
	"net/url"
	"strings"
)

// NormalizeURL strips the scheme (http:// or https://) and a leading "www."
// prefix to save storage space.
// The normalized URL can be reconstructed with https:// for fetching.
func NormalizeURL(rawURL string) string {
	u := rawURL
	// Remove at most one scheme; the check is case-sensitive.
	switch {
	case strings.HasPrefix(u, "https://"):
		u = strings.TrimPrefix(u, "https://")
	case strings.HasPrefix(u, "http://"):
		u = strings.TrimPrefix(u, "http://")
	}
	// Remove www. prefix
	return strings.TrimPrefix(u, "www.")
}

// NormalizeHost strips a leading "www." prefix from a hostname for canonical
// storage. Hosts without that prefix are returned unchanged.
func NormalizeHost(host string) string {
	return strings.TrimPrefix(host, "www.")
}

// ReverseHost converts a reverse domain notation back to normal by reversing
// the order of the dot-separated labels.
// e.g., "com.example.www" -> "www.example.com"
func ReverseHost(reverseHost string) string {
	parts := strings.Split(reverseHost, ".")
	for i, j := 0, len(parts)-1; i < j; i, j = i+1, j-1 {
		parts[i], parts[j] = parts[j], parts[i]
	}
	return strings.Join(parts, ".")
}

// GetTLD extracts the TLD (the text after the last dot) from a hostname.
// A host without any dot is returned as-is; a host ending in a dot yields "".
func GetTLD(host string) string {
	if idx := strings.LastIndex(host, "."); idx >= 0 {
		return host[idx+1:]
	}
	return host
}

// StripTLD removes the TLD suffix from a hostname
// e.g., "example.com" -> "example", "sub.example.com" -> "sub.example"
//
// A host with no dot, or whose only dot is the first character, is returned
// unchanged.
func StripTLD(host string) string {
	idx := strings.LastIndex(host, ".")
	if idx > 0 {
		return host[:idx]
	}
	return host
}

// GetDomainHost extracts the host part from a full domain (without TLD)
// e.g., "npr.org" -> "npr", "bbc.co.uk" -> "bbc.co"
//
// It is a thin alias for StripTLD: only the last label is removed.
func GetDomainHost(domain string) string {
	return StripTLD(domain)
}

// FullHost reconstructs the full hostname from host and tld
// e.g., ("example", "com") -> "example.com"
//
// When tld is empty the host is returned without a trailing dot.
func FullHost(host, tld string) string {
	if tld == "" {
		return host
	}
	return host + "." + tld
}

// MakeAbsoluteURL resolves a relative URL against a base URL.
// If either baseURL or href fails to parse, href is returned unchanged.
func MakeAbsoluteURL(href, baseURL string) string {
	base, err := url.Parse(baseURL)
	if err != nil {
		return href
	}
	link, err := url.Parse(href)
	if err != nil {
		return href
	}
	return base.ResolveReference(link).String()
}

// SearchQuery represents a parsed search with optional type prefix
type SearchQuery struct {
	Type    string // "all", "domain", "url", "title", "description", "item"
	Pattern string // the search pattern (without prefix)
	// ExactMatch is for domain searches: true if TLD was specified
	// (d:npr.org matches exactly). ParseSearchPrefix does not set it.
	ExactMatch bool

	// For "all" type searches that look like domains, these are populated for additional exact matching
	DomainHost string // e.g., "npr" from "npr.org"
	DomainTLD  string // e.g., "org" from "npr.org"
}

// ParseSearchPrefix parses search prefixes like "a:", "d:", "f:", "t:", "s:", "i:"
// (case-insensitive) and returns a SearchQuery with Type and Pattern set.
//
// Types: "all" (default or a: prefix), "domain" (d:), "url" (f:),
// "title" (t:), "description" (s:), "item" (i:).
//
// The pattern is the query with the prefix removed and surrounding whitespace
// trimmed. For "domain" searches the pattern is returned as typed; no TLD
// extraction happens here. For "all" searches (with or without the a: prefix)
// whose pattern looks like a domain, DomainHost and DomainTLD are also filled.
func ParseSearchPrefix(query string) SearchQuery {
	query = strings.TrimSpace(query)

	searchType, pattern := "all", query

	// Match the prefix on the raw bytes rather than on a lowercased copy:
	// some multi-byte runes lowercase to a single ASCII letter, and slicing
	// the original string at a fixed offset would then cut through a rune.
	if len(query) >= 2 && query[1] == ':' {
		prefixed := true
		switch query[0] {
		case 'a', 'A':
			searchType = "all"
		case 'd', 'D':
			searchType = "domain"
		case 'f', 'F':
			searchType = "url"
		case 't', 'T':
			searchType = "title"
		case 's', 'S':
			searchType = "description"
		case 'i', 'I':
			searchType = "item"
		default:
			prefixed = false
		}
		if prefixed {
			pattern = strings.TrimSpace(query[2:])
		}
	}

	result := SearchQuery{Type: searchType, Pattern: pattern}
	if searchType != "all" {
		return result
	}

	// For "all" type, check if pattern looks like a domain and extract host/tld
	if LooksLikeDomain(pattern) {
		host, tld := ParseSearchTerm(pattern)
		if tld != "" {
			result.DomainHost = host
			result.DomainTLD = tld
		}
	}
	return result
}

// LooksLikeDomain checks if a query looks like a domain name: no spaces, at
// least one dot that is neither the first nor the last character, and a final
// label made of 2-24 ASCII letters (either case).
func LooksLikeDomain(query string) bool {
	// Same 2-24 letter bound that ParseSearchTerm validates a TLD against.
	const minTLDLen, maxTLDLen = 2, 24

	if query == "" || strings.Contains(query, " ") {
		return false
	}

	// Must have at least one dot, with text on both sides of the last one
	lastDot := strings.LastIndex(query, ".")
	if lastDot <= 0 || lastDot == len(query)-1 {
		return false
	}

	tld := query[lastDot+1:]
	if len(tld) < minTLDLen || len(tld) > maxTLDLen {
		return false
	}
	// Any byte outside A-Z/a-z (including every byte of a multi-byte rune)
	// disqualifies the label.
	for i := 0; i < len(tld); i++ {
		c := tld[i]
		isLower := c >= 'a' && c <= 'z'
		isUpper := c >= 'A' && c <= 'Z'
		if !isLower && !isUpper {
			return false
		}
	}
	return true
}

// ParseSearchTerm analyzes a search query and extracts host pattern and optional TLD filter.
// If the search ends with what looks like a TLD (e.g., "example.com"), it splits them.
// Returns (hostPattern, tldFilter) where tldFilter may be empty.
// The TLD candidate is validated on its raw bytes, so only ASCII letters
// qualify; the returned tldFilter is lowercased while hostPattern keeps the
// caller's casing. When no TLD is recognised the whole trimmed search is
// returned as hostPattern.
func ParseSearchTerm(search string) (hostPattern, tldFilter string) {
	search = strings.TrimSpace(search)
	if search == "" {
		return "", ""
	}

	// Check if search contains a dot
	lastDot := strings.LastIndex(search, ".")
	if lastDot == -1 || lastDot == len(search)-1 {
		// No dot or ends with dot - treat as host-only search
		return search, ""
	}

	// Extract potential TLD (part after last dot)
	rawTLD := search[lastDot+1:]

	// Validate TLD: must be 2-24 letters (covers all IANA TLDs)
	if len(rawTLD) < 2 || len(rawTLD) > 24 {
		return search, ""
	}
	// Check the bytes before lowercasing: some non-ASCII runes lowercase to
	// plain ASCII letters and must not be mistaken for a TLD.
	for i := 0; i < len(rawTLD); i++ {
		c := rawTLD[i]
		isLower := c >= 'a' && c <= 'z'
		isUpper := c >= 'A' && c <= 'Z'
		if !isLower && !isUpper {
			// Contains non-letter, not a TLD
			return search, ""
		}
	}

	// Looks like a valid TLD pattern
	return search[:lastDot], strings.ToLower(rawTLD)
}

// ShouldCrawl checks if a link should be crawled (same host as base).
// It returns false if either URL fails to parse. Hosts are compared exactly
// as parsed (including any port), so a relative link with no host only
// matches a base that also has no host.
func ShouldCrawl(link, baseURL string) bool {
	linkURL, err := url.Parse(link)
	if err != nil {
		return false
	}
	baseURLParsed, err := url.Parse(baseURL)
	if err != nil {
		return false
	}
	return linkURL.Host == baseURLParsed.Host
}

// ShouldAutoSkipDomain checks if a domain should be auto-skipped based on patterns.
// It returns true for hosts without a dot, hosts starting with a digit, and
// hosts whose first two characters are a letter followed by a dash.
// 1440.news and its subdomains are never skipped.
func ShouldAutoSkipDomain(host string) bool {
	// Never skip our own domain. Require an exact match or a ".1440.news"
	// suffix so look-alike hosts that merely end in "1440.news" are not exempted.
	if host == "1440.news" || strings.HasSuffix(host, ".1440.news") {
		return false
	}

	// Skip bare TLDs (no dot means it's just "com", "net", etc.)
	if !strings.Contains(host, ".") {
		return true
	}

	// host contains a dot here, so it has at least one byte.
	first := host[0]

	// Skip domains starting with a digit (spam pattern)
	if first >= '0' && first <= '9' {
		return true
	}

	// Skip domains starting with letter-dash (spam pattern, e.g., "a-example.com")
	startsWithLetter := (first >= 'a' && first <= 'z') || (first >= 'A' && first <= 'Z')
	if startsWithLetter && len(host) > 1 && host[1] == '-' {
		return true
	}

	return false
}