All files now use `package commons` instead of `package shared` to match the module name and directory. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
243 lines
6.7 KiB
Go
243 lines
6.7 KiB
Go
package commons
|
|
|
|
import (
|
|
"net/url"
|
|
"strings"
|
|
)
|
|
|
|
// NormalizeURL returns rawURL without a leading "https://" or "http://"
// scheme and without a "www." prefix, giving a shorter form for storage.
// Prepending "https://" to the result yields a fetchable URL again.
func NormalizeURL(rawURL string) string {
	// Strip at most one scheme: http:// is only tried when https:// did not match.
	trimmed := strings.TrimPrefix(rawURL, "https://")
	if len(trimmed) == len(rawURL) {
		trimmed = strings.TrimPrefix(rawURL, "http://")
	}

	return strings.TrimPrefix(trimmed, "www.")
}
|
|
|
|
// NormalizeHost returns host without a leading "www." so that both spellings
// of a hostname share one canonical stored form. Only a single leading
// "www." is removed; hosts without it are returned unchanged.
func NormalizeHost(host string) string {
	return strings.TrimPrefix(host, "www.")
}
|
|
|
|
// ReverseHost reverses the order of the dot-separated labels in reverseHost,
// turning reverse domain notation back into a normal hostname,
// e.g. "com.example.www" -> "www.example.com".
// Empty labels (from consecutive dots) are kept in place.
func ReverseHost(reverseHost string) string {
	labels := strings.Split(reverseHost, ".")

	// Write each label into its mirrored position of a fresh slice.
	flipped := make([]string, len(labels))
	for i, label := range labels {
		flipped[len(labels)-1-i] = label
	}

	return strings.Join(flipped, ".")
}
|
|
|
|
// GetTLD returns the last dot-separated label of host, e.g. "com" for
// "example.com" and "uk" for "bbc.co.uk".
//
// A host without any dot is returned unchanged (it is its own last label),
// and a host ending in a dot yields "".
func GetTLD(host string) string {
	// LastIndex returns -1 when there is no dot, so the slice then starts at 0
	// and covers the whole host.
	return host[strings.LastIndex(host, ".")+1:]
}
|
|
|
|
// StripTLD returns host with its final dot-separated label (and that dot)
// removed, e.g. "example.com" -> "example", "sub.example.com" -> "sub.example".
// A host with no dot, or whose only dot is the first character, is returned
// unchanged.
func StripTLD(host string) string {
	cut := strings.LastIndex(host, ".")
	if cut <= 0 {
		// No dot at all, or nothing in front of it: there is no TLD to strip.
		return host
	}
	return host[:cut]
}
|
|
|
|
// GetDomainHost extracts the host part from a full domain (without TLD)
|
|
// e.g., "npr.org" -> "npr", "bbc.co.uk" -> "bbc.co"
|
|
func GetDomainHost(domain string) string {
|
|
return StripTLD(domain)
|
|
}
|
|
|
|
// FullHost joins host and tld with a dot to rebuild the full hostname,
// e.g. ("example", "com") -> "example.com".
// When tld is empty, host is returned as is, with no trailing dot.
func FullHost(host, tld string) string {
	full := host
	if len(tld) > 0 {
		full += "." + tld
	}
	return full
}
|
|
|
|
// MakeAbsoluteURL resolves href against baseURL and returns the resulting URL
// as a string. An href that is already absolute is returned in its parsed and
// re-serialized form. If either baseURL or href fails to parse, href is
// returned unchanged.
func MakeAbsoluteURL(href, baseURL string) string {
	base, err := url.Parse(baseURL)
	if err != nil {
		return href
	}

	// URL.Parse parses href and resolves it in the context of base.
	resolved, err := base.Parse(href)
	if err != nil {
		return href
	}

	return resolved.String()
}
|
|
|
|
// SearchQuery represents a parsed search with an optional type prefix.
// ParseSearchPrefix fills in Type and Pattern, plus DomainHost and DomainTLD
// for unprefixed queries that look like a domain.
type SearchQuery struct {
	Type       string // "all", "domain", "url", "title", "description", "item"
	Pattern    string // the search pattern (without prefix)
	ExactMatch bool   // for domain searches: true if TLD was specified (d:npr.org matches exactly)
	// NOTE(review): ParseSearchPrefix in this file never sets ExactMatch;
	// presumably callers set it after inspecting Pattern. Confirm.

	// For "all" type searches that look like domains, these are populated for additional exact matching
	DomainHost string // e.g., "npr" from "npr.org"
	DomainTLD  string // e.g., "org" from "npr.org"
}
|
|
|
|
// ParseSearchPrefix parses search prefixes like "a:", "d:", "f:", "t:", "s:", "i:"
|
|
// Returns SearchQuery with Type and Pattern
|
|
// Types: "all" (default or a: prefix), "domain" (d:, extracts TLD from pattern),
|
|
//
|
|
// "url" (f:), "title" (t:), "description" (s:), "item" (i:)
|
|
func ParseSearchPrefix(query string) SearchQuery {
|
|
query = strings.TrimSpace(query)
|
|
if query == "" {
|
|
return SearchQuery{Type: "all", Pattern: ""}
|
|
}
|
|
|
|
// Check for prefixes (case-insensitive)
|
|
lower := strings.ToLower(query)
|
|
if strings.HasPrefix(lower, "a:") {
|
|
return SearchQuery{Type: "all", Pattern: strings.TrimSpace(query[2:])}
|
|
}
|
|
if strings.HasPrefix(lower, "d:") {
|
|
return SearchQuery{Type: "domain", Pattern: strings.TrimSpace(query[2:])}
|
|
}
|
|
if strings.HasPrefix(lower, "f:") {
|
|
return SearchQuery{Type: "url", Pattern: strings.TrimSpace(query[2:])}
|
|
}
|
|
if strings.HasPrefix(lower, "t:") {
|
|
return SearchQuery{Type: "title", Pattern: strings.TrimSpace(query[2:])}
|
|
}
|
|
if strings.HasPrefix(lower, "s:") {
|
|
return SearchQuery{Type: "description", Pattern: strings.TrimSpace(query[2:])}
|
|
}
|
|
if strings.HasPrefix(lower, "i:") {
|
|
return SearchQuery{Type: "item", Pattern: strings.TrimSpace(query[2:])}
|
|
}
|
|
|
|
// For "all" type, check if pattern looks like a domain and extract host/tld
|
|
result := SearchQuery{Type: "all", Pattern: query}
|
|
if LooksLikeDomain(query) {
|
|
host, tld := ParseSearchTerm(query)
|
|
if tld != "" {
|
|
result.DomainHost = host
|
|
result.DomainTLD = tld
|
|
}
|
|
}
|
|
return result
|
|
}
|
|
|
|
// LooksLikeDomain reports whether query has the shape of a domain name.
//
// It returns true when query is non-empty, contains no space character, has
// at least one character before its last dot, and the label after the last
// dot consists of 2-24 ASCII letters in either case. The 24-letter ceiling
// covers the longest IANA TLDs, so long TLDs such as "technology" are
// recognized. No check is made that the TLD actually exists.
func LooksLikeDomain(query string) bool {
	const minTLDLen, maxTLDLen = 2, 24

	if query == "" || strings.Contains(query, " ") {
		return false
	}

	// The last dot must exist and be neither the first nor the last character.
	lastDot := strings.LastIndex(query, ".")
	if lastDot <= 0 || lastDot == len(query)-1 {
		return false
	}

	tld := query[lastDot+1:]
	if len(tld) < minTLDLen || len(tld) > maxTLDLen {
		return false
	}
	for _, c := range tld {
		isLower := c >= 'a' && c <= 'z'
		isUpper := c >= 'A' && c <= 'Z'
		if !isLower && !isUpper {
			return false
		}
	}
	return true
}
|
|
|
|
// ParseSearchTerm splits a search string into a host pattern and an optional
// TLD filter.
//
// The input is trimmed first. If the text after the last dot, lowercased, is
// 2-24 letters a-z, it is returned as tldFilter and everything before that dot
// as hostPattern (e.g. "example.com" -> "example", "com"). In every other
// case (no dot, trailing dot, or a suffix that is not TLD-shaped) the whole
// trimmed string is returned as hostPattern with an empty tldFilter.
func ParseSearchTerm(search string) (hostPattern, tldFilter string) {
	search = strings.TrimSpace(search)

	// An empty string has no dot, so it falls into this host-only case too.
	dot := strings.LastIndex(search, ".")
	if dot < 0 || dot == len(search)-1 {
		return search, ""
	}

	suffix := strings.ToLower(search[dot+1:])
	if n := len(suffix); n < 2 || n > 24 {
		return search, ""
	}

	// Any character outside a-z disqualifies the suffix as a TLD.
	notLowerLetter := func(r rune) bool { return r < 'a' || r > 'z' }
	if strings.IndexFunc(suffix, notLowerLetter) >= 0 {
		return search, ""
	}

	return search[:dot], suffix
}
|
|
|
|
// ShouldCrawl reports whether link points at the same host as baseURL and
// should therefore be crawled.
//
// Both arguments are parsed as URLs; if either fails to parse the result is
// false. The comparison uses the URL host including any port and ignores
// letter case, because hostnames are case-insensitive and url.Parse keeps
// the case as written. A relative link has an empty host, so it only matches
// a base whose host is empty as well; resolve links to absolute form first.
func ShouldCrawl(link, baseURL string) bool {
	linkURL, err := url.Parse(link)
	if err != nil {
		return false
	}

	baseURLParsed, err := url.Parse(baseURL)
	if err != nil {
		return false
	}

	return strings.EqualFold(linkURL.Host, baseURLParsed.Host)
}
|
|
|
|
// ShouldAutoSkipDomain reports whether host matches a pattern that should be
// skipped automatically.
//
// Rules, applied in order:
//   - "1440.news" and its subdomains are never skipped. The match is on a
//     label boundary, so unrelated names that merely end in the same
//     characters (e.g. "91440.news") get no exemption.
//   - A host without any dot (a bare TLD such as "com", or "") is skipped.
//   - A host starting with a digit is skipped (spam pattern).
//   - A host starting with a letter followed by '-' is skipped
//     (spam pattern, e.g. "a-example.com").
//
// Everything else is kept.
func ShouldAutoSkipDomain(host string) bool {
	const ownDomain = "1440.news"

	// Must run before the digit rule: our own domain starts with a digit.
	if host == ownDomain || strings.HasSuffix(host, "."+ownDomain) {
		return false
	}

	if !strings.Contains(host, ".") {
		return true
	}

	// host contains a dot here, so it has at least one byte.
	first := host[0]
	if first >= '0' && first <= '9' {
		return true
	}

	firstIsLetter := (first >= 'a' && first <= 'z') || (first >= 'A' && first <= 'Z')
	if firstIsLetter && len(host) > 1 && host[1] == '-' {
		return true
	}

	return false
}
|