Add exact domain matching for domain-like search queries
When searching for a domain-like pattern such as "npr.org", the search now also matches the exact domain (host=npr, tld=org), in addition to the existing text search across domain names, feed URLs, titles, and descriptions.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
+1112
-112
File diff suppressed because it is too large
Load Diff
@@ -51,6 +51,25 @@ func getTLD(host string) string {
|
||||
return ""
|
||||
}
|
||||
|
||||
// stripTLD drops everything from the last dot onward in a hostname.
//
//	"example.com"     -> "example"
//	"sub.example.com" -> "sub.example"
//
// A host with no dot, or whose only dot is the very first byte, is
// returned unchanged.
func stripTLD(host string) string {
	dot := strings.LastIndex(host, ".")
	if dot <= 0 {
		// No dot at all, or nothing in front of it: nothing to strip.
		return host
	}
	return host[:dot]
}
|
||||
|
||||
// fullHost joins a host and a TLD back into a single hostname,
// e.g. ("example", "com") -> "example.com".
// When tld is empty the host is returned on its own, with no trailing dot.
func fullHost(host, tld string) string {
	if tld != "" {
		return host + "." + tld
	}
	return host
}
|
||||
|
||||
// makeAbsoluteURL resolves a relative URL against a base URL
|
||||
func makeAbsoluteURL(href, baseURL string) string {
|
||||
base, err := url.Parse(baseURL)
|
||||
@@ -66,6 +85,119 @@ func makeAbsoluteURL(href, baseURL string) string {
|
||||
return base.ResolveReference(link).String()
|
||||
}
|
||||
|
||||
// SearchQuery is the parsed form of a user search: which kind of search
// to run, the pattern to search for, and optional domain-matching hints.
type SearchQuery struct {
	// Type selects the search kind: "all", "domain", "url", "title",
	// "description", or "item".
	Type string

	// Pattern is the search text with any type prefix removed.
	Pattern string

	// ExactMatch applies to domain searches: true if a TLD was specified
	// (e.g. d:npr.org matches exactly).
	// NOTE(review): parseSearchPrefix does not set this field — confirm
	// where it is populated.
	ExactMatch bool

	// DomainHost and DomainTLD are filled in for "all" searches whose
	// pattern looks like a domain, to allow an additional exact match.
	// For "npr.org", DomainHost is "npr".
	DomainHost string

	// DomainTLD is the TLD part of a domain-like pattern,
	// e.g. "org" from "npr.org".
	DomainTLD string
}
|
||||
|
||||
// parseSearchPrefix parses search prefixes like "a:", "d:", "f:", "t:", "s:", "i:"
|
||||
// Returns SearchQuery with Type and Pattern
|
||||
// Types: "all" (default or a: prefix), "domain" (d:, extracts TLD from pattern),
|
||||
// "url" (f:), "title" (t:), "description" (s:), "item" (i:)
|
||||
func parseSearchPrefix(query string) SearchQuery {
|
||||
query = strings.TrimSpace(query)
|
||||
if query == "" {
|
||||
return SearchQuery{Type: "all", Pattern: ""}
|
||||
}
|
||||
|
||||
// Check for prefixes (case-insensitive)
|
||||
lower := strings.ToLower(query)
|
||||
if strings.HasPrefix(lower, "a:") {
|
||||
return SearchQuery{Type: "all", Pattern: strings.TrimSpace(query[2:])}
|
||||
}
|
||||
if strings.HasPrefix(lower, "d:") {
|
||||
return SearchQuery{Type: "domain", Pattern: strings.TrimSpace(query[2:])}
|
||||
}
|
||||
if strings.HasPrefix(lower, "f:") {
|
||||
return SearchQuery{Type: "url", Pattern: strings.TrimSpace(query[2:])}
|
||||
}
|
||||
if strings.HasPrefix(lower, "t:") {
|
||||
return SearchQuery{Type: "title", Pattern: strings.TrimSpace(query[2:])}
|
||||
}
|
||||
if strings.HasPrefix(lower, "s:") {
|
||||
return SearchQuery{Type: "description", Pattern: strings.TrimSpace(query[2:])}
|
||||
}
|
||||
if strings.HasPrefix(lower, "i:") {
|
||||
return SearchQuery{Type: "item", Pattern: strings.TrimSpace(query[2:])}
|
||||
}
|
||||
|
||||
// For "all" type, check if pattern looks like a domain and extract host/tld
|
||||
result := SearchQuery{Type: "all", Pattern: query}
|
||||
if looksLikeDomain(query) {
|
||||
host, tld := parseSearchTerm(query)
|
||||
if tld != "" {
|
||||
result.DomainHost = host
|
||||
result.DomainTLD = tld
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// looksLikeDomain reports whether query has the shape of a domain name.
//
// It returns true only when all of the following hold:
//   - query is non-empty and contains no space character;
//   - query contains a dot that is neither its first nor its last byte;
//   - the text after the last dot (the TLD) is 2-24 ASCII letters, in
//     either case.
//
// The 2-24 bound is the same one parseSearchTerm applies when it splits a
// TLD off a search term, so a domain with a long TLD is accepted by both.
func looksLikeDomain(query string) bool {
	if query == "" || strings.Contains(query, " ") {
		return false
	}

	// Need a dot with something on both sides of it.
	lastDot := strings.LastIndex(query, ".")
	if lastDot <= 0 || lastDot == len(query)-1 {
		return false
	}

	tld := query[lastDot+1:]
	if len(tld) < 2 || len(tld) > 24 {
		return false
	}
	for _, c := range tld {
		isLower := c >= 'a' && c <= 'z'
		isUpper := c >= 'A' && c <= 'Z'
		if !isLower && !isUpper {
			return false
		}
	}
	return true
}
|
||||
|
||||
// parseSearchTerm splits a search term into a host pattern and an optional
// TLD filter.
//
// The term is trimmed first. If the text after its last dot is 2-24 ASCII
// letters (compared after lowercasing), that text is returned lowercased as
// tldFilter and everything before the dot is returned as hostPattern, e.g.
// "example.com" -> ("example", "com"). In every other case the whole trimmed
// term is returned as hostPattern and tldFilter is empty.
func parseSearchTerm(search string) (hostPattern, tldFilter string) {
	term := strings.TrimSpace(search)
	if term == "" {
		return "", ""
	}

	// Without a dot, or with only a trailing dot, there is no TLD candidate.
	dot := strings.LastIndex(term, ".")
	if dot < 0 || dot == len(term)-1 {
		return term, ""
	}

	suffix := strings.ToLower(term[dot+1:])

	// 2-24 letters covers all IANA TLDs.
	if n := len(suffix); n < 2 || n > 24 {
		return term, ""
	}
	notLowerLetter := func(r rune) bool { return r < 'a' || r > 'z' }
	if strings.IndexFunc(suffix, notLowerLetter) >= 0 {
		// Something other than a-z in the suffix: not a TLD.
		return term, ""
	}

	return term[:dot], suffix
}
|
||||
|
||||
// shouldCrawl checks if a link should be crawled (same host as base)
|
||||
func shouldCrawl(link, baseURL string) bool {
|
||||
linkURL, err := url.Parse(link)
|
||||
|
||||
Reference in New Issue
Block a user