gosint-sitecrawl/internal/urlutil/urlutil.go

63 lines
1.3 KiB
Go

package urlutil
import (
"net/url"
"path"
"strings"
)
// Normalize resolves href against baseURL and returns a canonical absolute
// http(s) URL.
//
// Leading and trailing whitespace around href is ignored, the fragment is
// dropped, and the path is cleaned with path.Clean (duplicate slashes are
// collapsed and a trailing slash is removed). An empty path is normalized to
// "/" so that "http://host" and "http://host/" produce the same result.
//
// The boolean result is false (and the string empty) when href is blank, when
// either argument fails to parse, or when the resolved URL is not an http or
// https URL with a non-empty host.
func Normalize(baseURL string, href string) (string, bool) {
	// Attribute values scraped from markup often carry stray whitespace; left
	// in place it would be resolved as part of a relative path.
	href = strings.TrimSpace(href)
	if href == "" {
		return "", false
	}
	// Cheap early exit for common non-navigational schemes. The scheme check
	// after resolution rejects every other non-http(s) scheme as well.
	if strings.HasPrefix(href, "javascript:") || strings.HasPrefix(href, "mailto:") || strings.HasPrefix(href, "tel:") {
		return "", false
	}
	base, err := url.Parse(baseURL)
	if err != nil {
		return "", false
	}
	ref, err := url.Parse(href)
	if err != nil {
		return "", false
	}
	// Resolve relative links against the base.
	u := base.ResolveReference(ref)
	// Only http/https links are of interest.
	if u.Scheme != "http" && u.Scheme != "https" {
		return "", false
	}
	// A host-less URL such as "http:///x" or the opaque "http:x" cannot be
	// fetched, so it is not reported as a valid link.
	if u.Host == "" {
		return "", false
	}
	// Drop fragments; they never reach the server.
	u.Fragment = ""
	// path.Clean("") returns ".", which would serialize as "http://host/.".
	// Map the empty path to the root instead.
	if u.Path == "" {
		u.Path = "/"
	} else {
		u.Path = path.Clean(u.Path)
	}
	return u.String(), true
}
// SameHost reports whether candidate and baseURL have the same Host component
// (hostname plus optional port), compared case-insensitively.
//
// It returns false if either argument cannot be parsed as a URL. Ports are
// compared literally, so an explicit default port does not match an omitted
// one. Two URLs that both lack a host compare as equal.
func SameHost(baseURL string, candidate string) bool {
	baseParsed, baseErr := url.Parse(baseURL)
	candParsed, candErr := url.Parse(candidate)
	if baseErr != nil || candErr != nil {
		return false
	}
	return strings.EqualFold(baseParsed.Host, candParsed.Host)
}
// Origin returns the "scheme://host" prefix of raw, where host includes the
// port if one is present. Userinfo, path, query and fragment are omitted.
//
// If raw cannot be parsed as a URL it is returned unchanged.
func Origin(raw string) string {
	parsed, parseErr := url.Parse(raw)
	if parseErr != nil {
		return raw
	}
	// Only the scheme and host contribute to the result, so the remaining
	// components of the parsed URL are simply left out.
	origin := parsed.Scheme + "://" + parsed.Host
	return origin
}