gosint-sitecrawl/internal/sitemap/sitemap.go

package sitemap

import (
	"compress/gzip"
	"context"
	"encoding/xml"
	"errors"
	"io"
	"net/http"
	"strings"

	"urlcrawler/internal/urlutil"
)

// ErrNotFound is returned when neither sitemap candidate could be retrieved
// from the target.
var ErrNotFound = errors.New("sitemap not found")

// FetchAll attempts to fetch /sitemap.xml and /sitemap_index.xml, parse URLs, and follow indexes.
func FetchAll(ctx context.Context, target string, client *http.Client, userAgent string) (map[string]struct{}, error) {
	origin := urlutil.Origin(target)
	candidates := []string{origin + "/sitemap.xml", origin + "/sitemap_index.xml"}
	found := make(map[string]struct{})
	var ok bool
	for _, u := range candidates {
		// Fetch errors are deliberately swallowed: either candidate may be
		// absent, and the caller only needs to know whether any sitemap
		// yielded URLs.
		urls, err := fetchOne(ctx, u, client, userAgent)
		if err == nil && len(urls) > 0 {
			ok = true
			for v := range urls {
				found[v] = struct{}{}
			}
		}
	}
	if !ok {
		return found, ErrNotFound
	}
	return found, nil
}

// fetchOne downloads a single sitemap URL, transparently decompressing gzipped
// payloads, and returns the URLs it lists.
func fetchOne(ctx context.Context, u string, client *http.Client, userAgent string) (map[string]struct{}, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", userAgent)
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, errors.New(resp.Status)
	}
	var r io.Reader = resp.Body
	if strings.HasSuffix(strings.ToLower(u), ".gz") || strings.Contains(strings.ToLower(resp.Header.Get("Content-Type")), "gzip") {
		if gz, err := gzip.NewReader(resp.Body); err == nil {
			r = gz
			defer gz.Close()
		}
	}
	data, err := io.ReadAll(r)
	if err != nil {
		return nil, err
	}
	// Content-Type headers for sitemaps are unreliable in the wild, so the body
	// is always handed to the XML parser, which rejects non-sitemap documents.
	return parseSitemapXML(ctx, client, userAgent, data)
}
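
// parseSitemapXML handles both document shapes defined by the sitemaps.org
// protocol: a <urlset> of page URLs and a <sitemapindex> pointing at child
// sitemaps. Minimal illustrative examples of each shape (not taken from any
// real site):
//
//	<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
//	  <url><loc>https://example.com/page</loc></url>
//	</urlset>
//
//	<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
//	  <sitemap><loc>https://example.com/sitemap-posts.xml</loc></sitemap>
//	</sitemapindex>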
func parseSitemapXML(ctx context.Context, client *http.Client, userAgent string, data []byte) (map[string]struct{}, error) {
	type urlEntry struct {
		Loc string `xml:"loc"`
	}
	type urlSet struct {
		URLs []urlEntry `xml:"url"`
	}
	type indexEntry struct {
		Loc string `xml:"loc"`
	}
	type siteIndex struct {
		Sitemaps []indexEntry `xml:"sitemap"`
	}
	found := make(map[string]struct{})

	// First try urlset
	var us urlSet
	if err := xml.Unmarshal(data, &us); err == nil && len(us.URLs) > 0 {
		for _, e := range us.URLs {
			loc := strings.TrimSpace(e.Loc)
			if loc != "" {
				found[loc] = struct{}{}
			}
		}
		return found, nil
	}

	// Then try index
	var si siteIndex
	if err := xml.Unmarshal(data, &si); err == nil && len(si.Sitemaps) > 0 {
		for _, e := range si.Sitemaps {
			loc := strings.TrimSpace(e.Loc)
			if loc == "" {
				continue
			}
			child, err := fetchOne(ctx, loc, client, userAgent)
			if err == nil {
				for v := range child {
					found[v] = struct{}{}
				}
			}
		}
		return found, nil
	}

	return found, errors.New("unrecognized sitemap XML")
}
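
// Usage sketch (the caller-side wiring below is illustrative and not part of
// this package; the target URL and user-agent string are placeholders):
//
//	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
//	defer cancel()
//	urls, err := sitemap.FetchAll(ctx, "https://example.com", http.DefaultClient, "gosint-sitecrawl/0.1")
//	if errors.Is(err, sitemap.ErrNotFound) {
//		// No sitemap advertised; fall back to crawling from the start page.
//	}
//	for u := range urls {
//		// Seed the crawl queue with u.
//	}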