111 lines
2.8 KiB
Go
111 lines
2.8 KiB
Go
package sitemap
|
|
|
|
import (
|
|
"compress/gzip"
|
|
"context"
|
|
"encoding/xml"
|
|
"errors"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
|
|
"urlcrawler/internal/urlutil"
|
|
)
|
|
|
|
// ErrNotFound is returned by FetchAll when none of the candidate sitemap
// locations yields any URLs.
var ErrNotFound = errors.New("sitemap not found")
|
|
|
|
// FetchAll attempts to fetch /sitemap.xml and /sitemap_index.xml, parse URLs, and follow indexes.
|
|
func FetchAll(ctx context.Context, target string, client *http.Client, userAgent string) (map[string]struct{}, error) {
|
|
origin := urlutil.Origin(target)
|
|
candidates := []string{origin + "/sitemap.xml", origin + "/sitemap_index.xml"}
|
|
found := make(map[string]struct{})
|
|
var any bool
|
|
for _, u := range candidates {
|
|
urls, err := fetchOne(ctx, u, client, userAgent)
|
|
if err == nil && len(urls) > 0 {
|
|
any = true
|
|
for v := range urls {
|
|
found[v] = struct{}{}
|
|
}
|
|
}
|
|
}
|
|
if !any {
|
|
return found, ErrNotFound
|
|
}
|
|
return found, nil
|
|
}
|
|
|
|
// fetchOne downloads the sitemap document at u and returns the set of page
// URLs it lists. If the document is a sitemap index, every child sitemap it
// references is fetched as well and the results are merged.
//
// An error is returned when the request cannot be built or sent, the response
// status is not 200 OK, the body cannot be read or decompressed, the body is
// larger than the size cap, or the body is not recognizable sitemap XML.
func fetchOne(ctx context.Context, u string, client *http.Client, userAgent string) (map[string]struct{}, error) {
	// Seed the visited set with u so an index that lists itself is not re-fetched.
	return fetchWithSeen(ctx, u, client, userAgent, map[string]struct{}{u: {}})
}

// fetchWithSeen is the implementation behind fetchOne. seen records every
// sitemap URL already requested during the current traversal; it is shared
// with parseWithSeen so that cyclic sitemap indexes terminate.
func fetchWithSeen(ctx context.Context, u string, client *http.Client, userAgent string, seen map[string]struct{}) (map[string]struct{}, error) {
	// Sitemap protocol limit for an uncompressed sitemap (50 MiB). Enforcing it
	// also bounds memory when a hostile server sends a decompression bomb.
	const maxSitemapBytes = 50 << 20

	// u may come straight from a remote <loc> element, so it can be malformed;
	// a discarded error here would leave req nil and panic on the next line.
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", userAgent)

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, errors.New(resp.Status)
	}

	// Detect gzip from the two magic bytes rather than from the URL suffix or
	// Content-Type. The HTTP transport may already have decompressed the body,
	// and a failed gzip.NewReader on the raw body would swallow its first bytes.
	// The peeked bytes are stitched back in front of the remaining stream.
	var magic [2]byte
	n, err := io.ReadFull(resp.Body, magic[:])
	if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
		return nil, err
	}
	var body io.Reader = io.MultiReader(strings.NewReader(string(magic[:n])), resp.Body)
	if n == len(magic) && magic[0] == 0x1f && magic[1] == 0x8b {
		gz, err := gzip.NewReader(body)
		if err != nil {
			return nil, err
		}
		defer gz.Close()
		body = gz
	}

	// Read one byte past the cap so an oversized document is distinguishable
	// from one that is exactly at the limit.
	data, err := io.ReadAll(io.LimitReader(body, maxSitemapBytes+1))
	if err != nil {
		return nil, err
	}
	if len(data) > maxSitemapBytes {
		return nil, errors.New("sitemap exceeds size limit")
	}

	// The payload is always handed to the XML parser regardless of the declared
	// Content-Type; servers frequently mislabel sitemaps, and the parser itself
	// rejects anything that is not sitemap XML.
	return parseWithSeen(ctx, client, userAgent, data, seen)
}

// parseSitemapXML decodes data as either a <urlset> or a <sitemapindex>.
//
// For a urlset it returns the trimmed, non-empty <loc> values. For an index it
// fetches each referenced child sitemap with client and userAgent and returns
// the union of their URLs; children that fail are skipped. If data is neither,
// an empty set and an "unrecognized sitemap XML" error are returned.
func parseSitemapXML(ctx context.Context, client *http.Client, userAgent string, data []byte) (map[string]struct{}, error) {
	return parseWithSeen(ctx, client, userAgent, data, make(map[string]struct{}))
}

// parseWithSeen is the implementation behind parseSitemapXML. seen holds the
// sitemap URLs already requested in this traversal; child sitemaps present in
// it are skipped, and newly followed ones are added to it.
func parseWithSeen(ctx context.Context, client *http.Client, userAgent string, data []byte, seen map[string]struct{}) (map[string]struct{}, error) {
	type entry struct {
		Loc string `xml:"loc"`
	}
	// No XMLName is declared, so one pass over the document fills URLs for a
	// <urlset> root and Sitemaps for a <sitemapindex> root.
	var doc struct {
		URLs     []entry `xml:"url"`
		Sitemaps []entry `xml:"sitemap"`
	}

	found := make(map[string]struct{})
	if err := xml.Unmarshal(data, &doc); err != nil {
		return found, errors.New("unrecognized sitemap XML")
	}

	// A urlset takes precedence, matching the order the formats were tried in before.
	if len(doc.URLs) > 0 {
		for _, e := range doc.URLs {
			if loc := strings.TrimSpace(e.Loc); loc != "" {
				found[loc] = struct{}{}
			}
		}
		return found, nil
	}

	if len(doc.Sitemaps) > 0 {
		for _, e := range doc.Sitemaps {
			// Once the context is done every further request would fail
			// immediately, so stop and keep what was gathered so far.
			if ctx.Err() != nil {
				break
			}
			loc := strings.TrimSpace(e.Loc)
			if loc == "" {
				continue
			}
			if _, dup := seen[loc]; dup {
				// Already requested in this traversal: either a duplicate
				// entry or a cycle between index files.
				continue
			}
			seen[loc] = struct{}{}

			child, err := fetchWithSeen(ctx, loc, client, userAgent, seen)
			if err != nil {
				// Best effort: one broken child must not discard its siblings.
				continue
			}
			for v := range child {
				found[v] = struct{}{}
			}
		}
		return found, nil
	}

	return found, errors.New("unrecognized sitemap XML")
}
|