// Package sitemap discovers and parses XML sitemaps for a target site.
package sitemap

import (
	"compress/gzip"
	"context"
	"encoding/xml"
	"errors"
	"io"
	"net/http"
	"strings"

	"urlcrawler/internal/urlutil"
)

// ErrNotFound is returned when none of the candidate sitemaps yields any URLs.
var ErrNotFound = errors.New("sitemap not found")

// FetchAll attempts to fetch /sitemap.xml and /sitemap_index.xml, parse URLs, and follow indexes.
func FetchAll(ctx context.Context, target string, client *http.Client, userAgent string) (map[string]struct{}, error) {
	origin := urlutil.Origin(target)
	candidates := []string{origin + "/sitemap.xml", origin + "/sitemap_index.xml"}

	found := make(map[string]struct{})
	var any bool
	for _, u := range candidates {
		urls, err := fetchOne(ctx, u, client, userAgent)
		if err == nil && len(urls) > 0 {
			any = true
			for v := range urls {
				found[v] = struct{}{}
			}
		}
	}
	if !any {
		return found, ErrNotFound
	}
	return found, nil
}

// fetchOne downloads a single sitemap URL, transparently decompressing gzipped
// payloads, and returns the set of URLs it contains.
func fetchOne(ctx context.Context, u string, client *http.Client, userAgent string) (map[string]struct{}, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", userAgent)

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, errors.New(resp.Status)
	}

	// Decompress when the sitemap is served as a .gz file or with a gzip
	// content type. If gzip.NewReader fails, fall back to the raw body.
	var r io.Reader = resp.Body
	lowerURL := strings.ToLower(u)
	if strings.HasSuffix(lowerURL, ".gz") ||
		strings.Contains(strings.ToLower(resp.Header.Get("Content-Type")), "gzip") {
		if gz, err := gzip.NewReader(resp.Body); err == nil {
			defer gz.Close()
			r = gz
		}
	}

	data, err := io.ReadAll(r)
	if err != nil {
		return nil, err
	}

	// Accept responses that declare an XML content type or whose URL looks
	// like a (possibly gzipped) XML sitemap; reject anything else.
	ct := strings.ToLower(resp.Header.Get("Content-Type"))
	if strings.Contains(ct, "/xml") || strings.Contains(ct, "+xml") ||
		strings.HasSuffix(lowerURL, ".xml") || strings.HasSuffix(lowerURL, ".xml.gz") {
		return parseSitemapXML(ctx, client, userAgent, data)
	}
	return nil, errors.New("unsupported sitemap content-type")
}

// parseSitemapXML parses either a <urlset> document or a <sitemapindex>
// document. For an index, each referenced child sitemap is fetched and its
// URLs are merged into the result.
func parseSitemapXML(ctx context.Context, client *http.Client, userAgent string, data []byte) (map[string]struct{}, error) {
	type urlEntry struct {
		Loc string `xml:"loc"`
	}
	type urlSet struct {
		URLs []urlEntry `xml:"url"`
	}
	type indexEntry struct {
		Loc string `xml:"loc"`
	}
	type siteIndex struct {
		Sitemaps []indexEntry `xml:"sitemap"`
	}

	found := make(map[string]struct{})

	// First try a <urlset> document.
	var us urlSet
	if err := xml.Unmarshal(data, &us); err == nil && len(us.URLs) > 0 {
		for _, e := range us.URLs {
			loc := strings.TrimSpace(e.Loc)
			if loc != "" {
				found[loc] = struct{}{}
			}
		}
		return found, nil
	}

	// Then try a <sitemapindex> document and follow each child sitemap.
	var si siteIndex
	if err := xml.Unmarshal(data, &si); err == nil && len(si.Sitemaps) > 0 {
		for _, e := range si.Sitemaps {
			loc := strings.TrimSpace(e.Loc)
			if loc == "" {
				continue
			}
			child, err := fetchOne(ctx, loc, client, userAgent)
			if err == nil {
				for v := range child {
					found[v] = struct{}{}
				}
			}
		}
		return found, nil
	}
	return found, errors.New("unrecognized sitemap XML")
}
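
// Usage sketch (kept as a comment so this file stays compilable). It assumes a
// caller-supplied *http.Client with a timeout, a hypothetical target URL, and
// a hypothetical user-agent string; none of these are defined by this package.
//
//	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
//	defer cancel()
//	client := &http.Client{Timeout: 15 * time.Second}
//
//	urls, err := sitemap.FetchAll(ctx, "https://example.com", client, "urlcrawler/1.0")
//	if errors.Is(err, sitemap.ErrNotFound) {
//		// No sitemap was discovered; the caller might fall back to link crawling.
//	}
//	for u := range urls {
//		_ = u // each discovered page URL
//	}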