gosint-sitecrawl/internal/report/report.go

package report

import (
	"fmt"
	"io"
	"sort"

	"urlcrawler/internal/linkcheck"
)
// Report is the full result of a crawl run: the URLs visited, link check
// outcomes, sitemap comparisons, and metadata about the run itself.
type Report struct {
	Target              string                 `json:"target"`
	CrawledURLs         []string               `json:"crawledUrls"`
	SitemapURLs         []string               `json:"sitemapUrls,omitempty"`
	CrawlErrors         map[string]string      `json:"crawlErrors,omitempty"`
	LinkStatuses        []linkcheck.LinkStatus `json:"linkStatuses"`
	PageOutlinks        map[string][]string    `json:"pageOutlinks"`
	LinkSources         map[string][]string    `json:"linkSources"`
	MissingInSitemap    []string               `json:"missingInSitemap,omitempty"`
	InSitemapNotCrawled []string               `json:"inSitemapNotCrawled,omitempty"`
	Metadata            Metadata               `json:"metadata"`
	Params              Params                 `json:"params"`
	Stats               Stats                  `json:"stats"`
}
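
// A marshaled Report serializes under the json tags above; abbreviated and
// with illustrative values, it looks roughly like:
//
//	{
//	  "target": "https://example.com",
//	  "crawledUrls": ["https://example.com/"],
//	  "linkStatuses": [...],
//	  "stats": {"ok": 1, "broken": 0, ...}
//	}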

// Metadata records when the run started and finished.
type Metadata struct {
	StartedAt  string `json:"startedAt"`  // RFC3339
	FinishedAt string `json:"finishedAt"` // RFC3339
	DurationMs int64  `json:"durationMs"`
}

// Params echoes the crawl settings the run was invoked with.
type Params struct {
	MaxDepth     int    `json:"maxDepth"`
	Concurrency  int    `json:"concurrency"`
	TimeoutMs    int64  `json:"timeoutMs"`
	UserAgent    string `json:"userAgent"`
	SameHostOnly bool   `json:"sameHostOnly"`
}

// Stats is a histogram of link check outcomes by HTTP status class.
type Stats struct {
	OK          int `json:"ok"`
	Broken      int `json:"broken"`
	Status2xx   int `json:"status2xx"`
	Status3xx   int `json:"status3xx"`
	Status4xx   int `json:"status4xx"`
	Status5xx   int `json:"status5xx"`
	StatusOther int `json:"statusOther"`
}
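
// OK and Broken count linkcheck's overall verdict, while the StatusNxx fields
// bucket by HTTP status class; as Build below shows, any code outside 200-599
// (such as the zero value a transport failure would leave) lands in StatusOther.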

// Build assembles a Report from the raw crawl artifacts: the sets of crawled
// and sitemap URLs, per-page crawl errors, link check results, and the
// outlink graph.
func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params) Report {
	crawledList := sanitizeURLs(keys(crawled))
	sitemapList := sanitizeURLs(keys(sitemap))

	// Flatten error values into strings so the report is JSON-serializable.
	crawlErrMap := make(map[string]string, len(crawlErrs))
	for k, v := range crawlErrs {
		crawlErrMap[k] = v.Error()
	}

	// Compare the crawled set against the sitemap in both directions.
	missing := difference(crawled, sitemap)
	missingList := sanitizeURLs(keys(missing))
	inSmNotCrawled := difference(sitemap, crawled)
	inSmNotCrawledList := sanitizeURLs(keys(inSmNotCrawled))

	// Build the forward (page -> outlinks) and reverse (link -> pages) maps.
	pageOut := make(map[string][]string, len(outlinks))
	linkSrc := make(map[string][]string)
	for page, set := range outlinks {
		lst := sanitizeURLs(keys(set))
		pageOut[page] = lst
		for _, u := range lst {
			linkSrc[u] = append(linkSrc[u], page)
		}
	}

	// Compute a simple status histogram.
	var st Stats
	for _, ls := range check.Statuses {
		if ls.OK {
			st.OK++
		} else {
			st.Broken++
		}
		switch {
		case ls.StatusCode >= 200 && ls.StatusCode < 300:
			st.Status2xx++
		case ls.StatusCode >= 300 && ls.StatusCode < 400:
			st.Status3xx++
		case ls.StatusCode >= 400 && ls.StatusCode < 500:
			st.Status4xx++
		case ls.StatusCode >= 500 && ls.StatusCode < 600:
			st.Status5xx++
		default:
			st.StatusOther++
		}
	}

	return Report{
		Target:              target,
		CrawledURLs:         crawledList,
		SitemapURLs:         sitemapList,
		CrawlErrors:         crawlErrMap,
		LinkStatuses:        check.Statuses,
		PageOutlinks:        pageOut,
		LinkSources:         linkSrc,
		MissingInSitemap:    missingList,
		InSitemapNotCrawled: inSmNotCrawledList,
		Metadata:            meta,
		Params:              params,
		Stats:               st,
	}
}
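
// Usage sketch (illustrative): the Statuses, OK, and StatusCode fields follow
// their use in Build above; the inputs are otherwise hypothetical.
//
//	crawled := map[string]struct{}{"https://example.com/": {}}
//	sitemap := map[string]struct{}{
//		"https://example.com/":      {},
//		"https://example.com/about": {},
//	}
//	check := linkcheck.Results{Statuses: []linkcheck.LinkStatus{
//		{OK: true, StatusCode: 200},
//	}}
//	outlinks := map[string]map[string]struct{}{
//		"https://example.com/": {"https://example.com/about": {}},
//	}
//	r := Build("https://example.com", crawled, sitemap, nil, check, outlinks, Metadata{}, Params{})
//	PrintText(os.Stdout, r) // Crawled URLs: 1, In sitemap not crawled: 1, ...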

// PrintText writes a short human-readable summary. Text output is kept
// concise on purpose; the full detail lives in the JSON form of the Report.
func PrintText(w io.Writer, r Report) {
	fmt.Fprintf(w, "Target: %s\n\n", r.Target)
	fmt.Fprintf(w, "Crawled URLs: %d\n", len(r.CrawledURLs))
	fmt.Fprintf(w, "Sitemap URLs: %d\n", len(r.SitemapURLs))
	fmt.Fprintf(w, "Links checked: %d\n", len(r.LinkStatuses))
	fmt.Fprintf(w, "Missing in sitemap: %d\n", len(r.MissingInSitemap))
	fmt.Fprintf(w, "In sitemap not crawled: %d\n\n", len(r.InSitemapNotCrawled))
}

// keys returns the keys of a set as a sorted slice.
func keys[T comparable](m map[T]struct{}) []T {
	res := make([]T, 0, len(m))
	for k := range m {
		res = append(res, k)
	}
	sort.Slice(res, func(i, j int) bool { return asString(res[i]) < asString(res[j]) })
	return res
}

// asString renders a value for sorting, avoiding an fmt.Sprintf round trip
// in the common string case.
func asString[T any](v T) string {
	switch x := any(v).(type) {
	case string:
		return x
	default:
		return fmt.Sprintf("%v", v)
	}
}

// difference returns the set of keys present in a but not in b.
func difference(a, b map[string]struct{}) map[string]struct{} {
	res := make(map[string]struct{})
	for k := range a {
		if _, ok := b[k]; !ok {
			res[k] = struct{}{}
		}
	}
	return res
}

// sanitizeURLs normalizes small variants like a trailing "/." to "/" for
// consistency, and returns the result sorted.
func sanitizeURLs(urls []string) []string {
	out := make([]string, 0, len(urls))
	for _, u := range urls {
		out = append(out, sanitizeURL(u))
	}
	sort.Strings(out)
	return out
}

// sanitizeURL strips the trailing "." from a URL ending in "/.", leaving the
// trailing slash in place.
func sanitizeURL(u string) string {
	if len(u) >= 2 && u[len(u)-2:] == "/." {
		return u[:len(u)-1]
	}
	return u
}
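
// For example (illustrative; a real check would live in report_test.go):
//
//	sanitizeURL("https://example.com/.") // -> "https://example.com/"
//	sanitizeURL("https://example.com/a") // unchanged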