Improve content extraction with charset handling and flexible selectors · julien.rbrt.fr/glean@be4bec9

+333 -151

2 changed files

Expand all

internal

scraper

scraper.go

scraper_test.go

+324 -142

internal/scraper/scraper.go

··· 8 8 "io" 9 9 "log/slog" 10 10 "net/http" 11 + "slices" 11 12 "strings" 12 13 "time" 13 14 14 15 "pkg.rbrt.fr/glean/internal/httpclient" 15 16 16 17 "golang.org/x/net/html" 18 + "golang.org/x/net/html/charset" 17 19 ) 18 20 19 21 type Scraper struct { ··· 53 55 } 54 56 55 57 func (s *Scraper) scrapeDirect(ctx context.Context, articleURL string) (string, error) { 56 - body, err := s.fetch(ctx, articleURL) 58 + resp, err := s.fetch(ctx, articleURL) 57 59 if err != nil { 58 60 return "", fmt.Errorf("fetching article: %w", err) 59 61 } 60 - return extractContent(body) 62 + return extractContent(resp.Body, resp.ContentType) 61 63 } 62 64 63 65 func (s *Scraper) scrapeArchive(ctx context.Context, articleURL string) (string, error) { 64 - archiveURL := s.archiveURL + articleURL 65 - body, err := s.fetch(ctx, archiveURL) 66 + resp, err := s.fetch(ctx, s.archiveURL+articleURL) 66 67 if err != nil { 67 68 return "", fmt.Errorf("fetching from archive.is: %w", err) 68 69 } 69 - return extractContent(body) 70 + return extractContent(resp.Body, resp.ContentType) 70 71 } 71 72 72 - func (s *Scraper) fetch(ctx context.Context, url string) (io.Reader, error) { 73 - reader, err := s.doFetch(ctx, url) 73 + type fetchResult struct { 74 + Body io.Reader 75 + ContentType string 76 + } 77 + 78 + func (s *Scraper) fetch(ctx context.Context, url string) (*fetchResult, error) { 79 + resp, err := s.doFetch(ctx, url) 74 80 if err == nil { 75 - return reader, nil 81 + return resp, nil 76 82 } 77 83 78 84 var se *httpclient.StatusError ··· 92 98 return s.doFetch(ctx, url) 93 99 } 94 100 95 - func (s *Scraper) doFetch(ctx context.Context, url string) (io.Reader, error) { 101 + func (s *Scraper) doFetch(ctx context.Context, url string) (*fetchResult, error) { 96 102 req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) 97 103 if err != nil { 98 104 return nil, err ··· 112 118 if err != nil { 113 119 return nil, fmt.Errorf("reading body: %w", err) 114 120 } 115 - return bytes.NewReader(data), nil 121 + return &fetchResult{ 122 + Body: bytes.NewReader(data), 123 + ContentType: resp.Header.Get("Content-Type"), 124 + }, nil 116 125 } 117 126 118 127 se := &httpclient.StatusError{StatusCode: resp.StatusCode} ··· 122 131 return nil, se 123 132 } 124 133 125 - func extractContent(r io.Reader) (string, error) { 126 - doc, err := html.Parse(r) 134 + func extractContent(r io.Reader, contentType string) (string, error) { 135 + doc, err := parseHTML(r, contentType) 127 136 if err != nil { 128 137 return "", fmt.Errorf("parsing HTML: %w", err) 129 138 } 130 139 131 140 removeUnwanted(doc) 132 141 133 - if content := findElement(doc, "article"); content != nil { 142 + if content := findArticle(doc); content != nil { 134 143 return renderNode(content), nil 135 144 } 136 145 137 - if content := findElementByRole(doc, "main"); content != nil { 146 + if content := findLargestTextNode(doc); content != nil { 138 147 return renderNode(content), nil 139 148 } 140 149 141 - if content := findElement(doc, "main"); content != nil { 142 - return renderNode(content), nil 150 + return "", nil 151 + } 152 + 153 + func parseHTML(r io.Reader, contentType string) (*html.Node, error) { 154 + ct := contentType 155 + if ct == "" { 156 + ct = "text/html; charset=utf-8" 157 + } else if !strings.Contains(ct, "charset") { 158 + ct += "; charset=utf-8" 159 + } 160 + 161 + decoded, err := charset.NewReader(r, ct) 162 + if err != nil { 163 + return nil, fmt.Errorf("decoding body: %w", err) 164 + } 165 + 166 + return html.Parse(decoded) 167 + } 168 + 169 + func findArticle(doc *html.Node) *html.Node { 170 + for _, strategy := range articleFindStrategies { 171 + if node := strategy(doc); node != nil { 172 + return node 173 + } 143 174 } 175 + return nil 176 + } 144 177 145 - if content := findLargestTextNode(doc); content != nil { 146 - return renderNode(content), nil 178 + var articleFindStrategies = []func(*html.Node) *html.Node{ 179 + func(n *html.Node) *html.Node { return findByTag(n, "article") }, 180 + func(n *html.Node) *html.Node { return findByRole(n, "main") }, 181 + func(n *html.Node) *html.Node { return findByTag(n, "main") }, 182 + func(n *html.Node) *html.Node { return findByAttr(n, "itemprop", "articleBody") }, 183 + func(n *html.Node) *html.Node { return findByContentClass(n) }, 184 + } 185 + 186 + func findByContentClass(root *html.Node) *html.Node { 187 + var best *html.Node 188 + bestLen := 0 189 + 190 + forEachElement(root, func(n *html.Node) { 191 + for _, attr := range n.Attr { 192 + if attr.Key != "class" && attr.Key != "id" { 193 + continue 194 + } 195 + val := strings.ToLower(attr.Val) 196 + if !matchesContentPattern(val) { 197 + continue 198 + } 199 + tl := textLength(n) 200 + if tl > bestLen && tl >= minContentLength { 201 + best = n 202 + bestLen = tl 203 + } 204 + } 205 + }) 206 + return best 207 + } 208 + 209 + var contentPatterns = []string{ 210 + "post-content", "entry-content", "article-content", 211 + "article-body", "article__body", "article__content", 212 + "story-body", "story-content", "content-body", 213 + "post-body", "post-entry", "entry-body", 214 + "blog-content", "blog-post", "wp-content", 215 + "main-content", "page-content", "body-content", 216 + } 217 + 218 + func matchesContentPattern(val string) bool { 219 + for _, p := range contentPatterns { 220 + if strings.Contains(val, p) { 221 + return true 222 + } 147 223 } 224 + return false 225 + } 148 226 149 - return "", nil 227 + var unwantedTags = map[string]bool{ 228 + "script": true, "style": true, "nav": true, "header": true, "footer": true, 229 + "aside": true, "noscript": true, "iframe": true, "form": true, "svg": true, 230 + "button": true, "input": true, "textarea": true, "select": true, 231 + } 232 + 233 + var unwantedRoles = map[string]bool{ 234 + "complementary": true, "banner": true, "contentinfo": true, "navigation": true, 235 + } 236 + 237 + var unwantedClassPatterns = []string{ 238 + "comment", "sidebar", "advertisement", "ad-banner", 239 + "social-share", "share-button", "newsletter", "popup", 240 + "cookie", "paywall", "related-post", "related-article", 241 + "taboola", "outbrain", "disqus", 242 + } 243 + 244 + var unwantedIDPatterns = []string{ 245 + "comment", "sidebar", "footer", "header", "nav", "disqus", 150 246 } 151 247 152 248 func removeUnwanted(n *html.Node) { ··· 167 263 if n.Type != html.ElementNode { 168 264 return false 169 265 } 170 - switch n.Data { 171 - case "script", "style", "nav", "header", "footer", "aside", 172 - "noscript", "iframe", "form", "svg", "button", "input", 173 - "textarea", "select": 266 + if unwantedTags[n.Data] { 174 267 return true 175 268 } 176 269 for _, attr := range n.Attr { 177 - if attr.Key == "class" { 178 - cls := strings.ToLower(attr.Val) 179 - if strings.Contains(cls, "comment") || 180 - strings.Contains(cls, "sidebar") || 181 - strings.Contains(cls, "advertisement") || 182 - strings.Contains(cls, "ad-banner") || 183 - strings.Contains(cls, "social-share") || 184 - strings.Contains(cls, "newsletter") || 185 - strings.Contains(cls, "popup") || 186 - strings.Contains(cls, "cookie") || 187 - strings.Contains(cls, "paywall") { 270 + switch attr.Key { 271 + case "class": 272 + if containsAny(strings.ToLower(attr.Val), unwantedClassPatterns) { 273 + return true 274 + } 275 + case "id": 276 + if containsAny(strings.ToLower(attr.Val), unwantedIDPatterns) { 188 277 return true 189 278 } 190 - } 191 - if attr.Key == "id" { 192 - id := strings.ToLower(attr.Val) 193 - if strings.Contains(id, "comment") || 194 - strings.Contains(id, "sidebar") || 195 - strings.Contains(id, "footer") || 196 - strings.Contains(id, "header") || 197 - strings.Contains(id, "nav") { 279 + case "role": 280 + if unwantedRoles[strings.ToLower(attr.Val)] { 198 281 return true 199 282 } 200 283 } ··· 202 285 return false 203 286 } 204 287 205 - func findElement(n *html.Node, tag string) *html.Node { 288 + func containsAny(s string, patterns []string) bool { 289 + return slices.ContainsFunc(patterns, func(p string) bool { 290 + return strings.Contains(s, p) 291 + }) 292 + } 293 + 294 + const minContentLength = 200 295 + 296 + func findLargestTextNode(root *html.Node) *html.Node { 297 + var best *html.Node 298 + bestLen := 0 299 + 300 + forEachElement(root, func(n *html.Node) { 301 + if n.Data != "div" && n.Data != "section" && n.Data != "td" { 302 + return 303 + } 304 + tl := textLength(n) 305 + if tl > bestLen && tl >= minContentLength { 306 + best = n 307 + bestLen = tl 308 + } 309 + }) 310 + return best 311 + } 312 + 313 + func textLength(n *html.Node) int { 314 + total := 0 315 + forEachText(n, func(t string) { 316 + total += len(strings.TrimSpace(t)) 317 + }) 318 + return total 319 + } 320 + 321 + func forEachElement(n *html.Node, fn func(*html.Node)) { 322 + if n.Type == html.ElementNode { 323 + fn(n) 324 + } 325 + for c := n.FirstChild; c != nil; c = c.NextSibling { 326 + forEachElement(c, fn) 327 + } 328 + } 329 + 330 + func forEachText(n *html.Node, fn func(string)) { 331 + if n.Type == html.TextNode { 332 + fn(n.Data) 333 + return 334 + } 335 + for c := n.FirstChild; c != nil; c = c.NextSibling { 336 + forEachText(c, fn) 337 + } 338 + } 339 + 340 + func findByTag(n *html.Node, tag string) *html.Node { 206 341 if n.Type == html.ElementNode && n.Data == tag { 207 342 return n 208 343 } 209 344 for c := n.FirstChild; c != nil; c = c.NextSibling { 210 - if found := findElement(c, tag); found != nil { 345 + if found := findByTag(c, tag); found != nil { 211 346 return found 212 347 } 213 348 } 214 349 return nil 215 350 } 216 351 217 - func findElementByRole(n *html.Node, role string) *html.Node { 352 + func findByRole(n *html.Node, role string) *html.Node { 353 + return findByAttr(n, "role", role) 354 + } 355 + 356 + func findByAttr(n *html.Node, key, val string) *html.Node { 218 357 if n.Type == html.ElementNode { 219 358 for _, attr := range n.Attr { 220 - if attr.Key == "role" && attr.Val == role { 359 + if attr.Key == key && attr.Val == val { 221 360 return n 222 361 } 223 362 } 224 363 } 225 364 for c := n.FirstChild; c != nil; c = c.NextSibling { 226 - if found := findElementByRole(c, role); found != nil { 365 + if found := findByAttr(c, key, val); found != nil { 227 366 return found 228 367 } 229 368 } 230 369 return nil 231 370 } 232 371 233 - func findLargestTextNode(root *html.Node) *html.Node { 234 - var best *html.Node 235 - bestLen := 0 372 + var voidElements = map[string]bool{ 373 + "br": true, "hr": true, "img": true, "input": true, "meta": true, 374 + "link": true, "area": true, "base": true, "col": true, "embed": true, 375 + "source": true, "track": true, "wbr": true, 376 + } 377 + 378 + func renderNode(n *html.Node) string { 379 + var buf strings.Builder 380 + renderChildren(&buf, n) 381 + return strings.TrimSpace(buf.String()) 382 + } 383 + 384 + func renderChildren(buf *strings.Builder, n *html.Node) { 385 + for c := n.FirstChild; c != nil; c = c.NextSibling { 386 + renderNodeInto(buf, c) 387 + } 388 + } 389 + 390 + func renderNodeInto(buf *strings.Builder, node *html.Node) { 391 + if node.Type == html.TextNode { 392 + buf.WriteString(node.Data) 393 + return 394 + } 395 + if node.Type != html.ElementNode { 396 + return 397 + } 398 + 399 + tag := node.Data 400 + 401 + if tag == "a" { 402 + renderLink(buf, node) 403 + return 404 + } 405 + if tag == "img" { 406 + renderImage(buf, node) 407 + return 408 + } 409 + if tag == "source" && !isMediaSource(node) { 410 + return 411 + } 412 + 413 + writeOpenTag(buf, node, filterTagAttrs(tag)) 414 + renderChildren(buf, node) 415 + if !voidElements[tag] { 416 + buf.WriteString("</") 417 + buf.WriteString(tag) 418 + buf.WriteString(">") 419 + } 420 + } 421 + 422 + func renderLink(buf *strings.Builder, node *html.Node) { 423 + href := getAttr(node, "href") 424 + href = strings.TrimSpace(href) 425 + 426 + if isHTTPURL(href) { 427 + writeOpenTag(buf, node, filterTagAttrs("a")) 428 + renderChildren(buf, node) 429 + buf.WriteString("</a>") 430 + return 431 + } 236 432 237 - var walk func(*html.Node) 238 - walk = func(n *html.Node) { 239 - if n.Type == html.ElementNode { 240 - switch n.Data { 241 - case "div", "section", "td": 242 - textLen := textLength(n) 243 - if textLen > bestLen && textLen > 200 { 244 - best = n 245 - bestLen = textLen 246 - } 247 - } 248 - } 249 - for c := n.FirstChild; c != nil; c = c.NextSibling { 250 - walk(c) 251 - } 433 + renderChildren(buf, node) 434 + } 435 + 436 + func renderImage(buf *strings.Builder, node *html.Node) { 437 + src := resolveImgSrc(node) 438 + if src == "" { 439 + return 252 440 } 253 - walk(root) 254 - return best 441 + 442 + buf.WriteString("<img") 443 + writeFilteredAttrs(buf, node, filterTagAttrs("img")) 444 + buf.WriteString(` src="`) 445 + buf.WriteString(html.EscapeString(src)) 446 + buf.WriteString(`"`) 447 + 448 + if getAttr(node, "alt") == "" { 449 + buf.WriteString(` alt=""`) 450 + } 451 + if !hasDimensions(node) { 452 + buf.WriteString(` loading="lazy"`) 453 + } 454 + buf.WriteString(">") 255 455 } 256 456 257 - func textLength(n *html.Node) int { 258 - total := 0 259 - var walk func(*html.Node) 260 - walk = func(c *html.Node) { 261 - if c.Type == html.TextNode { 262 - total += len(strings.TrimSpace(c.Data)) 263 - } 264 - for child := c.FirstChild; child != nil; child = child.NextSibling { 265 - walk(child) 457 + func resolveImgSrc(node *html.Node) string { 458 + for _, key := range []string{"src", "data-src", "data-lazy-src"} { 459 + v := getAttr(node, key) 460 + if v != "" && isReachableURL(v) { 461 + return v 266 462 } 267 463 } 268 - walk(n) 269 - return total 464 + return "" 465 + } 466 + 467 + func isReachableURL(s string) bool { 468 + return isHTTPURL(s) || strings.HasPrefix(s, "//") 270 469 } 271 470 272 - func renderNode(n *html.Node) string { 273 - var buf strings.Builder 274 - var write func(*html.Node) 275 - write = func(node *html.Node) { 276 - if node.Type == html.TextNode { 277 - buf.WriteString(node.Data) 278 - return 279 - } 280 - if node.Type == html.ElementNode { 281 - if node.Data == "a" && isDeadLink(node) { 282 - for c := node.FirstChild; c != nil; c = c.NextSibling { 283 - write(c) 284 - } 285 - return 286 - } 287 - if node.Data == "img" && isDeadImage(node) { 288 - return 289 - } 290 - buf.WriteString("<") 291 - buf.WriteString(node.Data) 292 - for _, attr := range node.Attr { 293 - buf.WriteString(" ") 294 - buf.WriteString(attr.Key) 295 - buf.WriteString(`="`) 296 - buf.WriteString(html.EscapeString(attr.Val)) 297 - buf.WriteString(`"`) 298 - } 299 - buf.WriteString(">") 300 - } 301 - for c := node.FirstChild; c != nil; c = c.NextSibling { 302 - write(c) 303 - } 304 - if node.Type == html.ElementNode && !isVoidElement(node.Data) { 305 - buf.WriteString("</") 306 - buf.WriteString(node.Data) 307 - buf.WriteString(">") 471 + func isHTTPURL(s string) bool { 472 + return strings.HasPrefix(s, "http://") || strings.HasPrefix(s, "https://") 473 + } 474 + 475 + func filterTagAttrs(tag string) func(string) bool { 476 + if tag == "img" { 477 + return func(key string) bool { 478 + return key == "alt" || key == "width" || key == "height" || key == "class" 308 479 } 309 480 } 310 - for c := n.FirstChild; c != nil; c = c.NextSibling { 311 - write(c) 312 - } 313 - return strings.TrimSpace(buf.String()) 481 + return defaultAttrFilter 314 482 } 315 483 316 - func isVoidElement(tag string) bool { 317 - switch tag { 318 - case "br", "hr", "img", "input", "meta", "link", "area", 319 - "base", "col", "embed", "source", "track", "wbr": 320 - return true 484 + var defaultAttrFilter = func() func(string) bool { 485 + allowed := map[string]bool{ 486 + "href": true, "src": true, "alt": true, "title": true, 487 + "class": true, "id": true, "width": true, "height": true, 488 + "type": true, "controls": true, "preload": true, "poster": true, 489 + "cite": true, "datetime": true, "colspan": true, "rowspan": true, 490 + "loading": true, "decoding": true, "itemprop": true, 491 + "role": true, "aria-label": true, "aria-hidden": true, 321 492 } 322 - return false 493 + return func(key string) bool { return allowed[key] } 494 + }() 495 + 496 + func writeOpenTag(buf *strings.Builder, node *html.Node, allow func(string) bool) { 497 + buf.WriteString("<") 498 + buf.WriteString(node.Data) 499 + writeFilteredAttrs(buf, node, allow) 500 + buf.WriteString(">") 323 501 } 324 502 325 - func isDeadLink(n *html.Node) bool { 326 - for _, attr := range n.Attr { 327 - if attr.Key != "href" { 503 + func writeFilteredAttrs(buf *strings.Builder, node *html.Node, allow func(string) bool) { 504 + for _, attr := range node.Attr { 505 + if !allow(attr.Key) { 328 506 continue 329 507 } 330 - href := strings.TrimSpace(attr.Val) 331 - if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") { 332 - return false 333 - } 334 - return true 508 + buf.WriteString(" ") 509 + buf.WriteString(attr.Key) 510 + buf.WriteString(`="`) 511 + buf.WriteString(html.EscapeString(attr.Val)) 512 + buf.WriteString(`"`) 335 513 } 336 - return true 514 + } 515 + 516 + func hasDimensions(node *html.Node) bool { 517 + w, h := getAttr(node, "width"), getAttr(node, "height") 518 + return w != "" && w != "0" && h != "" && h != "0" 519 + } 520 + 521 + func isMediaSource(node *html.Node) bool { 522 + t := getAttr(node, "type") 523 + return strings.HasPrefix(t, "video/") || strings.HasPrefix(t, "audio/") 337 524 } 338 525 339 - func isDeadImage(n *html.Node) bool { 526 + func getAttr(n *html.Node, key string) string { 340 527 for _, attr := range n.Attr { 341 - if attr.Key != "src" { 342 - continue 343 - } 344 - src := strings.TrimSpace(attr.Val) 345 - if strings.HasPrefix(src, "http://") || strings.HasPrefix(src, "https://") { 346 - return false 528 + if attr.Key == key { 529 + return attr.Val 347 530 } 348 - return true 349 531 } 350 - return true 532 + return "" 351 533 }

+9 -9

internal/scraper/scraper_test.go

··· 17 17 <footer>footer</footer> 18 18 </body></html>` 19 19 20 - content, err := extractContent(strings.NewReader(html)) 20 + content, err := extractContent(strings.NewReader(html), "") 21 21 assert.NilError(t, err) 22 22 assert.Assert(t, strings.Contains(content, "This is the article content")) 23 23 assert.Assert(t, !strings.Contains(content, "navigation")) ··· 30 30 <main><p>Main content area with sufficient text to be considered a proper article body for reading purposes.</p></main> 31 31 </body></html>` 32 32 33 - content, err := extractContent(strings.NewReader(html)) 33 + content, err := extractContent(strings.NewReader(html), "") 34 34 assert.NilError(t, err) 35 35 assert.Assert(t, strings.Contains(content, "Main content area")) 36 36 } ··· 40 40 <div role="main"><p>Content in a role=main div with enough text to be useful for the reader to enjoy.</p></div> 41 41 </body></html>` 42 42 43 - content, err := extractContent(strings.NewReader(html)) 43 + content, err := extractContent(strings.NewReader(html), "") 44 44 assert.NilError(t, err) 45 45 assert.Assert(t, strings.Contains(content, "Content in a role=main div")) 46 46 } ··· 53 53 </div> 54 54 </body></html>` 55 55 56 - content, err := extractContent(strings.NewReader(html)) 56 + content, err := extractContent(strings.NewReader(html), "") 57 57 assert.NilError(t, err) 58 58 assert.Assert(t, strings.Contains(content, "Lorem ipsum")) 59 59 } ··· 67 67 </article> 68 68 </body></html>` 69 69 70 - content, err := extractContent(strings.NewReader(html)) 70 + content, err := extractContent(strings.NewReader(html), "") 71 71 assert.NilError(t, err) 72 72 assert.Assert(t, strings.Contains(content, "Good content")) 73 73 assert.Assert(t, !strings.Contains(content, "alert")) ··· 77 77 func TestExtractContent_EmptyBody(t *testing.T) { 78 78 html := `<!DOCTYPE html><html><body></body></html>` 79 79 80 - content, err := extractContent(strings.NewReader(html)) 80 + content, err := extractContent(strings.NewReader(html), "") 81 81 assert.NilError(t, err) 82 82 assert.Equal(t, content, "") 83 83 } ··· 136 136 </article> 137 137 </body></html>` 138 138 139 - content, err := extractContent(strings.NewReader(html)) 139 + content, err := extractContent(strings.NewReader(html), "") 140 140 assert.NilError(t, err) 141 141 assert.Assert(t, strings.Contains(content, "<br>")) 142 142 assert.Assert(t, strings.Contains(content, "<img")) ··· 150 150 </article> 151 151 </body></html>` 152 152 153 - content, err := extractContent(strings.NewReader(html)) 153 + content, err := extractContent(strings.NewReader(html), "") 154 154 assert.NilError(t, err) 155 155 assert.Assert(t, strings.Contains(content, "home")) 156 156 assert.Assert(t, strings.Contains(content, "about")) ··· 176 176 </article> 177 177 </body></html>` 178 178 179 - content, err := extractContent(strings.NewReader(html)) 179 + content, err := extractContent(strings.NewReader(html), "") 180 180 assert.NilError(t, err) 181 181 assert.Assert(t, !strings.Contains(content, `/images/photo.jpg`)) 182 182 assert.Assert(t, !strings.Contains(content, `data:image`))

Configure Feed

Configure Feed