Skip to content

Commit e1bb63e

Browse files
heiskr and Copilot authored
Drop Cheerio from renderProp and fastTextOnly (#60547)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 0ec7a69 commit e1bb63e

5 files changed

Lines changed: 121 additions & 9 deletions

File tree

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import { describe, expect, it } from 'vitest'
2+
3+
import { fastTextOnly } from '@/content-render/unified/text-only'
4+
5+
// Unit tests for `fastTextOnly`: each case feeds an HTML snippet and checks
// the plain-text result (tags removed, entities decoded, edges trimmed).
describe('fastTextOnly', () => {
  it('returns empty string for falsy input', () => {
    const result = fastTextOnly('')
    expect(result).toBe('')
  })

  it('strips a simple <p> wrapper and decodes entities', () => {
    const result = fastTextOnly('<p>Foo &amp; bar</p>')
    expect(result).toBe('Foo & bar')
  })

  it('strips nested tags', () => {
    const input = 'A <a href="#">link</a> and <code>code</code>'
    expect(fastTextOnly(input)).toBe('A link and code')
  })

  it('handles multiple nested elements', () => {
    const input = '<p>text with <code>code</code> and <em>emphasis</em></p>'
    expect(fastTextOnly(input)).toBe('text with code and emphasis')
  })

  it('decodes HTML entities', () => {
    // Escaped markup must come back as literal text, not be stripped as tags.
    const input = '<p>&lt;script&gt;alert(1)&lt;/script&gt;</p>'
    expect(fastTextOnly(input)).toBe('<script>alert(1)</script>')
  })

  it('trims whitespace', () => {
    const result = fastTextOnly('<p> hello </p>')
    expect(result).toBe('hello')
  })

  it('handles self-closing tags', () => {
    const result = fastTextOnly('before<br/>after')
    expect(result).toBe('beforeafter')
  })
})
Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,23 @@
1-
import { load } from 'cheerio'
21
import { decode } from 'html-entities'
32

3+
// Strip all HTML tags, leaving only text content.
4+
// Handles nested tags like `<p>text with <code>code</code></p>`.
5+
const TAG_RE = /<[^>]+>/g
6+
47
/**
 * Given a piece of HTML, return it without HTML. E.g.
 * `<p>Foo &amp; bar</p>` becomes `Foo & bar`
 * and `A <a href="">link</a> and <code>code</code>` becomes `A link and code`.
 *
 * This operates on trusted rendered HTML from our own render pipeline,
 * not user-supplied input. The output is used for plain-text display only
 * (mini-TOC items, search descriptions, etc.).
 *
 * @param html - Rendered HTML fragment; a falsy value yields `''`.
 * @returns The text content with tags removed, entities decoded, and
 *   surrounding whitespace trimmed.
 */
export function fastTextOnly(html: string): string {
  if (!html) return ''
  // Fast path: simple `<p>text</p>` with no inner tags, so there is nothing
  // to strip and we can skip the regex pass entirely.
  if (html.startsWith('<p>') && html.endsWith('</p>')) {
    const middle = html.slice(3, -4)
    // Decode first, then trim, so whitespace that only appears after entity
    // decoding (e.g. `&#32;` or `&nbsp;`) is trimmed as well.
    if (!middle.includes('<')) return decode(middle).trim()
  }
  // Strip all tags, decode entities, then trim the decoded text.
  return decode(html.replace(TAG_RE, '')).trim() // lgtm[js/incomplete-multi-character-sanitization]
}

src/frame/lib/page.ts

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import assert from 'assert'
22
import path from 'path'
33
import fs from 'fs/promises'
4-
import { load } from 'cheerio'
4+
import { stripOuterTag } from '@/frame/lib/strip-outer-tag'
55
import getApplicableVersions from '@/versions/lib/get-applicable-versions'
66
import generateRedirectsForPermalinks from '@/redirects/lib/permalinks'
77
import getEnglishHeadings from '@/languages/lib/get-english-headings'
@@ -440,8 +440,7 @@ class Page {
440440
if (!opts.unwrap) return html
441441

442442
// The unwrap option removes surrounding tags from a string, preserving any inner HTML
443-
const $ = load(html, { xmlMode: true })
444-
return $.root().contents().html() || ''
443+
return stripOuterTag(html)
445444
}
446445

447446
// infer current page's corresponding homepage

src/frame/lib/strip-outer-tag.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/**
 * Remove the outermost tag from an HTML string, preserving inner content.
 * E.g. `<p>Hello <strong>world</strong></p>` → `Hello <strong>world</strong>`
 *
 * Only unwraps when the string is exactly one element with matching
 * open/close tags. Returns the original string unchanged for multiple
 * top-level elements or malformed HTML.
 *
 * @param html - HTML fragment to unwrap; a falsy value yields `''`.
 * @returns The inner HTML of the single wrapping element, or `html` itself
 *   when it cannot be safely unwrapped.
 */
export function stripOuterTag(html: string): string {
  if (!html) return ''

  // Extract opening tag and tag name. The lookahead requires the name to be
  // followed by whitespace, `/` or `>`, so `<p-x>` is not mistaken for `<p>`.
  const openMatch = html.match(/^<([a-z][a-z0-9]*)(?=[\s/>])[^>]*>/i)
  if (!openMatch) return html

  const openTag = openMatch[0]
  const tagName = openMatch[1]
  const closeTag = `</${tagName}>`

  // The opening and closing tags must not overlap. Without this guard an
  // input like `<a x="</a>` (where the "opening tag" swallows the close tag)
  // would be sliced with start > end and silently collapse to ''.
  if (openTag.length + closeTag.length > html.length) return html

  // Must end with matching close tag (tag names are case-insensitive)
  if (html.slice(-closeTag.length).toLowerCase() !== closeTag.toLowerCase()) return html

  // Verify single top-level element by checking that same-name tags
  // in the inner content are balanced. If depth goes negative, there
  // are sibling elements (e.g. `<p>a</p><p>b</p>`).
  const inner = html.slice(openTag.length, html.length - closeTag.length)
  // tagName only contains [a-z0-9], so it is safe to interpolate unescaped.
  const tagRe = new RegExp(`<(/?)${tagName}(?=[\\s/>])[^>]*>`, 'gi')
  let depth = 0
  let m: RegExpExecArray | null
  while ((m = tagRe.exec(inner)) !== null) {
    depth += m[1] === '/' ? -1 : 1
    if (depth < 0) return html
  }
  // A positive depth means an unclosed same-name tag inside: malformed.
  if (depth !== 0) return html

  return inner
}

src/frame/tests/strip-outer-tag.ts

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import { describe, expect, it } from 'vitest'
2+
3+
import { stripOuterTag } from '@/frame/lib/strip-outer-tag'
4+
5+
// Unit tests for `stripOuterTag`: a single wrapping element is unwrapped,
// anything else (siblings, plain text, malformed markup) is returned as-is.
describe('stripOuterTag', () => {
  it('returns empty string for falsy input', () => {
    const result = stripOuterTag('')
    expect(result).toBe('')
  })

  it('removes a <p> wrapper preserving inner HTML', () => {
    const input = '<p>Hello <strong>world</strong></p>'
    expect(stripOuterTag(input)).toBe('Hello <strong>world</strong>')
  })

  it('removes a <div> wrapper', () => {
    const result = stripOuterTag('<div class="foo">content</div>')
    expect(result).toBe('content')
  })

  it('returns original string for multiple top-level elements', () => {
    const input = '<p>a</p><p>b</p>'
    expect(stripOuterTag(input)).toBe('<p>a</p><p>b</p>')
  })

  it('returns original string for plain text with no tags', () => {
    const input = 'just text'
    expect(stripOuterTag(input)).toBe('just text')
  })

  it('returns original string for malformed HTML', () => {
    const input = '<p>unclosed'
    expect(stripOuterTag(input)).toBe('<p>unclosed')
  })

  it('returns original string when open and close tags do not match', () => {
    const input = '<p>content</div>'
    expect(stripOuterTag(input)).toBe('<p>content</div>')
  })

  it('handles nested same-name tags', () => {
    // Only the outermost <div> pair is removed; the inner pair stays intact.
    const input = '<div><div>inner</div></div>'
    expect(stripOuterTag(input)).toBe('<div>inner</div>')
  })
})

0 commit comments

Comments
 (0)