Skip to content

Commit 8a4d9bd

Browse files
committed
Strip html tags from processed markdown
1 parent ea323d0 commit 8a4d9bd

2 files changed

Lines changed: 322 additions & 0 deletions

File tree

src/__tests__/process-markdown.test.ts

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,183 @@ describe('processMarkdown', () => {
274274
assert.strictEqual(result, input);
275275
});
276276

277+
test('converts img tags to markdown images', async () => {
278+
const input = dedent`
279+
Some text.
280+
281+
<img src="/assets/blog/devtools.png" style={{ width: '425px' }} />
282+
283+
More text.
284+
`;
285+
286+
const { content: result } = await processMarkdown(input);
287+
288+
assert.ok(!result.includes('<img'));
289+
assert.ok(result.includes('![](/assets/blog/devtools.png)'));
290+
assert.ok(result.includes('Some text.'));
291+
assert.ok(result.includes('More text.'));
292+
});
293+
294+
test('converts img tags with alt text', async () => {
295+
const input = `<img src="/assets/screenshot.png" alt="App screenshot" />`;
296+
297+
const { content: result } = await processMarkdown(input);
298+
299+
assert.strictEqual(result, '![App screenshot](/assets/screenshot.png)');
300+
});
301+
302+
test('converts video tags to source URLs', async () => {
303+
const input = dedent`
304+
Some text.
305+
306+
<video playsInline autoPlay muted loop style={{ width: '400px', aspectRatio: 4 / 5 }}>
307+
<source src="/assets/icons/sf-symbol.mp4" />
308+
</video>
309+
310+
More text.
311+
`;
312+
313+
const { content: result } = await processMarkdown(input);
314+
315+
assert.ok(!result.includes('<video'));
316+
assert.ok(!result.includes('<source'));
317+
assert.ok(result.includes('/assets/icons/sf-symbol.mp4'));
318+
assert.ok(result.includes('Some text.'));
319+
assert.ok(result.includes('More text.'));
320+
});
321+
322+
test('converts single-line video tags', async () => {
323+
const input = `<video playsInline autoPlay muted loop><source src="/assets/demo.mp4" /></video>`;
324+
325+
const { content: result } = await processMarkdown(input);
326+
327+
assert.strictEqual(result, '/assets/demo.mp4');
328+
});
329+
330+
test('strips device-frame wrapper divs', async () => {
331+
const input = dedent`
332+
<div className="device-frame">
333+
334+
![Header button](/assets/fundamentals/header-button.png)
335+
336+
</div>
337+
`;
338+
339+
const { content: result } = await processMarkdown(input);
340+
341+
assert.ok(!result.includes('<div'));
342+
assert.ok(!result.includes('</div>'));
343+
assert.ok(!result.includes('device-frame'));
344+
assert.ok(
345+
result.includes(
346+
'![Header button](/assets/fundamentals/header-button.png)'
347+
)
348+
);
349+
});
350+
351+
test('strips image-grid wrapper divs with style', async () => {
352+
const input = dedent`
353+
<div className="image-grid" style={{ '--img-width': '360px' }}>
354+
355+
![Screenshot 1](/assets/themes/light-1.png)
356+
![Screenshot 2](/assets/themes/dark-1.png)
357+
358+
</div>
359+
`;
360+
361+
const { content: result } = await processMarkdown(input);
362+
363+
assert.ok(!result.includes('<div'));
364+
assert.ok(!result.includes('</div>'));
365+
assert.ok(result.includes('![Screenshot 1](/assets/themes/light-1.png)'));
366+
assert.ok(result.includes('![Screenshot 2](/assets/themes/dark-1.png)'));
367+
});
368+
369+
test('strips device-frame div wrapping a video', async () => {
370+
const input = dedent`
371+
<div className="device-frame">
372+
<video playsInline autoPlay muted loop>
373+
<source src="/assets/fundamentals/navigate.mp4" />
374+
</video>
375+
</div>
376+
`;
377+
378+
const { content: result } = await processMarkdown(input);
379+
380+
assert.ok(!result.includes('<div'));
381+
assert.ok(!result.includes('<video'));
382+
assert.ok(result.includes('/assets/fundamentals/navigate.mp4'));
383+
});
384+
385+
test('strips nested decorative divs', async () => {
386+
const input = dedent`
387+
<div className="outer">
388+
<div className="inner">
389+
390+
Content inside nested divs.
391+
392+
</div>
393+
</div>
394+
`;
395+
396+
const { content: result } = await processMarkdown(input);
397+
398+
assert.ok(!result.includes('<div'));
399+
assert.ok(!result.includes('</div>'));
400+
assert.ok(result.includes('Content inside nested divs.'));
401+
});
402+
403+
test('strips feature-grid div with video list items', async () => {
404+
const input = dedent`
405+
<div className="feature-grid">
406+
407+
- <video playsInline autoPlay muted loop><source src="/assets/formsheet.mp4" /></video>
408+
409+
[Form sheet](#form-sheets)
410+
411+
- <video playsInline autoPlay muted loop><source src="/assets/search-bar.mp4" /></video>
412+
413+
[Search bar](#search-bar)
414+
415+
</div>
416+
`;
417+
418+
const { content: result } = await processMarkdown(input);
419+
420+
assert.ok(!result.includes('<div'));
421+
assert.ok(!result.includes('<video'));
422+
assert.ok(result.includes('/assets/formsheet.mp4'));
423+
assert.ok(result.includes('[Form sheet](#form-sheets)'));
424+
assert.ok(result.includes('/assets/search-bar.mp4'));
425+
assert.ok(result.includes('[Search bar](#search-bar)'));
426+
});
427+
428+
test('preserves HTML inside code fences', async () => {
429+
const input = dedent`
430+
Some text.
431+
432+
\`\`\`jsx
433+
function App() {
434+
return (
435+
<div className="container">
436+
<img src="/logo.png" alt="Logo" />
437+
<video autoPlay>
438+
<source src="/intro.mp4" />
439+
</video>
440+
</div>
441+
);
442+
}
443+
\`\`\`
444+
`;
445+
446+
const { content: result } = await processMarkdown(input);
447+
448+
assert.ok(result.includes('<div className="container">'));
449+
assert.ok(result.includes('<img src="/logo.png" alt="Logo" />'));
450+
assert.ok(result.includes('<source src="/intro.mp4" />'));
451+
assert.ok(result.includes('</div>'));
452+
});
453+
277454
test('strips frontmatter and returns parsed data', async () => {
278455
const input = dedent`
279456
---

src/plugins/process-markdown.ts

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ export type ProcessedMarkdown = {
2727
* - Converts static2dynamic code fences into static + dynamic sections
2828
* - Strips code fence meta attributes (snack, name, static2dynamic, npm2yarn, etc.)
2929
* - Converts Docusaurus admonitions (:::note, :::warning, etc.) to blockquotes
30+
* - Converts HTML <img> tags to markdown image syntax
31+
* - Converts HTML <video> tags to plain video URLs
32+
* - Strips decorative HTML divs (device-frame, image-grid, feature-grid, etc.)
3033
* - Cleans up extra blank lines
3134
*/
3235
export async function processMarkdown(
@@ -52,6 +55,15 @@ export async function processMarkdown(
5255
// Convert admonitions to blockquotes
5356
result = convertAdmonitions(result);
5457

58+
// Convert HTML media tags and strip decorative divs,
59+
// protecting code fences from modification
60+
result = withProtectedCodeFences(result, (text) => {
61+
text = convertImageTags(text);
62+
text = convertVideoTags(text);
63+
text = stripDecorativeDivs(text);
64+
return text;
65+
});
66+
5567
// Clean up extra blank lines (max 2 consecutive)
5668
result = result.replace(/\n{3,}/g, '\n\n');
5769

@@ -371,6 +383,139 @@ function stripCodeFenceMeta(content: string): string {
371383
);
372384
}
373385

386+
/**
 * Run a transform function on content while protecting fenced code blocks.
 *
 * Every fenced code block is swapped for an opaque placeholder, the transform
 * runs on the remaining text, and the original fences are then restored
 * verbatim.
 *
 * A fence is recognised when it:
 * - opens with three or more backticks or tildes, optionally indented
 *   (e.g. inside a list item);
 * - closes on a line with the same indentation and at least as many of the
 *   same fence character, optionally followed by trailing whitespace.
 *
 * Requiring the closing run to be at least as long as the opening run lets a
 * four-backtick fence safely wrap an example that itself contains a
 * three-backtick fence.
 *
 * @param content - Markdown source that may contain fenced code blocks.
 * @param transform - Function applied to the text outside of code fences.
 * @returns The transformed content with all code fences restored unchanged.
 */
function withProtectedCodeFences(
  content: string,
  transform: (text: string) => string
): string {
  const fences: string[] = [];

  // NUL bytes do not occur in regular markdown, so the placeholder cannot
  // collide with real content or be touched by the HTML-oriented transforms.
  const placeholder = (i: number) => `\x00CODEFENCE${i}\x00`;

  // \1 = indentation, \2 = the full opening fence run, \3 = the fence char.
  // (?!\3) stops the opening run from backtracking to a shorter fence.
  const fencePattern =
    /^([ \t]*)(([`~])\3{2,})(?!\3)[^\n]*\n[\s\S]*?^\1\2\3*[ \t]*$/gm;

  const protectedContent = content.replace(fencePattern, (match: string) => {
    fences.push(match);
    return placeholder(fences.length - 1);
  });

  const transformed = transform(protectedContent);

  return transformed.replace(
    /\x00CODEFENCE(\d+)\x00/g,
    // Fall back to the placeholder text itself if the index is unknown
    // (e.g. the transform fabricated a placeholder-looking string).
    (match: string, index: string) => fences[Number(index)] ?? match
  );
}
410+
411+
/**
 * Convert HTML <img> tags to markdown image syntax.
 *
 * Only the `src` and `alt` attributes are kept; inline styles and any other
 * attributes are dropped. Tags without a quoted, non-empty `src` (for example
 * a JSX expression such as `src={logo}`) are left untouched.
 *
 * Attribute lookup is case-insensitive, matches only whole attribute names
 * (so `data-src` / `data-alt` are ignored), and requires the closing quote to
 * be the same character as the opening quote.
 *
 * @param content - Markdown/MDX text that may contain <img> tags.
 * @returns The text with convertible <img> tags replaced by `![alt](src)`.
 */
function convertImageTags(content: string): string {
  // The name must be preceded by start-of-attributes or whitespace so that
  // suffix matches like `data-src=` are not mistaken for `src=`.
  const srcPattern = /(?:^|\s)src\s*=\s*(["'])([^"']+)\1/i;
  // alt may be empty and may contain the other quote character (alt="it's").
  const altPattern = /(?:^|\s)alt\s*=\s*(["'])(.*?)\1/i;

  return content.replace(
    /<img\s+([^>]*?)\/?>/gi,
    (match: string, attrs: string) => {
      const src = srcPattern.exec(attrs)?.[2];

      if (src === undefined) {
        return match;
      }

      const alt = altPattern.exec(attrs)?.[2] ?? '';

      return `![${alt}](${src})`;
    }
  );
}
430+
431+
/**
 * Convert HTML <video> tags to plain video source URLs.
 *
 * Each <video> element is handled on its own, so an element without a usable
 * source never swallows the text that follows it. The URL is taken from the
 * first nested <source> with a quoted `src` (in any attribute position),
 * falling back to the <video> tag's own `src` attribute. Elements with no
 * quoted `src` at all are left untouched.
 *
 * @param content - Markdown/MDX text that may contain <video> elements.
 * @returns The text with convertible <video> elements replaced by their URL.
 */
function convertVideoTags(content: string): string {
  // Either a self-closing <video … /> or a <video …>…</video> pair; the lazy
  // body stops at the first closing tag so matches cannot span two elements.
  const videoPattern =
    /<video(?=[\s>\/])([^>]*?)(?:\/>|>([\s\S]*?)<\/video\s*>)/gi;
  // `\s` before `src` keeps `data-src` from matching.
  const sourceSrcPattern = /<source\b[^>]*?\ssrc\s*=\s*(["'])([^"']+)\1/i;
  const ownSrcPattern = /(?:^|\s)src\s*=\s*(["'])([^"']+)\1/i;

  return content.replace(
    videoPattern,
    (match: string, attrs: string, inner: string | undefined) => {
      const nestedSrc =
        inner === undefined ? undefined : sourceSrcPattern.exec(inner)?.[2];
      const src = nestedSrc ?? ownSrcPattern.exec(attrs)?.[2];

      return src ?? match;
    }
  );
}
440+
441+
/**
 * Strip decorative HTML divs, keeping their inner content.
 *
 * Divs are removed innermost-first so nesting is handled naturally: once a
 * child div has been unwrapped, its parent becomes innermost. The inner
 * content of every unwrapped div is trimmed. Unbalanced tags (an opening tag
 * with no closing tag, or the reverse) are left in place.
 *
 * @param content - Markdown/MDX text that may contain <div> wrappers.
 * @returns The text with every balanced <div>…</div> pair unwrapped.
 */
function stripDecorativeDivs(content: string): string {
  let result = content;

  // Each successful pass removes one complete <div>…</div> pair and therefore
  // shortens the string, so the loop terminates without an iteration cap.
  // A cap would leave divs behind in documents with many wrappers.
  for (;;) {
    const stripped = stripInnermostDiv(result);

    if (stripped === result) {
      return result;
    }

    result = stripped;
  }
}

/**
 * Find and strip the first innermost <div> (one with no nested divs).
 *
 * Scans div tags in document order: the first closing tag that follows an
 * opening tag pairs with the most recent opening tag, and by construction no
 * other div tag lies between them. Tag matching is case-insensitive and only
 * accepts `div` followed by whitespace or `>`, so bare `<div>` works and
 * look-alikes such as `<divider>` are ignored.
 *
 * @param content - Text to search.
 * @returns The text with that one div unwrapped and its inner content
 *   trimmed, or the original content if no strippable div is found.
 */
function stripInnermostDiv(content: string): string {
  const divTagPattern = /<(\/?)div(?:\s[^>]*)?>/gi;
  let lastOpen: { start: number; end: number } | null = null;
  let tag: RegExpExecArray | null;

  while ((tag = divTagPattern.exec(content)) !== null) {
    const tagStart = tag.index;
    const tagEnd = tagStart + tag[0].length;
    const isClosing = tag[1] === '/';

    if (!isClosing) {
      lastOpen = { start: tagStart, end: tagEnd };
      continue;
    }

    if (lastOpen === null) {
      // Stray closing tag with no opening tag before it: leave it alone.
      continue;
    }

    const innerContent = content.slice(lastOpen.end, tagStart).trim();

    return content.slice(0, lastOpen.start) + innerContent + content.slice(tagEnd);
  }

  return content;
}
518+
374519
/**
375520
* Convert Docusaurus admonitions (:::note, :::warning, etc.) to blockquotes.
376521
*

0 commit comments

Comments
 (0)