/**
 * External dependencies
 */
import { uniq } from 'lodash';
import type { AiContentMetrics, AiContentPiece, Maybe } from '@nab/types';

const BUTTON_LABEL_MAX_LENGTH = 80;
const PARAGRAPH_MAX_LENGTH = 200;

/**
 * Parses an HTML string and counts forms, buttons, media, internal links
 * and words in it.
 *
 * @param homeUrl    URL of the site's home; links containing it are counted
 *                   as internal.
 * @param htmlString HTML markup to analyze.
 *
 * @returns the metrics, or `undefined` when `htmlString` is empty/missing or
 *          anything throws while parsing or measuring.
 */
export function getContentMetrics(
	homeUrl: string,
	htmlString?: string
): Maybe< AiContentMetrics > {
	if ( ! htmlString ) {
		return undefined;
	}

	try {
		const parser = new DOMParser();
		const doc = parser.parseFromString( htmlString, 'text/html' );

		const forms = doc.querySelectorAll( 'form' );

		// Only buttons that yield a non-empty label are kept, so `buttonCount`
		// is the number of labelled buttons.
		// NOTE(review): `input[type="submit"]` has no child text; whether it is
		// ever counted depends on how `textContent` derives a label — confirm.
		const buttons = [
			...doc.querySelectorAll(
				'button, input[type="submit"], a[class*="button"], a[class*="cta"]'
			),
		]
			.map( ( b ) => textContent( b, BUTTON_LABEL_MAX_LENGTH ) )
			.filter( ( t ) => !! t );

		// `innerText` may be missing in DOM implementations that do not
		// render; fall back to `textContent` rather than throwing.
		const bodyText = doc.body.innerText || doc.body.textContent || '';

		// Drop empty tokens so blank text counts as 0 words and leading or
		// trailing whitespace does not inflate the count.
		const words = bodyText.split( /\s+/ ).filter( Boolean );

		// Count media
		const images = doc.querySelectorAll( 'img' );
		const videos = doc.querySelectorAll(
			'video, iframe[src*="youtube"], iframe[src*="vimeo"]'
		);

		// Internal links
		const isInternalHref = ( href: string ): boolean => {
			if ( href.startsWith( '//' ) ) {
				// Protocol-relative URLs carry their own host: internal only
				// when they point at the home URL's host.
				const schemeless = homeUrl.replace( /^[a-z][a-z0-9+.-]*:/i, '' );
				return (
					schemeless.startsWith( '//' ) && href.startsWith( schemeless )
				);
			}
			if ( href.startsWith( '/' ) ) {
				return true;
			}
			// An empty home URL would otherwise match every href.
			return !! homeUrl && href.includes( homeUrl );
		};
		const internalLinks = Array.from( doc.querySelectorAll( 'a' ) ).filter(
			( link ) => isInternalHref( link.getAttribute( 'href' ) || '' )
		);

		return {
			formCount: forms.length,
			buttonCount: buttons.length,
			buttonLabels: uniq( buttons ),
			imageCount: images.length,
			videoCount: videos.length,
			internalLinkCount: internalLinks.length,
			wordCount: words.length,
		};
	} catch ( _ ) {
		return undefined;
	}
}

/**
 * Fetches the page at `url` and returns a flat list of content pieces
 * (headings, paragraphs, buttons, links, images, forms, inputs) describing it.
 *
 * @param url address of the page to summarize.
 *
 * @returns the content pieces; an empty list when the page cannot be
 *          retrieved or anything throws while summarizing.
 */
export async function getContentSummary(
	url: string
): Promise< ReadonlyArray< AiContentPiece > > {
	try {
		const html = await getHtml( url );
		if ( ! html ) {
			return [];
		}

		const parser = new DOMParser();
		const doc = parser.parseFromString( html, 'text/html' );
		const main =
			doc.querySelector( 'main, article, #content, .site-main' ) ||
			doc.body;

		// Remove nav/aside/footer _inside main_ before traversing.
		[ 'nav', 'aside', 'footer', '.sidebar', '.navigation' ].forEach(
			( sel ) => {
				main.querySelectorAll( sel ).forEach( ( el ) => el.remove() );
			}
		);

		// Find and summarize h1s outside main
		const h1sOutsideMain = findH1sOutsideMain( doc, main );

		// Summarize main content area
		const mainContent = summarizeDomNode( main, [], doc );

		// Combine: h1s found outside main first, then the main content.
		// No limit is applied to the total length here.
		return [ ...h1sOutsideMain, ...mainContent ];
	} catch ( _ ) {
		return [];
	}
}

// =======
// HELPERS
// =======

/**
 * Downloads the body of `url` as text.
 *
 * @param url address to fetch.
 *
 * @returns the response text, or an empty string when the request fails,
 *          the response status is not successful, or the body is empty.
 */
async function getHtml( url: string ): Promise< string > {
	try {
		const response = await fetch( url );
		if ( ! response.ok ) {
			// Do not summarize HTTP error pages as if they were content.
			return '';
		}
		const html = await response.text();
		return html || '';
	} catch ( _ ) {
		return '';
	}
}

/**
 * Walks `node` depth-first and appends a content piece to `pieces` for each
 * heading, paragraph/list item, CTA, link, image, form and input it finds.
 * Elements that produce a piece are not descended into, except forms.
 *
 * @param node   node to summarize.
 * @param pieces accumulator; it is mutated and also returned.
 * @param doc    document used to resolve `<label for>` of inputs.
 *
 * @returns the same `pieces` array that was passed in.
 */
function summarizeDomNode(
	node: Node,
	pieces: AiContentPiece[] = [],
	doc?: Document
): AiContentPiece[] {
	if ( node.nodeType === node.TEXT_NODE ) {
		const text = textContent( node );
		if ( text && text.length > 30 ) {
			// Only capture long, visible text nodes
			pieces.push( { type: 'paragraph', text } );
		}
		return pieces;
	}

	if ( node.nodeType !== node.ELEMENT_NODE ) {
		return pieces;
	}

	const el = node as HTMLElement;
	const tag = el.tagName.toLowerCase();

	// Prune unwanted sections entirely (children are not visited).
	if (
		[ 'nav', 'aside', 'footer', 'script', 'style', 'noscript' ].includes(
			tag
		)
	) {
		return pieces;
	}

	// Headings
	if ( /^h[1-6]$/.test( tag ) ) {
		pieces.push( {
			type: 'heading',
			text: textContent( el ),
			level: parseInt( tag.replace( 'h', '' ), 10 ),
		} );
		return pieces;
	}

	// Paragraph and list
	if ( [ 'p', 'li' ].includes( tag ) ) {
		const text = textContent( el, PARAGRAPH_MAX_LENGTH );
		if ( text.length > 0 ) {
			pieces.push( {
				type: 'paragraph',
				text: text.slice( 0, 500 ),
			} );
		}
		return pieces;
	}

	// Call-To-Action Detection
	if ( isCTA( el ) ) {
		pieces.push( {
			type: 'button',
			text: textContent( el ),
		} );
		return pieces;
	}

	// Link (excluding anchor-CTA already included above)
	if ( tag === 'a' ) {
		const linkText = textContent( el );
		if ( linkText ) {
			pieces.push( {
				type: 'link',
				text: linkText,
				href: el.getAttribute( 'href' ) || '',
			} );
			return pieces;
		}
	}

	// Images
	if ( tag === 'img' ) {
		pieces.push( {
			type: 'image',
			alt: el.getAttribute( 'alt' ) || '',
		} );
		return pieces;
	}

	// Forms
	if ( tag === 'form' ) {
		const summary =
			Array.from( el.querySelectorAll( 'legend,label' ) )
				.map( ( lbl ) => textContent( lbl ) )
				.filter( Boolean )
				.join( '; ' ) || 'form';
		pieces.push( { type: 'form', summary } );
		// No return: still want to catch inputs below via recursion.
	}

	// Inputs (inputs, selects, textareas)
	if ( [ 'input', 'textarea', 'select' ].includes( tag ) ) {
		const inputType = el.getAttribute( 'type' );
		if ( inputType === 'hidden' ) {
			// Skip hidden inputs
			return pieces;
		}

		// Try to get label. The `for` attribute is compared directly instead
		// of interpolating the id into a selector, which would throw for ids
		// containing quotes or other selector-breaking characters.
		let label = '';
		if ( el.id && doc ) {
			const labelEl = Array.from( doc.querySelectorAll( 'label' ) ).find(
				( candidate ) => candidate.getAttribute( 'for' ) === el.id
			);
			if ( labelEl ) {
				label = textContent( labelEl );
			}
		}

		pieces.push( {
			type: 'input',
			label: label || el.getAttribute( 'name' ) || tag,
			inputType: inputType || tag,
		} );
		return pieces;
	}

	// Recurse
	Array.from( node.childNodes ).forEach( ( child ) =>
		summarizeDomNode( child, pieces, doc )
	);
	return pieces;
}

/**
 * Collects every `h1` of `document` that is not contained in `main`.
 *
 * @param document document to search.
 * @param main     element regarded as the main content area.
 *
 * @returns one level-1 heading piece per matching `h1`, in document order.
 */
function findH1sOutsideMain(
	document: Document,
	main: Element
): AiContentPiece[] {
	return Array.from( document.querySelectorAll( 'h1' ) )
		.filter( ( h1 ) => ! main.contains( h1 ) )
		.map(
			( h1 ): AiContentPiece => ( {
				type: 'heading',
				text: textContent( h1 ),
				level: 1,
			} )
		);
}

function isCTA( el: Element ): boolean {
	const tag = el.tagName.toLowerCase();
	// 1.