/** Largest document we read client-side for a knowledge base. */
export const MAX_DOCUMENT_BYTES = 5 * 1024 * 1024;

/** File picker accept string for the supported document types. */
export const DOCUMENT_ACCEPT = '.txt,.csv,.docx';

/**
 * Outcome of {@link extractDocumentText}: ``text`` is set on success,
 * otherwise ``error`` carries a user-facing message.
 */
export interface DocumentTextResult {
  text?: string;
  error?: string;
}

/**
 * Cut ``text`` to at most ``maxChars`` UTF-16 code units without leaving a
 * dangling half of a surrogate pair at the end.
 *
 * @param text {string} The text to shorten.
 * @param maxChars {number} Upper bound on the result length; negative values
 *   are treated as 0.
 * @returns {string} The shortened text, never longer than ``maxChars``.
 */
const truncateAtCodePointBoundary = (text: string, maxChars: number): string => {
  // slice() would interpret a negative end as "count from the end of the
  // string", which is the opposite of a cap - clamp it to zero instead.
  const clipped = text.slice(0, Math.max(0, maxChars));
  if (clipped.length === text.length) {
    return clipped;
  }

  // If the cut landed between a high and a low surrogate, drop the orphaned
  // high surrogate so the result stays well-formed (and still within the cap).
  const lastKept = clipped.charCodeAt(clipped.length - 1);
  const firstDropped = text.charCodeAt(clipped.length);
  const splitsPair =
    lastKept >= 0xd800 &&
    lastKept <= 0xdbff &&
    firstDropped >= 0xdc00 &&
    firstDropped <= 0xdfff;
  return splitsPair ? clipped.slice(0, -1) : clipped;
};

/**
 * Read a .txt / .csv / .docx file into plain text, truncated to ``maxChars``.
 * Plain text and CSV are decoded directly; .docx is parsed in-browser with
 * mammoth (loaded on demand). Other types and oversized files return a
 * user-facing error instead of throwing, as does any failure while reading
 * or parsing the file.
 *
 * The extracted text is trimmed before truncation; a file that is empty
 * after trimming is reported as an error. Truncation never splits a
 * surrogate pair, so the result may be one code unit shorter than
 * ``maxChars``.
 *
 * @param file {File} The selected file.
 * @param maxChars {number} Hard cap on the returned text length.
 * @returns {Promise<DocumentTextResult>} The extracted text or an error message.
 */
export const extractDocumentText = async (
  file: File,
  maxChars: number
): Promise<DocumentTextResult> => {
  // The message below spells out the limit; keep it in sync with
  // MAX_DOCUMENT_BYTES if that constant changes.
  if (file.size > MAX_DOCUMENT_BYTES) {
    return { error: 'That file is over 5 MB. Upload a smaller export.' };
  }

  // Extension checks are case-insensitive.
  const name = file.name.toLowerCase();

  try {
    let text = '';
    if (name.endsWith('.docx')) {
      // Loaded lazily so mammoth is only fetched when a .docx is picked.
      const mammoth = (await import('mammoth')).default;
      const arrayBuffer = await file.arrayBuffer();
      const result = await mammoth.extractRawText({ arrayBuffer });
      text = result.value;
    } else if (
      name.endsWith('.txt') ||
      name.endsWith('.csv') ||
      // Fall back to the MIME type for text files with another extension.
      file.type === 'text/plain' ||
      file.type === 'text/csv'
    ) {
      text = await file.text();
    } else {
      return { error: 'Upload a .txt, .csv or .docx file.' };
    }

    const trimmed = text.trim();
    if (!trimmed) {
      return { error: 'That file looks empty - there was no text to read.' };
    }
    return { text: truncateAtCodePointBoundary(trimmed, maxChars) };
  } catch {
    // Deliberate best-effort: any read/parse failure becomes a friendly
    // message rather than an exception the caller has to handle.
    return { error: 'Could not read that file. Paste the text instead.' };
  }
};