feat: Add support for DOCX and HTML file formats using docling

devin-ai-integration[bot] · PromtEngineer · devin-ai-integration[bot] · commit d5929ce29b40 · 2025-07-21T20:40:39.000Z
- Rename PDFConverter to DocumentConverter with multi-format support
- Add SUPPORTED_FORMATS mapping for PDF, DOCX, HTML, HTM extensions
- Update indexing pipeline to use DocumentConverter
- Update file validation across all frontend components and scripts
- Preserve existing PDF OCR detection logic
- Add format-specific conversion methods for different document types

Co-Authored-By: PromptEngineer <jnfarooq@outlook.com>
diff --git a/create_index_script.py b/create_index_script.py
@@ -101,7 +101,7 @@ def select_documents(self) -> List[str]:
             elif choice == "2":
                 dir_path = self.get_user_input("Enter directory path")
                 if os.path.isdir(dir_path):
-                    supported_extensions = ['.pdf', '.txt', '.docx', '.md']
+                    supported_extensions = ['.pdf', '.txt', '.docx', '.md', '.html', '.htm']
                     found_docs = []
                     
                     for ext in supported_extensions:
@@ -369,4 +369,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main() 
+    main()  
diff --git a/demo_batch_indexing.py b/demo_batch_indexing.py
@@ -113,7 +113,7 @@ def validate_documents(self, documents: List[str]) -> List[str]:
             if os.path.exists(doc_path):
                 # Check file extension
                 ext = Path(doc_path).suffix.lower()
-                if ext in ['.pdf', '.txt', '.docx', '.md']:
+                if ext in ['.pdf', '.txt', '.docx', '.md', '.html', '.htm']:
                     valid_documents.append(doc_path)
                     print(f"  ✅ {doc_path}")
                 else:
@@ -383,4 +383,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main() 
+    main()  
diff --git a/rag_system/ingestion/document_converter.py b/rag_system/ingestion/document_converter.py
@@ -0,0 +1,114 @@
+from typing import List, Tuple, Dict, Any
+from docling.document_converter import DocumentConverter as DoclingConverter, PdfFormatOption
+from docling.datamodel.pipeline_options import PdfPipelineOptions, OcrMacOptions
+from docling.datamodel.base_models import InputFormat
+import fitz  # PyMuPDF for quick text inspection
+import os
+
+class DocumentConverter:
+    """
+    A class to convert various document formats to structured Markdown using the docling library.
+    Supports PDF, DOCX, HTML, and other formats.
+    """
+    
+    # Mapping of file extensions to InputFormat
+    SUPPORTED_FORMATS = {
+        '.pdf': InputFormat.PDF,
+        '.docx': InputFormat.DOCX,
+        '.html': InputFormat.HTML,
+        '.htm': InputFormat.HTML,
+    }
+    
+    def __init__(self):
+        """Initializes the docling document converter with forced OCR enabled for macOS."""
+        try:
+            # --- Converter WITHOUT OCR (fast path) ---
+            pipeline_no_ocr = PdfPipelineOptions()
+            pipeline_no_ocr.do_ocr = False
+            format_no_ocr = {
+                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_no_ocr)
+            }
+            self.converter_no_ocr = DoclingConverter(format_options=format_no_ocr)
+
+            # --- Converter WITH OCR (fallback) ---
+            pipeline_ocr = PdfPipelineOptions()
+            pipeline_ocr.do_ocr = True
+            ocr_options = OcrMacOptions(force_full_page_ocr=True)
+            pipeline_ocr.ocr_options = ocr_options
+            format_ocr = {
+                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_ocr)
+            }
+            self.converter_ocr = DoclingConverter(format_options=format_ocr)
+            
+            self.converter_general = DoclingConverter()
+
+            print("docling DocumentConverter(s) initialized (OCR + no-OCR + general).")
+        except Exception as e:
+            print(f"Error initializing docling DocumentConverter(s): {e}")
+            self.converter_no_ocr = None
+            self.converter_ocr = None
+            self.converter_general = None
+
+    def convert_to_markdown(self, file_path: str) -> List[Tuple[str, Dict[str, Any]]]:
+        """
+        Converts a document to a single Markdown string, preserving layout and tables.
+        Supports PDF, DOCX, HTML, and other formats.
+        """
+        if not (self.converter_no_ocr and self.converter_ocr and self.converter_general):
+            print("docling converters not available. Skipping conversion.")
+            return []
+        
+        file_ext = os.path.splitext(file_path)[1].lower()
+        if file_ext not in self.SUPPORTED_FORMATS:
+            print(f"Unsupported file format: {file_ext}")
+            return []
+        
+        input_format = self.SUPPORTED_FORMATS[file_ext]
+        
+        if input_format == InputFormat.PDF:
+            return self._convert_pdf_to_markdown(file_path)
+        else:
+            return self._convert_general_to_markdown(file_path, input_format)
+    
+    def _convert_pdf_to_markdown(self, pdf_path: str) -> List[Tuple[str, Dict[str, Any]]]:
+        """Convert PDF with OCR detection logic."""
+        # Quick heuristic: if the PDF already contains a text layer, skip OCR for speed
+        def _pdf_has_text(path: str) -> bool:
+            try:
+                doc = fitz.open(path)
+                for page in doc:
+                    if page.get_text("text").strip():
+                        return True
+            except Exception:
+                pass
+            return False
+
+        use_ocr = not _pdf_has_text(pdf_path)
+        converter = self.converter_ocr if use_ocr else self.converter_no_ocr
+        ocr_msg = "(OCR enabled)" if use_ocr else "(no OCR)"
+
+        print(f"Converting {pdf_path} to Markdown using docling {ocr_msg}...")
+        return self._perform_conversion(pdf_path, converter, ocr_msg)
+    
+    def _convert_general_to_markdown(self, file_path: str, input_format: InputFormat) -> List[Tuple[str, Dict[str, Any]]]:
+        """Convert non-PDF formats using general converter."""
+        print(f"Converting {file_path} ({input_format.name}) to Markdown using docling...")
+        return self._perform_conversion(file_path, self.converter_general, f"({input_format.name})")
+    
+    def _perform_conversion(self, file_path: str, converter, format_msg: str) -> List[Tuple[str, Dict[str, Any]]]:
+        """Perform the actual conversion using the specified converter."""
+        pages_data = []
+        try:
+            result = converter.convert(file_path)
+            markdown_content = result.document.export_to_markdown()
+            
+            metadata = {"source": file_path}
+            # Return the *DoclingDocument* object as third tuple element so downstream
+            # chunkers that understand the element tree can use it.  Legacy callers that
+            # expect only (markdown, metadata) can simply ignore the extra value.
+            pages_data.append((markdown_content, metadata, result.document))
+            print(f"Successfully converted {file_path} with docling {format_msg}.")
+            return pages_data
+        except Exception as e:
+            print(f"Error processing {file_path} with docling: {e}")
+            return []
diff --git a/rag_system/ingestion/pdf_converter.py b/rag_system/ingestion/pdf_converter.py
diff --git a/rag_system/pipelines/indexing_pipeline.py b/rag_system/pipelines/indexing_pipeline.py
@@ -1,7 +1,7 @@
 from typing import List, Dict, Any
 import os
 import networkx as nx
-from rag_system.ingestion.pdf_converter import PDFConverter
+from rag_system.ingestion.document_converter import DocumentConverter
 from rag_system.ingestion.chunking import MarkdownRecursiveChunker
 from rag_system.indexing.representations import EmbeddingGenerator, select_embedder
 from rag_system.indexing.embedders import LanceDBManager, VectorIndexer
@@ -15,7 +15,7 @@ def __init__(self, config: Dict[str, Any], ollama_client: OllamaClient, ollama_c
         self.config = config
         self.llm_client = ollama_client
         self.ollama_config = ollama_config
-        self.pdf_converter = PDFConverter()
+        self.document_converter = DocumentConverter()
         # Chunker selection: docling (token-based) or legacy (character-based)
         chunker_mode = config.get("chunker_mode", "docling")
         
@@ -157,7 +157,7 @@ def run(self, file_paths: List[str] | None = None, *, documents: List[str] | Non
                         document_id = os.path.basename(file_path)
                         print(f"Processing: {document_id}")
                         
-                        pages_data = self.pdf_converter.convert_to_markdown(file_path)
+                        pages_data = self.document_converter.convert_to_markdown(file_path)
                         file_chunks = []
                         
                         for tpl in pages_data:
diff --git a/simple_create_index.sh b/simple_create_index.sh
@@ -71,7 +71,7 @@ validate_documents() {
         if [ -f "$doc" ]; then
             # Check file extension
             case "${doc##*.}" in
-                pdf|txt|docx|md)
+                pdf|txt|docx|md|html|htm)
                     valid_docs+=("$doc")
                     print_status "✓ Valid document: $doc"
                     ;;
@@ -188,7 +188,7 @@ show_usage() {
     echo "  $0 \"Research Papers\" \"paper1.pdf\" \"paper2.pdf\" \"notes.txt\""
     echo "  $0 \"Invoice Collection\" ./invoices/*.pdf"
     echo ""
-    echo "Supported file types: PDF, TXT, DOCX, MD"
+    echo "Supported file types: PDF, TXT, DOCX, MD, HTML"
 }
 
 # Main script
@@ -225,4 +225,4 @@ main() {
 }
 
 # Run main function with all arguments
-main "$@" 
+main "$@"  
diff --git a/src/components/IndexForm.tsx b/src/components/IndexForm.tsx
@@ -98,8 +98,8 @@ export function IndexForm({ onClose, onIndexed }: Props) {
             onDrop={(e)=>{e.preventDefault(); if(e.dataTransfer.files) setFiles(e.dataTransfer.files)}}
           >
             <svg width="32" height="32" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round" className="mb-2 text-white/80"><path d="M4 16v2a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2v-2"/><polyline points="7 10 12 5 17 10"/><line x1="12" y1="5" x2="12" y2="16"/></svg>
-            <span className="text-xs text-gray-400">Drag & Drop PDFs here or click to browse</span>
-            <input id="file-upload" type="file" accept="application/pdf" multiple className="hidden" onChange={(e)=>setFiles(e.target.files)} />
+            <span className="text-xs text-gray-400">Drag & Drop documents here or click to browse</span>
+            <input id="file-upload" type="file" accept="application/pdf,.docx,.html,.htm" multiple className="hidden" onChange={(e)=>setFiles(e.target.files)} />
           </label>
           {files && <p className="mt-1 text-xs text-green-400">{files.length} file(s) selected</p>}
         </div>
@@ -220,4 +220,4 @@ export function IndexForm({ onClose, onIndexed }: Props) {
       </div>
     </div>
   );
-}      
+}            
diff --git a/src/components/IndexWizard.tsx b/src/components/IndexWizard.tsx
@@ -24,8 +24,8 @@ export function IndexWizard({ onClose }: Props) {
 
         <div className="space-y-4">
           <div>
-            <label className="block text-sm mb-1">PDF files</label>
-            <input type="file" accept="application/pdf" multiple onChange={handleFile} className="text-sm" />
+            <label className="block text-sm mb-1">Document files</label>
+            <input type="file" accept="application/pdf,.docx,.html,.htm" multiple onChange={handleFile} className="text-sm" />
           </div>
 
           <div className="grid grid-cols-2 gap-4">
@@ -69,4 +69,4 @@ export function IndexWizard({ onClose }: Props) {
       </div>
     </div>
   );
-} 
+}  
diff --git a/src/components/ui/chat-input.tsx b/src/components/ui/chat-input.tsx
@@ -89,8 +89,12 @@ export function ChatInput({
         lastModified: file.lastModified
       });
       
-      // Only allow PDF files for now
-      if (file.type === 'application/pdf') {
+      if (file.type === 'application/pdf' || 
+          file.type === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
+          file.type === 'text/html' ||
+          file.name.toLowerCase().endsWith('.html') ||
+          file.name.toLowerCase().endsWith('.htm') ||
+          file.name.toLowerCase().endsWith('.docx')) {
         newFiles.push({
           id: crypto.randomUUID(),
           name: file.name,
@@ -99,7 +103,7 @@ export function ChatInput({
           file: file,
         })
       } else {
-        console.log('🔧 Frontend: File rejected - not PDF:', file.type);
+        console.log('🔧 Frontend: File rejected - unsupported format:', file.type);
       }
     }
 
@@ -153,7 +157,7 @@ export function ChatInput({
 
         <div className="bg-white/5 backdrop-blur border border-white/10 rounded-2xl px-5 pt-4 pb-3 space-y-2">
           {/* Hidden file input (kept for future use) */}
-          <input ref={fileInputRef} type="file" accept=".pdf" multiple onChange={handleFileChange} className="hidden" />
+          <input ref={fileInputRef} type="file" accept=".pdf,.docx,.html,.htm" multiple onChange={handleFileChange} className="hidden" />
 
           {/* Textarea */}
           <textarea
@@ -200,4 +204,4 @@ export function ChatInput({
       </form>
     </div>
   )
-} 
+}  
diff --git a/src/components/ui/empty-chat-state.tsx b/src/components/ui/empty-chat-state.tsx
@@ -115,8 +115,12 @@ export function EmptyChatState({
         const newFiles: AttachedFile[] = [];
         for (let i = 0; i < files.length; i++) {
             const file = files[i];
-            // Only allow PDF files for now
-            if (file.type === 'application/pdf') {
+            if (file.type === 'application/pdf' || 
+                file.type === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
+                file.type === 'text/html' ||
+                file.name.toLowerCase().endsWith('.html') ||
+                file.name.toLowerCase().endsWith('.htm') ||
+                file.name.toLowerCase().endsWith('.docx')) {
                 newFiles.push({
                     id: crypto.randomUUID(),
                     name: file.name,
@@ -220,7 +224,7 @@ export function EmptyChatState({
                     <input
                         ref={fileInputRef}
                         type="file"
-                        accept=".pdf"
+                        accept=".pdf,.docx,.html,.htm"
                         multiple
                         onChange={handleFileChange}
                         className="hidden"
@@ -278,4 +282,4 @@ export function EmptyChatState({
             </div>
         </div>
     );
-} 
+}  
diff --git a/src/test-upload.html b/src/test-upload.html