Skip to content

Commit d5929ce

Browse files
feat: Add support for DOCX and HTML file formats using docling
- Rename PDFConverter to DocumentConverter with multi-format support
- Add SUPPORTED_FORMATS mapping for PDF, DOCX, HTML, HTM extensions
- Update indexing pipeline to use DocumentConverter
- Update file validation across all frontend components and scripts
- Preserve existing PDF OCR detection logic
- Add format-specific conversion methods for different document types

Co-Authored-By: PromptEngineer <[email protected]>
1 parent 6f69e61 commit d5929ce

File tree

11 files changed

+149
-103
lines changed

11 files changed

+149
-103
lines changed

create_index_script.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def select_documents(self) -> List[str]:
101101
elif choice == "2":
102102
dir_path = self.get_user_input("Enter directory path")
103103
if os.path.isdir(dir_path):
104-
supported_extensions = ['.pdf', '.txt', '.docx', '.md']
104+
supported_extensions = ['.pdf', '.txt', '.docx', '.md', '.html', '.htm']
105105
found_docs = []
106106

107107
for ext in supported_extensions:
@@ -369,4 +369,4 @@ def main():
369369

370370

371371
if __name__ == "__main__":
372-
main()
372+
main()

demo_batch_indexing.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ def validate_documents(self, documents: List[str]) -> List[str]:
113113
if os.path.exists(doc_path):
114114
# Check file extension
115115
ext = Path(doc_path).suffix.lower()
116-
if ext in ['.pdf', '.txt', '.docx', '.md']:
116+
if ext in ['.pdf', '.txt', '.docx', '.md', '.html', '.htm']:
117117
valid_documents.append(doc_path)
118118
print(f" ✅ {doc_path}")
119119
else:
@@ -383,4 +383,4 @@ def main():
383383

384384

385385
if __name__ == "__main__":
386-
main()
386+
main()
rag_system/ingestion/document_converter.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
from typing import List, Tuple, Dict, Any
2+
from docling.document_converter import DocumentConverter as DoclingConverter, PdfFormatOption
3+
from docling.datamodel.pipeline_options import PdfPipelineOptions, OcrMacOptions
4+
from docling.datamodel.base_models import InputFormat
5+
import fitz # PyMuPDF for quick text inspection
6+
import os
7+
8+
class DocumentConverter:
    """
    Convert documents to structured Markdown using the docling library.

    The input format is selected from the file extension via
    ``SUPPORTED_FORMATS`` (PDF, DOCX, HTML/HTM). PDFs are routed to an OCR or
    a no-OCR docling pipeline depending on whether a text layer is detected;
    every other supported format goes through a default docling converter.
    """

    # Mapping of lower-cased file extensions to the docling InputFormat
    SUPPORTED_FORMATS = {
        '.pdf': InputFormat.PDF,
        '.docx': InputFormat.DOCX,
        '.html': InputFormat.HTML,
        '.htm': InputFormat.HTML,
    }

    def __init__(self):
        """
        Build the three docling converters used by this class.

        * ``converter_no_ocr`` -- PDF pipeline with OCR disabled (fast path).
        * ``converter_ocr``    -- PDF pipeline with full-page OCR forced,
          configured through ``OcrMacOptions``.
        * ``converter_general`` -- default docling converter for non-PDF input.

        If anything fails during setup the error is printed and all three
        attributes are set to ``None``; ``convert_to_markdown`` then returns
        an empty list instead of raising.
        """
        try:
            # --- Converter WITHOUT OCR (fast path) ---
            pipeline_no_ocr = PdfPipelineOptions()
            pipeline_no_ocr.do_ocr = False
            format_no_ocr = {
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_no_ocr)
            }
            self.converter_no_ocr = DoclingConverter(format_options=format_no_ocr)

            # --- Converter WITH OCR (fallback) ---
            # NOTE(review): OcrMacOptions presumably targets the macOS OCR
            # backend only -- confirm behaviour on other platforms.
            pipeline_ocr = PdfPipelineOptions()
            pipeline_ocr.do_ocr = True
            pipeline_ocr.ocr_options = OcrMacOptions(force_full_page_ocr=True)
            format_ocr = {
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_ocr)
            }
            self.converter_ocr = DoclingConverter(format_options=format_ocr)

            # --- Converter for non-PDF formats (docling defaults) ---
            self.converter_general = DoclingConverter()

            print("docling DocumentConverter(s) initialized (OCR + no-OCR + general).")
        except Exception as e:
            # Deliberate best effort: leave the object usable but inert.
            print(f"Error initializing docling DocumentConverter(s): {e}")
            self.converter_no_ocr = None
            self.converter_ocr = None
            self.converter_general = None

    def convert_to_markdown(self, file_path: str) -> List[Tuple[str, Dict[str, Any], Any]]:
        """
        Convert a document to Markdown, dispatching on its file extension.

        Args:
            file_path: Path of the document; the extension is matched
                case-insensitively against ``SUPPORTED_FORMATS``.

        Returns:
            A list holding one ``(markdown, metadata, docling_document)``
            tuple on success. An empty list is returned when the converters
            failed to initialize, the extension is unsupported, or the
            conversion itself raised.
        """
        if not (self.converter_no_ocr and self.converter_ocr and self.converter_general):
            print("docling converters not available. Skipping conversion.")
            return []

        file_ext = os.path.splitext(file_path)[1].lower()
        input_format = self.SUPPORTED_FORMATS.get(file_ext)
        if input_format is None:
            print(f"Unsupported file format: {file_ext}")
            return []

        if input_format == InputFormat.PDF:
            return self._convert_pdf_to_markdown(file_path)
        return self._convert_general_to_markdown(file_path, input_format)

    @staticmethod
    def _pdf_has_text(path: str) -> bool:
        """
        Return True if any page of the PDF at *path* has a non-blank text layer.

        The document is opened in a ``with`` block so the file handle is
        released even when a page fails to parse. Any error while opening or
        reading is treated as "no text", which sends the file down the OCR
        pipeline.
        """
        try:
            with fitz.open(path) as doc:
                return any(page.get_text("text").strip() for page in doc)
        except Exception:
            # Best effort heuristic only; fall back to OCR on any failure.
            return False

    def _convert_pdf_to_markdown(self, pdf_path: str) -> List[Tuple[str, Dict[str, Any], Any]]:
        """
        Convert a PDF, choosing the OCR or no-OCR converter automatically.

        Quick heuristic: if the PDF already contains a text layer, OCR is
        skipped for speed; otherwise the forced full-page OCR converter is used.
        """
        use_ocr = not self._pdf_has_text(pdf_path)
        converter = self.converter_ocr if use_ocr else self.converter_no_ocr
        ocr_msg = "(OCR enabled)" if use_ocr else "(no OCR)"

        print(f"Converting {pdf_path} to Markdown using docling {ocr_msg}...")
        return self._perform_conversion(pdf_path, converter, ocr_msg)

    def _convert_general_to_markdown(self, file_path: str, input_format: InputFormat) -> List[Tuple[str, Dict[str, Any], Any]]:
        """Convert a non-PDF document (e.g. DOCX, HTML) with the general converter."""
        print(f"Converting {file_path} ({input_format.name}) to Markdown using docling...")
        return self._perform_conversion(file_path, self.converter_general, f"({input_format.name})")

    def _perform_conversion(self, file_path: str, converter, format_msg: str) -> List[Tuple[str, Dict[str, Any], Any]]:
        """
        Run *converter* on *file_path* and package the result.

        Args:
            file_path: Document to convert.
            converter: The docling converter instance to use.
            format_msg: Short label appended to the success log line.

        Returns:
            ``[(markdown, {"source": file_path}, docling_document)]`` on
            success, or ``[]`` if docling raised (the error is printed).
        """
        try:
            result = converter.convert(file_path)
            markdown_content = result.document.export_to_markdown()
            metadata = {"source": file_path}
            # Return the *DoclingDocument* object as third tuple element so downstream
            # chunkers that understand the element tree can use it. Callers that only
            # need (markdown, metadata) can ignore the extra value.
            pages_data = [(markdown_content, metadata, result.document)]
            print(f"Successfully converted {file_path} with docling {format_msg}.")
            return pages_data
        except Exception as e:
            print(f"Error processing {file_path} with docling: {e}")
            return []

rag_system/ingestion/pdf_converter.py

Lines changed: 0 additions & 76 deletions
This file was deleted.

rag_system/pipelines/indexing_pipeline.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from typing import List, Dict, Any
22
import os
33
import networkx as nx
4-
from rag_system.ingestion.pdf_converter import PDFConverter
4+
from rag_system.ingestion.document_converter import DocumentConverter
55
from rag_system.ingestion.chunking import MarkdownRecursiveChunker
66
from rag_system.indexing.representations import EmbeddingGenerator, select_embedder
77
from rag_system.indexing.embedders import LanceDBManager, VectorIndexer
@@ -15,7 +15,7 @@ def __init__(self, config: Dict[str, Any], ollama_client: OllamaClient, ollama_c
1515
self.config = config
1616
self.llm_client = ollama_client
1717
self.ollama_config = ollama_config
18-
self.pdf_converter = PDFConverter()
18+
self.document_converter = DocumentConverter()
1919
# Chunker selection: docling (token-based) or legacy (character-based)
2020
chunker_mode = config.get("chunker_mode", "docling")
2121

@@ -157,7 +157,7 @@ def run(self, file_paths: List[str] | None = None, *, documents: List[str] | Non
157157
document_id = os.path.basename(file_path)
158158
print(f"Processing: {document_id}")
159159

160-
pages_data = self.pdf_converter.convert_to_markdown(file_path)
160+
pages_data = self.document_converter.convert_to_markdown(file_path)
161161
file_chunks = []
162162

163163
for tpl in pages_data:

simple_create_index.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ validate_documents() {
7171
if [ -f "$doc" ]; then
7272
# Check file extension
7373
case "${doc##*.}" in
74-
pdf|txt|docx|md)
74+
pdf|txt|docx|md|html|htm)
7575
valid_docs+=("$doc")
7676
print_status "✓ Valid document: $doc"
7777
;;
@@ -188,7 +188,7 @@ show_usage() {
188188
echo " $0 \"Research Papers\" \"paper1.pdf\" \"paper2.pdf\" \"notes.txt\""
189189
echo " $0 \"Invoice Collection\" ./invoices/*.pdf"
190190
echo ""
191-
echo "Supported file types: PDF, TXT, DOCX, MD"
191+
echo "Supported file types: PDF, TXT, DOCX, MD, HTML"
192192
}
193193

194194
# Main script
@@ -225,4 +225,4 @@ main() {
225225
}
226226

227227
# Run main function with all arguments
228-
main "$@"
228+
main "$@"

src/components/IndexForm.tsx

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,8 @@ export function IndexForm({ onClose, onIndexed }: Props) {
9898
onDrop={(e)=>{e.preventDefault(); if(e.dataTransfer.files) setFiles(e.dataTransfer.files)}}
9999
>
100100
<svg width="32" height="32" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round" className="mb-2 text-white/80"><path d="M4 16v2a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2v-2"/><polyline points="7 10 12 5 17 10"/><line x1="12" y1="5" x2="12" y2="16"/></svg>
101-
<span className="text-xs text-gray-400">Drag & Drop PDFs here or click to browse</span>
102-
<input id="file-upload" type="file" accept="application/pdf" multiple className="hidden" onChange={(e)=>setFiles(e.target.files)} />
101+
<span className="text-xs text-gray-400">Drag & Drop documents here or click to browse</span>
102+
<input id="file-upload" type="file" accept="application/pdf,.docx,.html,.htm" multiple className="hidden" onChange={(e)=>setFiles(e.target.files)} />
103103
</label>
104104
{files && <p className="mt-1 text-xs text-green-400">{files.length} file(s) selected</p>}
105105
</div>
@@ -220,4 +220,4 @@ export function IndexForm({ onClose, onIndexed }: Props) {
220220
</div>
221221
</div>
222222
);
223-
}
223+
}

src/components/IndexWizard.tsx

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ export function IndexWizard({ onClose }: Props) {
2424

2525
<div className="space-y-4">
2626
<div>
27-
<label className="block text-sm mb-1">PDF files</label>
28-
<input type="file" accept="application/pdf" multiple onChange={handleFile} className="text-sm" />
27+
<label className="block text-sm mb-1">Document files</label>
28+
<input type="file" accept="application/pdf,.docx,.html,.htm" multiple onChange={handleFile} className="text-sm" />
2929
</div>
3030

3131
<div className="grid grid-cols-2 gap-4">
@@ -69,4 +69,4 @@ export function IndexWizard({ onClose }: Props) {
6969
</div>
7070
</div>
7171
);
72-
}
72+
}

src/components/ui/chat-input.tsx

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,12 @@ export function ChatInput({
8989
lastModified: file.lastModified
9090
});
9191

92-
// Only allow PDF files for now
93-
if (file.type === 'application/pdf') {
92+
if (file.type === 'application/pdf' ||
93+
file.type === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
94+
file.type === 'text/html' ||
95+
file.name.toLowerCase().endsWith('.html') ||
96+
file.name.toLowerCase().endsWith('.htm') ||
97+
file.name.toLowerCase().endsWith('.docx')) {
9498
newFiles.push({
9599
id: crypto.randomUUID(),
96100
name: file.name,
@@ -99,7 +103,7 @@ export function ChatInput({
99103
file: file,
100104
})
101105
} else {
102-
console.log('🔧 Frontend: File rejected - not PDF:', file.type);
106+
console.log('🔧 Frontend: File rejected - unsupported format:', file.type);
103107
}
104108
}
105109

@@ -153,7 +157,7 @@ export function ChatInput({
153157

154158
<div className="bg-white/5 backdrop-blur border border-white/10 rounded-2xl px-5 pt-4 pb-3 space-y-2">
155159
{/* Hidden file input (kept for future use) */}
156-
<input ref={fileInputRef} type="file" accept=".pdf" multiple onChange={handleFileChange} className="hidden" />
160+
<input ref={fileInputRef} type="file" accept=".pdf,.docx,.html,.htm" multiple onChange={handleFileChange} className="hidden" />
157161

158162
{/* Textarea */}
159163
<textarea
@@ -200,4 +204,4 @@ export function ChatInput({
200204
</form>
201205
</div>
202206
)
203-
}
207+
}

src/components/ui/empty-chat-state.tsx

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,12 @@ export function EmptyChatState({
115115
const newFiles: AttachedFile[] = [];
116116
for (let i = 0; i < files.length; i++) {
117117
const file = files[i];
118-
// Only allow PDF files for now
119-
if (file.type === 'application/pdf') {
118+
if (file.type === 'application/pdf' ||
119+
file.type === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
120+
file.type === 'text/html' ||
121+
file.name.toLowerCase().endsWith('.html') ||
122+
file.name.toLowerCase().endsWith('.htm') ||
123+
file.name.toLowerCase().endsWith('.docx')) {
120124
newFiles.push({
121125
id: crypto.randomUUID(),
122126
name: file.name,
@@ -220,7 +224,7 @@ export function EmptyChatState({
220224
<input
221225
ref={fileInputRef}
222226
type="file"
223-
accept=".pdf"
227+
accept=".pdf,.docx,.html,.htm"
224228
multiple
225229
onChange={handleFileChange}
226230
className="hidden"
@@ -278,4 +282,4 @@ export function EmptyChatState({
278282
</div>
279283
</div>
280284
);
281-
}
285+
}

0 commit comments

Comments
 (0)