Basic Examples
Simple PDF Conversion
Convert a PDF to Markdown:
# Minimal end-to-end example: parse a local PDF and print it as Markdown.
from docling.document_converter import DocumentConverter

doc_converter = DocumentConverter()
conversion = doc_converter.convert("document.pdf")
print(conversion.document.export_to_markdown())
Convert from URL
Convert a document directly from a URL:
# Convert a document fetched directly from a URL and save the Markdown output.
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
result = converter.convert("https://arxiv.org/pdf/2408.09869")
markdown = result.document.export_to_markdown()

# Save to file. Pin encoding="utf-8": the default text encoding is
# locale-dependent (e.g. cp1252 on Windows) and can corrupt non-ASCII output.
with open("output.md", "w", encoding="utf-8") as f:
    f.write(markdown)
Multiple Export Formats
Export the same document to multiple formats:
# Export one converted document to Markdown, HTML, and JSON side by side.
from docling.document_converter import DocumentConverter
import json

converter = DocumentConverter()
result = converter.convert("document.pdf")
doc = result.document

# Export to different formats. encoding="utf-8" is pinned on every text
# file: the platform default encoding is locale-dependent and can corrupt
# non-ASCII characters (e.g. cp1252 on Windows).
with open("output.md", "w", encoding="utf-8") as f:
    f.write(doc.export_to_markdown())
with open("output.html", "w", encoding="utf-8") as f:
    f.write(doc.export_to_html())
with open("output.json", "w", encoding="utf-8") as f:
    json.dump(doc.export_to_dict(), f, indent=2)
RAG Applications
Prepare Documents for RAG
Convert documents for use in Retrieval-Augmented Generation systems:
# Convert a document and split the resulting Markdown into overlapping
# chunks suitable for indexing in a RAG pipeline.
from docling.document_converter import DocumentConverter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Convert document
converter = DocumentConverter()
result = converter.convert("document.pdf")
markdown = result.document.export_to_markdown()

# 1000-character chunks, with 200 characters of overlap between neighbours
# so context is not lost at chunk boundaries.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_text(markdown)

# Create embeddings and store in vector database
# ... your RAG pipeline ...
Using with LangChain
Integrate Docling with LangChain for RAG:
# Load a document through LangChain's Docling loader and index it in FAISS.
from langchain_community.document_loaders import DoclingLoader
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

# Load document with Docling
documents = DoclingLoader("document.pdf").load()

# Embed the loaded documents and build an in-memory FAISS index
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)

# Use in RAG chain
# ... your RAG chain ...
Document Analysis
Extract Document Statistics
Analyze document structure and content:
# Summarize the structure of a converted document (page/table/image counts).
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
result = converter.convert("document.pdf")
doc = result.document

# Document statistics.
# NOTE(review): assumes the document object exposes `pages`, `tables`,
# `images`, and a per-page `content` collection — confirm against the
# Docling datamodel of the installed version.
stats = {
    "pages": len(doc.pages),
    "tables": len(doc.tables),
    "images": len(doc.images),
    "total_elements": sum(len(page.content) for page in doc.pages)
}

# Plain string literal: the original f-prefix had no placeholders (ruff F541).
print("Document Statistics:")
for key, value in stats.items():
    print(f" {key}: {value}")
Extract All Tables
Extract and process all tables from a document:
# Print every table found in the document as Markdown.
from docling.document_converter import DocumentConverter
import pandas as pd

converter = DocumentConverter()
result = converter.convert("document.pdf")
doc = result.document

# Walk the tables in document order, numbering them from 1 for display.
for index, table in enumerate(doc.tables, start=1):
    print(f"Table {index}:")
    print(table.export_to_markdown())
    # Convert to pandas DataFrame (if needed)
    # df = pd.read_html(table.export_to_html())[0]
    print()
Extract Images
Extract and save images from documents:
# Enumerate the images found in a document and report their target paths.
from docling.document_converter import DocumentConverter
import os

converter = DocumentConverter()
result = converter.convert("document.pdf")
doc = result.document

# Create output directory (no-op when it already exists)
os.makedirs("extracted_images", exist_ok=True)

# Extract images, numbering the output files from 1.
for idx, image in enumerate(doc.images, start=1):
    output_path = f"extracted_images/image_{idx}.png"
    # Save image (implementation depends on image object structure)
    # NOTE(review): assumes each image object carries a `filename` attribute.
    print(f"Extracted: {image.filename} -> {output_path}")
Batch Processing
Process Multiple Documents
Convert multiple documents in a directory:
# Convert every PDF in documents/ to Markdown in output/, continuing past
# individual failures so one bad file does not abort the whole batch.
from docling.document_converter import DocumentConverter
import os
from pathlib import Path

converter = DocumentConverter()
input_dir = Path("documents")
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)

# Process all PDFs in directory
for pdf_file in input_dir.glob("*.pdf"):
    try:
        result = converter.convert(str(pdf_file))
        output_file = output_dir / f"{pdf_file.stem}.md"
        # encoding pinned so output is identical across platforms
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(result.document.export_to_markdown())
        print(f"Converted: {pdf_file.name} -> {output_file.name}")
    except Exception as e:
        # Broad catch is deliberate for batch jobs: report and keep going.
        print(f"Error processing {pdf_file.name}: {e}")
Parallel Processing
Process multiple documents in parallel:
# Convert multiple PDFs in parallel with a thread pool.
from docling.document_converter import DocumentConverter
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

def convert_document(pdf_path):
    """Convert one PDF; return (filename, markdown) or (filename, None) on failure."""
    converter = DocumentConverter()
    try:
        result = converter.convert(str(pdf_path))
        return pdf_path.name, result.document.export_to_markdown()
    except Exception as e:
        # Surface the failure (the original discarded the exception
        # silently), then signal it to the caller with None.
        print(f"Error processing {pdf_path.name}: {e}")
        return pdf_path.name, None

# Process documents in parallel
pdf_files = list(Path("documents").glob("*.pdf"))
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)  # the original assumed output/ already existed

with ThreadPoolExecutor(max_workers=4) as executor:
    futures = {executor.submit(convert_document, pdf): pdf for pdf in pdf_files}
    for future in as_completed(futures):
        name, markdown = future.result()
        if markdown:
            # Bug fix: strip the .pdf suffix so we write foo.md, not
            # foo.pdf.md (matches the batch-processing example above).
            output_file = output_dir / f"{Path(name).stem}.md"
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(markdown)
            print(f"Converted: {name}")
Advanced Examples
Using VLM Pipeline
Use Visual Language Models for enhanced understanding:
# Run conversion through the VLM pipeline backed by the GraniteDocling model.
from docling.document_converter import DocumentConverter

# Use VLM pipeline with GraniteDocling
vlm_converter = DocumentConverter(pipeline="vlm", vlm_model="granite_docling")
conversion = vlm_converter.convert("complex_document.pdf")
print(conversion.document.export_to_markdown())
On Apple Silicon, MLX acceleration is automatically used.
Custom Processing Pipeline
Create a custom processing workflow:
# Custom conversion workflow: convert once, export in the caller's format.
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat

def process_document(input_path, output_format="markdown"):
    """Convert `input_path` and return it serialized in `output_format`.

    Supported formats: "markdown", "html", "json".
    Raises ValueError for any other value of `output_format`.
    """
    converter = DocumentConverter(
        format=InputFormat.PDF,
        # Add custom configuration
    )
    doc = converter.convert(input_path).document

    # Guard-clause dispatch on the requested serialization.
    if output_format == "markdown":
        return doc.export_to_markdown()
    if output_format == "html":
        return doc.export_to_html()
    if output_format == "json":
        import json
        return json.dumps(doc.export_to_dict(), indent=2)
    raise ValueError(f"Unknown format: {output_format}")

# Use the function
markdown = process_document("document.pdf", "markdown")
Error Handling and Retry Logic
Robust error handling for production use:
# Production-style conversion with retries and exponential backoff.
from docling.document_converter import DocumentConverter
from docling.exceptions import DoclingError
import time

def convert_with_retry(source, max_retries=3):
    """Convert `source`, retrying DoclingError failures up to `max_retries` times."""
    converter = DocumentConverter()
    for attempt in range(max_retries):
        try:
            return converter.convert(source).document
        except DoclingError:
            if attempt >= max_retries - 1:
                raise  # out of attempts: let the final failure propagate
            wait_time = 2 ** attempt  # exponential backoff: 1s, 2s, 4s, ...
            print(f"Attempt {attempt + 1} failed, retrying in {wait_time}s...")
            time.sleep(wait_time)
    return None  # only reachable when max_retries <= 0

# Use with retry logic
doc = convert_with_retry("document.pdf")
Integration Examples
LlamaIndex Integration
Use Docling with LlamaIndex:
# Index a Docling-parsed document with LlamaIndex and query it.
from llama_index.readers.docling import DoclingReader
from llama_index import VectorStoreIndex

# Load documents
documents = DoclingReader().load_data("document.pdf")

# Create index
index = VectorStoreIndex.from_documents(documents)

# Query
query_engine = index.as_query_engine()
response = query_engine.query("What is this document about?")
print(response)
Haystack Integration
Use Docling with Haystack:
# Feed Docling-converted documents into a Haystack in-memory document store.
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import DoclingConverter

# Convert document
documents = DoclingConverter().convert("document.pdf")

# Store in document store
store = InMemoryDocumentStore()
store.write_documents(documents)

# Use in Haystack pipeline
# ... your Haystack pipeline ...
CLI Examples
Basic CLI Usage
Convert documents from the command line:
# Convert a document fetched directly from a URL
docling https://arxiv.org/pdf/2206.01062
# Convert a local file
docling document.pdf
# Write the converted output to a specific file with -o
docling document.pdf -o output.md
VLM Pipeline via CLI
Use VLM pipeline from command line:
docling --pipeline vlm --vlm-model granite_docling document.pdf
See the Docling CLI documentation to learn more about all available commands and options.
More Resources
- Getting Started Guide
- Complete Documentation
- Framework Integrations
- GitHub Repository - More examples in the repo