Content Loading
Load and extract content from multiple sources including web URLs, PDF files, Word documents, and text files with a unified interface.
Why Content Loading?
AI applications often need to process content from various sources. SimplerLLM's content loading tools provide a unified interface for extracting text from:
- Web Articles: Extract clean text from blog posts and news articles
- PDF Documents: Read text from PDF files
- Word Documents: Extract content from .docx files
- Text Files: Load plain text and CSV files
- Plus automatic metadata extraction: word counts, character counts, and file sizes for every loaded document
Basic Usage
Universal Content Loader
Use load_content()
to automatically detect and load content from any supported source:
from SimplerLLM.tools.generic_loader import load_content

# Fetch a web article
web_doc = load_content("https://example.com/blog-post")
print(f"Title: {web_doc.title}")
print(f"Words: {web_doc.word_count}")
print(f"Content: {web_doc.content[:200]}...")

# Read a PDF file
pdf_doc = load_content("document.pdf")
print(f"Pages extracted: {pdf_doc.word_count // 250}")  # ~250 words per page
print(f"Content: {pdf_doc.content[:200]}...")

# Read a Word (.docx) file
word_doc = load_content("report.docx")
print(f"Paragraphs: ~{word_doc.content.count(chr(10))}")

# Read a plain text file
text_doc = load_content("notes.txt")
print(f"Characters: {text_doc.character_count}")
TextDocument Model
All loaded content returns a TextDocument
object with rich metadata:
from SimplerLLM.tools.generic_loader import load_content

document = load_content("example.pdf")

# Inspect the extracted text together with its metadata fields
print(f"Content: {document.content}")
print(f"Word Count: {document.word_count}")
print(f"Character Count: {document.character_count}")
print(f"File Size: {document.file_size} bytes")
print(f"Title: {document.title}")  # Available for URLs
print(f"Source: {document.url_or_path}")
Supported Formats
Web Articles
Extract clean text from web pages, automatically removing ads, navigation, and other noise:
from SimplerLLM.tools.generic_loader import load_content

# Pull down a blog article
article_url = "https://blog.example.com/ai-trends-2024"
article = load_content(article_url)

print(f"Article Title: {article.title}")
print(f"Reading Time: ~{article.word_count // 200} minutes")  # ~200 wpm
print(f"\nContent Preview:")
print(article.content[:500])
PDF Files
from SimplerLLM.tools.generic_loader import load_content
from SimplerLLM.language.llm import LLM, LLMProvider

# Parse the PDF into a TextDocument
paper = load_content("research_paper.pdf")

print(f"Document Stats:")
print(f"- Total Words: {paper.word_count:,}")
print(f"- Total Characters: {paper.character_count:,}")
print(f"- File Size: {paper.file_size / 1024:.2f} KB")

# Feed the extracted text straight into an LLM
llm = LLM.create(provider=LLMProvider.OPENAI, model_name="gpt-4o")
summary = llm.generate_response(
    prompt=f"Summarize this document in 3 bullet points:\n\n{paper.content}"
)
print(f"\nSummary:\n{summary}")
Word Documents
from SimplerLLM.tools.generic_loader import load_content

# Parse a .docx file
notes = load_content("meeting_notes.docx")
print(f"Document contains {notes.word_count} words")
print(f"\nContent:\n{notes.content}")
Text Files
from SimplerLLM.tools.generic_loader import load_content

# Read a plain text file from disk
plain = load_content("data.txt")
print(f"Lines: ~{plain.content.count(chr(10))}")
print(f"Words: {plain.word_count}")
print(f"Content:\n{plain.content}")
CSV File Handling
For CSV files, use the specialized read_csv_file()
function:
from SimplerLLM.tools.file_loader import read_csv_file

# Parse the CSV into rows plus metadata
table = read_csv_file("sales_data.csv")

print(f"CSV Stats:")
print(f"- Rows: {table.row_count}")
print(f"- Columns: {table.column_count}")
print(f"- Total Fields: {table.total_fields}")
print(f"- File Size: {table.file_size / 1024:.2f} KB")

# Peek at the parsed rows
print(f"\nFirst 5 rows:")
for index, row in enumerate(table.content[:5], start=1):
    print(f"{index}. {row}")

# Flatten a sample of rows into an LLM analysis prompt
rows_text = "\n".join(", ".join(row) for row in table.content[:10])
prompt = f"Analyze this CSV data:\n\n{rows_text}\n\nProvide insights."
Real-World Examples
RAG System with Multiple Sources
from SimplerLLM.tools.generic_loader import load_content
from SimplerLLM.language.embeddings import EmbeddingsOpenAI
from SimplerLLM.vectors.vector_db import VectorDB
from SimplerLLM.language.llm import LLM, LLMProvider
class MultiSourceRAG:
    """RAG pipeline that indexes content loaded from URLs, PDFs, and documents."""

    def __init__(self):
        self.embeddings = EmbeddingsOpenAI()
        self.vector_db = VectorDB.create(
            provider='local',
            embeddings_instance=self.embeddings,
        )
        self.llm = LLM.create(
            provider=LLMProvider.OPENAI,
            model_name="gpt-4o",
        )

    def add_sources(self, sources):
        """Load each source, embed its content, and store it in the vector DB."""
        for source in sources:
            try:
                loaded = load_content(source)
                vector = self.embeddings.generate_embeddings(loaded.content)
                self.vector_db.add(
                    vector=vector,
                    metadata={
                        'content': loaded.content,
                        'source': loaded.url_or_path,
                        'title': loaded.title or 'Untitled',
                        'word_count': loaded.word_count,
                    },
                )
                print(f"✓ Added: {loaded.title or source}")
            except Exception as e:
                # Best-effort ingestion: one bad source must not stop the batch
                print(f"✗ Failed to load {source}: {e}")

    def query(self, question, top_k=3):
        """Answer a question using the indexed sources; returns (answer, matches)."""
        question_vector = self.embeddings.generate_embeddings(question)
        matches = self.vector_db.search(question_vector, top_k=top_k)

        # Build a context string from a 500-char preview of each match
        snippets = []
        for match in matches:
            meta = match['metadata']
            snippets.append(f"Source: {meta['title']}\n{meta['content'][:500]}...")
        context = "\n\n".join(snippets)

        prompt = f"""Based on these sources, answer the question.
Sources:
{context}
Question: {question}
Answer:"""
        answer = self.llm.generate_response(prompt=prompt)
        return answer, matches
# Usage
rag = MultiSourceRAG()

# Index a mix of web and local sources
source_list = [
    "https://blog.example.com/ai-overview",
    "research_paper.pdf",
    "meeting_notes.docx",
    "technical_spec.txt",
]
rag.add_sources(source_list)

# Ask a question against everything indexed
answer, sources = rag.query("What are the key AI trends mentioned?")
print(f"Answer: {answer}\n")
print("Sources used:")
for s in sources:
    print(f"- {s['metadata']['title']}")
Batch Document Processing
from SimplerLLM.tools.generic_loader import load_content
from SimplerLLM.language.llm import LLM, LLMProvider
import os
class DocumentProcessor:
    """Batch-process every supported document in a directory with an LLM."""

    def __init__(self):
        self.llm = LLM.create(
            provider=LLMProvider.OPENAI,
            model_name="gpt-4o-mini"
        )

    def process_directory(self, directory_path, task="summarize"):
        """Process all documents in a directory.

        Args:
            directory_path: Directory to scan (non-recursive).
            task: "summarize", "extract_key_points", or "classify".

        Returns:
            List of dicts with 'filename', 'word_count', and 'result' keys;
            files that fail to load or process are skipped with a message.

        Raises:
            ValueError: If task is not one of the supported values.
        """
        # Validate up front: the original left `prompt` unbound for unknown
        # tasks, which surfaced as a confusing per-file UnboundLocalError.
        valid_tasks = ("summarize", "extract_key_points", "classify")
        if task not in valid_tasks:
            raise ValueError(f"Unsupported task {task!r}; expected one of {valid_tasks}")

        results = []

        # Only these extensions are handled by load_content()
        supported_extensions = ('.txt', '.pdf', '.docx', '.csv')
        files = [
            f for f in os.listdir(directory_path)
            if f.endswith(supported_extensions)  # endswith accepts a tuple
        ]
        print(f"Found {len(files)} documents to process...")

        for filename in files:
            file_path = os.path.join(directory_path, filename)
            try:
                # Load document
                doc = load_content(file_path)

                # Truncate long documents so the request stays small
                if task == "summarize":
                    prompt = f"Summarize in 2 sentences: {doc.content[:2000]}"
                elif task == "extract_key_points":
                    prompt = f"Extract 3 key points: {doc.content[:2000]}"
                else:  # task == "classify"
                    prompt = f"Classify this document (tech/business/other): {doc.content[:1000]}"

                response = self.llm.generate_response(prompt=prompt)
                results.append({
                    'filename': filename,
                    'word_count': doc.word_count,
                    'result': response
                })
                # Original prints were garbled ("(unknown)"); report the file
                print(f"✓ Processed: {filename}")
            except Exception as e:
                print(f"✗ Error processing {filename}: {e}")
        return results
# Usage
processor = DocumentProcessor()
summaries = processor.process_directory("./documents", task="summarize")

# Show each document's word count and generated summary
for entry in summaries:
    print(f"\n{entry['filename']} ({entry['word_count']} words)")
    print(f"Summary: {entry['result']}")
Web Content Aggregator
from SimplerLLM.tools.generic_loader import load_content
from SimplerLLM.language.llm import LLM, LLMProvider
class ContentAggregator:
    """Loads several web articles and synthesizes them into one summary."""

    def __init__(self):
        self.llm = LLM.create(
            provider=LLMProvider.OPENAI,
            model_name="gpt-4o"
        )

    def aggregate_articles(self, urls, topic):
        """Load the given URLs and return (synthesis, loaded_articles)."""
        articles = []
        print(f"Loading {len(urls)} articles...")

        for url in urls:
            try:
                page = load_content(url)
                articles.append({
                    'title': page.title,
                    'url': url,
                    'content': page.content,
                    'word_count': page.word_count,
                })
                print(f"✓ Loaded: {page.title}")
            except Exception as e:
                # Skip unreachable pages rather than aborting the batch
                print(f"✗ Failed to load {url}: {e}")

        # Stitch a 1000-char preview of every article into one prompt
        combined_content = "\n\n".join(
            f"Article: {entry['title']}\n{entry['content'][:1000]}..."
            for entry in articles
        )
        prompt = f"""Review these articles about {topic} and create a comprehensive summary.
{combined_content}
Provide:
1. Main themes (3-5 points)
2. Key insights
3. Common conclusions
Summary:"""
        synthesis = self.llm.generate_response(prompt=prompt)
        return synthesis, articles
# Usage
aggregator = ContentAggregator()

article_urls = [
    "https://techblog.example.com/ai-article-1",
    "https://news.example.com/ai-article-2",
    "https://research.example.com/ai-article-3",
]
synthesis, articles = aggregator.aggregate_articles(
    article_urls,
    topic="AI developments in 2024",
)

print("\nSynthesis:")
print(synthesis)
total_words = sum(a['word_count'] for a in articles)
print(f"\nBased on {len(articles)} articles totaling {total_words:,} words")
Best Practices
1. Error Handling
Always wrap load_content()
in try-except blocks. URLs may be unreachable, files may be corrupted, or formats may be unsupported.
2. Check Content Size
Use doc.word_count
to check content size before sending to LLMs. Large documents may need chunking.
3. Cache Loaded Content
Loading content from URLs or parsing large PDFs can be slow. Cache the results if you'll use them multiple times.
4. Validate URLs
When loading from URLs, validate that they're accessible and contain the expected content type before processing.
5. Clean Extracted Text
PDF and web extraction may include formatting artifacts. Consider cleaning the text before using it with LLMs.
Error Handling
from SimplerLLM.tools.generic_loader import load_content
def safe_load_content(source):
    """Safely load content with comprehensive error handling.

    Returns the loaded document, or None when the source is missing,
    unreadable, unsupported, or contains no text.
    """
    try:
        document = load_content(source)

        # Sanity-check the extracted text before handing it back
        if not document.content or not document.content.strip():
            print(f"Warning: {source} loaded but contains no content")
            return None
        if document.word_count < 10:
            print(f"Warning: {source} has very little content ({document.word_count} words)")
        return document
    except ValueError as e:
        print(f"Invalid input or unsupported format: {e}")
        return None
    except FileNotFoundError:
        print(f"File not found: {source}")
        return None
    except PermissionError:
        print(f"Permission denied accessing: {source}")
        return None
    except Exception as e:
        print(f"Unexpected error loading {source}: {e}")
        return None
# Usage
candidate_sources = ["article.pdf", "https://example.com/blog", "missing.txt"]
for source in candidate_sources:
    doc = safe_load_content(source)
    if doc:
        print(f"✓ Loaded {source}: {doc.word_count} words")
Performance Considerations
Loading Speed by Format
- Text Files: Very fast (<1ms for small files)
- Word Documents: Fast (5-50ms)
- PDFs: Moderate (50-500ms depending on size)
- Web URLs: Slow (1-5 seconds due to network latency)
- CSV Files: Fast to moderate (depends on file size)
Next Steps
Text Chunking →
Split loaded content into manageable chunks
Embeddings →
Generate embeddings from loaded content
💡 Pro Tip
Combine content loading with text chunking and embeddings to build powerful RAG systems. Load content from any source, chunk it appropriately, generate embeddings, and store in a vector database for semantic search.