Content Loading

Load and extract content from multiple sources including web URLs, PDF files, Word documents, and text files with a unified interface.

Why Content Loading?

AI applications often need to process content from various sources. SimplerLLM's content loading tools provide a unified interface for extracting text from:

  • Web Articles: Extract clean text from blog posts and news articles
  • PDF Documents: Read text from PDF files
  • Word Documents: Extract content from .docx files
  • Text Files: Load plain text and CSV files
  • Automatic Metadata: every loaded document includes word counts, character counts, and file sizes

Basic Usage

Universal Content Loader

Use load_content() to automatically detect and load content from any supported source:

from SimplerLLM.tools.generic_loader import load_content

# Load from URL
doc = load_content("https://example.com/blog-post")
print(f"Title: {doc.title}")
print(f"Words: {doc.word_count}")
print(f"Content: {doc.content[:200]}...")

# Load from PDF
doc = load_content("document.pdf")
print(f"Pages extracted: {doc.word_count // 250}")  # ~250 words per page
print(f"Content: {doc.content[:200]}...")

# Load from Word document
doc = load_content("report.docx")
print(f"Paragraphs: ~{doc.content.count(chr(10))}")

# Load from text file
doc = load_content("notes.txt")
print(f"Characters: {doc.character_count}")

TextDocument Model

All loaded content returns a TextDocument object with rich metadata:

from SimplerLLM.tools.generic_loader import load_content

doc = load_content("example.pdf")

# Access content and metadata
print(f"Content: {doc.content}")
print(f"Word Count: {doc.word_count}")
print(f"Character Count: {doc.character_count}")
print(f"File Size: {doc.file_size} bytes")
print(f"Title: {doc.title}")  # Available for URLs
print(f"Source: {doc.url_or_path}")

Supported Formats

Web Articles

Extract clean text from web pages, automatically removing ads, navigation, and other noise:

from SimplerLLM.tools.generic_loader import load_content

# Load blog article
url = "https://blog.example.com/ai-trends-2024"
doc = load_content(url)

print(f"Article Title: {doc.title}")
print(f"Reading Time: ~{doc.word_count // 200} minutes")
print(f"\nContent Preview:")
print(doc.content[:500])

PDF Files

from SimplerLLM.tools.generic_loader import load_content

# Load PDF document
doc = load_content("research_paper.pdf")

print(f"Document Stats:")
print(f"- Total Words: {doc.word_count:,}")
print(f"- Total Characters: {doc.character_count:,}")
print(f"- File Size: {doc.file_size / 1024:.2f} KB")

# Use the content with an LLM
from SimplerLLM.language.llm import LLM, LLMProvider

llm = LLM.create(provider=LLMProvider.OPENAI, model_name="gpt-4o")
summary = llm.generate_response(
    prompt=f"Summarize this document in 3 bullet points:\n\n{doc.content}"
)
print(f"\nSummary:\n{summary}")

Word Documents

from SimplerLLM.tools.generic_loader import load_content

# Load Word document
doc = load_content("meeting_notes.docx")

print(f"Document contains {doc.word_count} words")
print(f"\nContent:\n{doc.content}")

Text Files

from SimplerLLM.tools.generic_loader import load_content

# Load text file
doc = load_content("data.txt")

print(f"Lines: ~{doc.content.count(chr(10))}")
print(f"Words: {doc.word_count}")
print(f"Content:\n{doc.content}")

CSV File Handling

For CSV files, use the specialized read_csv_file() function:

from SimplerLLM.tools.file_loader import read_csv_file

# Load CSV file
csv_doc = read_csv_file("sales_data.csv")

print(f"CSV Stats:")
print(f"- Rows: {csv_doc.row_count}")
print(f"- Columns: {csv_doc.column_count}")
print(f"- Total Fields: {csv_doc.total_fields}")
print(f"- File Size: {csv_doc.file_size / 1024:.2f} KB")

# Access the data
print(f"\nFirst 5 rows:")
for i, row in enumerate(csv_doc.content[:5], 1):
    print(f"{i}. {row}")

# Convert to analysis prompt
rows_text = "\n".join([", ".join(row) for row in csv_doc.content[:10]])
prompt = f"Analyze this CSV data:\n\n{rows_text}\n\nProvide insights."

Real-World Examples

RAG System with Multiple Sources

from SimplerLLM.tools.generic_loader import load_content
from SimplerLLM.language.embeddings import EmbeddingsOpenAI
from SimplerLLM.vectors.vector_db import VectorDB
from SimplerLLM.language.llm import LLM, LLMProvider

class MultiSourceRAG:
    """Retrieval-augmented generation over documents loaded from mixed sources.

    Sources (URLs, PDFs, Word docs, text files) are embedded and stored in a
    local vector database; questions are answered from the most relevant ones.
    """

    def __init__(self):
        self.embeddings = EmbeddingsOpenAI()
        self.vector_db = VectorDB.create(
            provider='local',
            embeddings_instance=self.embeddings
        )
        self.llm = LLM.create(
            provider=LLMProvider.OPENAI,
            model_name="gpt-4o"
        )

    def add_sources(self, sources):
        """Load, embed, and index each source; failures are reported, not raised."""
        for source in sources:
            try:
                document = load_content(source)

                # One embedding per whole document.
                vector = self.embeddings.generate_embeddings(document.content)

                # Keep enough metadata to rebuild context at query time.
                self.vector_db.add(
                    vector=vector,
                    metadata={
                        'content': document.content,
                        'source': document.url_or_path,
                        'title': document.title or 'Untitled',
                        'word_count': document.word_count
                    }
                )
                print(f"✓ Added: {document.title or source}")
            except Exception as e:
                print(f"✗ Failed to load {source}: {e}")

    def query(self, question, top_k=3):
        """Answer *question* from the top_k most similar indexed sources.

        Returns a (answer, search_results) tuple.
        """
        question_vector = self.embeddings.generate_embeddings(question)
        hits = self.vector_db.search(question_vector, top_k=top_k)

        # Truncated excerpts keep the prompt within a reasonable size.
        snippets = []
        for hit in hits:
            meta = hit['metadata']
            snippets.append(f"Source: {meta['title']}\n{meta['content'][:500]}...")
        context = "\n\n".join(snippets)

        prompt = f"""Based on these sources, answer the question.

Sources:
{context}

Question: {question}

Answer:"""

        answer = self.llm.generate_response(prompt=prompt)
        return answer, hits

# Usage
rag = MultiSourceRAG()

# Add diverse sources
sources = [
    "https://blog.example.com/ai-overview",
    "research_paper.pdf",
    "meeting_notes.docx",
    "technical_spec.txt"
]
rag.add_sources(sources)

# Query the system
answer, sources = rag.query("What are the key AI trends mentioned?")
print(f"Answer: {answer}\n")
print("Sources used:")
for s in sources:
    print(f"- {s['metadata']['title']}")

Batch Document Processing

from SimplerLLM.tools.generic_loader import load_content
from SimplerLLM.language.llm import LLM, LLMProvider
import os

class DocumentProcessor:
    """Batch-process every supported document in a directory with an LLM.

    Each file with a supported extension is loaded via load_content() and
    sent to the model with a task-specific prompt.
    """

    # task name -> (prompt prefix, max characters of content to include)
    _TASKS = {
        "summarize": ("Summarize in 2 sentences: ", 2000),
        "extract_key_points": ("Extract 3 key points: ", 2000),
        "classify": ("Classify this document (tech/business/other): ", 1000),
    }

    def __init__(self):
        self.llm = LLM.create(
            provider=LLMProvider.OPENAI,
            model_name="gpt-4o-mini"
        )

    def process_directory(self, directory_path, task="summarize"):
        """Process all supported documents in *directory_path* (non-recursive).

        Args:
            directory_path: Directory to scan for documents.
            task: One of "summarize", "extract_key_points", "classify".

        Returns:
            A list of dicts with 'filename', 'word_count', and 'result' keys.

        Raises:
            ValueError: If *task* is not a supported task name.
        """
        # Fail fast on an unknown task — the original code would only hit an
        # undefined `prompt` (NameError) on the first file processed.
        if task not in self._TASKS:
            raise ValueError(
                f"Unknown task {task!r}; expected one of {sorted(self._TASKS)}"
            )
        prompt_prefix, content_limit = self._TASKS[task]

        results = []

        # Only files whose extension load_content() knows how to handle.
        supported_extensions = ['.txt', '.pdf', '.docx', '.csv']
        files = [
            f for f in os.listdir(directory_path)
            if any(f.endswith(ext) for ext in supported_extensions)
        ]

        print(f"Found {len(files)} documents to process...")

        for filename in files:
            file_path = os.path.join(directory_path, filename)

            try:
                doc = load_content(file_path)

                # Truncate content so the prompt stays within model limits.
                prompt = prompt_prefix + doc.content[:content_limit]
                response = self.llm.generate_response(prompt=prompt)

                results.append({
                    'filename': filename,
                    'word_count': doc.word_count,
                    'result': response
                })

                # Fixed: report the actual filename (was a broken
                # "(unknown)" placeholder).
                print(f"✓ Processed: {filename}")

            except Exception as e:
                print(f"✗ Error processing {filename}: {e}")

        return results

# Usage
processor = DocumentProcessor()
results = processor.process_directory("./documents", task="summarize")

# Display results
for result in results:
    print(f"\n{result['filename']} ({result['word_count']} words)")
    print(f"Summary: {result['result']}")

Web Content Aggregator

from SimplerLLM.tools.generic_loader import load_content
from SimplerLLM.language.llm import LLM, LLMProvider

class ContentAggregator:
    """Fetch several web articles and synthesize them into one summary."""

    def __init__(self):
        self.llm = LLM.create(
            provider=LLMProvider.OPENAI,
            model_name="gpt-4o"
        )

    def aggregate_articles(self, urls, topic):
        """Load each URL and ask the LLM for a combined synthesis.

        Returns a (synthesis, articles) tuple; articles that fail to load
        are skipped with a printed warning.
        """
        loaded = []

        print(f"Loading {len(urls)} articles...")
        for url in urls:
            try:
                doc = load_content(url)
                loaded.append({
                    'title': doc.title,
                    'url': url,
                    'content': doc.content,
                    'word_count': doc.word_count
                })
                print(f"✓ Loaded: {doc.title}")
            except Exception as e:
                print(f"✗ Failed to load {url}: {e}")

        # Excerpt each article so the combined prompt stays bounded.
        combined_content = "\n\n".join(
            f"Article: {a['title']}\n{a['content'][:1000]}..."
            for a in loaded
        )

        prompt = f"""Review these articles about {topic} and create a comprehensive summary.

{combined_content}

Provide:
1. Main themes (3-5 points)
2. Key insights
3. Common conclusions

Summary:"""

        synthesis = self.llm.generate_response(prompt=prompt)
        return synthesis, loaded

# Usage
aggregator = ContentAggregator()

urls = [
    "https://techblog.example.com/ai-article-1",
    "https://news.example.com/ai-article-2",
    "https://research.example.com/ai-article-3"
]

synthesis, articles = aggregator.aggregate_articles(
    urls,
    topic="AI developments in 2024"
)

print("\nSynthesis:")
print(synthesis)
print(f"\nBased on {len(articles)} articles totaling {sum(a['word_count'] for a in articles):,} words")

Best Practices

1. Error Handling

Always wrap load_content() in try-except blocks. URLs may be unreachable, files may be corrupted, or formats may be unsupported.

2. Check Content Size

Use doc.word_count to check content size before sending to LLMs. Large documents may need chunking.

3. Cache Loaded Content

Loading content from URLs or parsing large PDFs can be slow. Cache the results if you'll use them multiple times.

4. Validate URLs

When loading from URLs, validate that they're accessible and contain the expected content type before processing.

5. Clean Extracted Text

PDF and web extraction may include formatting artifacts. Consider cleaning the text before using it with LLMs.

Error Handling

from SimplerLLM.tools.generic_loader import load_content

def safe_load_content(source):
    """Load *source* defensively, returning None on any failure.

    Prints a specific message for each failure class (bad format, missing
    file, permissions, anything else) and warns when the loaded document
    is empty or suspiciously short.
    """
    try:
        document = load_content(source)

        # Reject documents whose extracted text is empty or whitespace-only.
        if not document.content or not document.content.strip():
            print(f"Warning: {source} loaded but contains no content")
            return None

        # Very short documents are often extraction failures — warn but keep.
        if document.word_count < 10:
            print(f"Warning: {source} has very little content ({document.word_count} words)")

        return document

    except ValueError as e:
        print(f"Invalid input or unsupported format: {e}")
    except FileNotFoundError:
        print(f"File not found: {source}")
    except PermissionError:
        print(f"Permission denied accessing: {source}")
    except Exception as e:
        print(f"Unexpected error loading {source}: {e}")

    return None

# Usage
sources = ["article.pdf", "https://example.com/blog", "missing.txt"]

for source in sources:
    doc = safe_load_content(source)
    if doc:
        print(f"✓ Loaded {source}: {doc.word_count} words")

Performance Considerations

Loading Speed by Format

  • Text Files: Very fast (<1ms for small files)
  • Word Documents: Fast (5-50ms)
  • PDFs: Moderate (50-500ms depending on size)
  • Web URLs: Slow (1-5 seconds due to network latency)
  • CSV Files: Fast to moderate (depends on file size)

Next Steps

💡 Pro Tip

Combine content loading with text chunking and embeddings to build powerful RAG systems. Load content from any source, chunk it appropriately, generate embeddings, and store in a vector database for semantic search.