This guide helps you migrate from using standalone scripts to the new consolidated library.
The new library consolidates all functionality into a unified, object-oriented interface while maintaining backward compatibility with existing scripts.
# Old approach - separate scripts
python scripts/fetch-public-files.py
python scripts/process-pdfs.py
python scripts/generate-search-index.py
# New approach - unified library
from epstein_files import Hub
hub = Hub()
hub.fetch_public_files()
hub.process_documents()
hub.generate_search_index()
# Install in development mode
pip install -e .
# Or install normally
pip install .
Before:
# scripts/fetch-public-files.py
import requests
# ... manual implementation
After:
from epstein_files import Hub
hub = Hub()
results = hub.fetch_public_files(sources=["fbi_vault", "doj"])
print(f"Fetched {results['total_files']} files")
Before:
# scripts/process-pdfs.py
import pypdf
# ... manual implementation
After:
from epstein_files import Hub
hub = Hub()
results = hub.process_documents(enable_ocr=True)
print(f"Processed {results['total_processed']} documents")
Before:
# scripts/generate-search-index.py
import json
# ... manual implementation
After:
from epstein_files import Hub
hub = Hub()
results = hub.generate_search_index()
print(f"Indexed {results['total_documents']} documents")
export DATA_DIR=data
export ENABLE_OCR=true
from epstein_files import Hub
hub = Hub()
# Access configuration
data_dir = hub.config.get("data_dir")
# Modify configuration
hub.config.set("enable_ocr", True)
hub.config.set("max_workers", 8)
# Get all paths
paths = hub.config.get_paths()
import json
with open("data/file.json", "r") as f:
    data = json.load(f)
from epstein_files import Hub
hub = Hub()
# Load data
data = hub.data.load_json("data/file.json")
# Save data
hub.data.save_json(data, "data/output.json")
# List files
files = hub.data.list_files("data/public_files", "*.pdf")
# Get statistics
stats = hub.data.get_statistics()
# Manual cache implementation required
from epstein_files import Hub
hub = Hub()
# Cache data
hub.cache.set("key", value, namespace="processing")
# Retrieve cached data
value = hub.cache.get("key", namespace="processing")
# Cache decorator
@hub.cache.cached(namespace="wikipedia", ttl=168)
def expensive_operation():
    return result
# Get cache statistics
stats = hub.cache.get_stats()
#!/bin/bash
python scripts/fetch-public-files.py
python scripts/fetch-wikipedia-data.py
python scripts/process-pdfs.py
python scripts/generate-search-index.py
from epstein_files import Hub
hub = Hub()
# Run entire pipeline
results = hub.run_full_pipeline(force_refresh=False)
# Or use CLI
# epstein-hub pipeline
The library provides a command-line interface:
# Get system status
epstein-hub status
# Fetch public files
epstein-hub fetch
# Process documents
epstein-hub process
# Generate search index
epstein-hub index
# Run full pipeline
epstein-hub pipeline
# Cleanup
epstein-hub cleanup
# Use debug mode
epstein-hub status --debug
# Force refresh
epstein-hub fetch --force
All original scripts continue to work:
python scripts/fetch-public-files.py
python scripts/process-pdfs.py
python scripts/generate-search-index.py
You can migrate gradually:
from epstein_files import Hub
hub = Hub()
# Access subsystems directly
hub.public_files.fetch_fbi_vault()
hub.wikipedia.fetch_character_data("Name")
hub.pdf_processor.process_file("doc.pdf")
hub.search_indexer.build_index()
hub.agents.run_agent("pdf_analysis", task)
from epstein_files import Hub
# Automatic cleanup
with Hub() as hub:
    hub.fetch_public_files()
    hub.process_documents()
    hub.generate_search_index()
# Cleanup happens automatically
from epstein_files import Hub
from pathlib import Path
# Custom config path
hub = Hub(config_path=Path(".env.custom"))
# Or modify after initialization
hub.config.set("data_dir", "custom_data")
hub.config.set("max_workers", 16)
hub.config.ensure_directories()
Before:
# my_script.py
import scripts.fetch_public_files as fetch
fetch.main()
After:
# my_script.py
from epstein_files import Hub
hub = Hub()
hub.fetch_public_files()
Before:
import os
data_dir = os.getenv("DATA_DIR", "data")
After:
from epstein_files import Hub
hub = Hub()
data_dir = hub.config.get("data_dir")
Before:
import json
from pathlib import Path
data_path = Path("data/file.json")
with open(data_path, "r") as f:
    data = json.load(f)
After:
from epstein_files import Hub
hub = Hub()
data = hub.data.load_json("data/file.json")
# test_migration.py
from epstein_files import Hub
def test_basic_operations():
    hub = Hub()
    # Test status
    status = hub.get_status()
    assert status['config']['valid']
    # Test operations
    results = hub.fetch_public_files()
    assert 'total_files' in results
    results = hub.process_documents()
    assert 'total_processed' in results
    results = hub.generate_search_index()
    assert 'total_documents' in results
# Solution: Install the library
pip install -e .
# Solution: Ensure .env file exists or create config
from epstein_files import Hub
hub = Hub()
hub.config.ensure_directories()
# Solution: Check PYTHONPATH
import sys
sys.path.insert(0, '/path/to/Hub_of_Epstein_Files_Directory')
from epstein_files import Hub
For help with migration:
See docs/LIBRARY_DOCUMENTATION.md for the full API reference, and the examples/ directory for usage examples. The new library provides:
Start migrating today to take advantage of these benefits!