The Epstein Files Hub Sovereign Level Monolithic Dense Library provides a comprehensive, centralized interface for all operations related to managing, processing, and searching Epstein-related files and documentation.
This monolithic architecture centralizes configuration, data management, caching, search indexing, document processing, and agent coordination behind a single Hub interface.
The Hub class is the sovereign interface that orchestrates all operations:
from epstein_files import Hub
# Initialize the hub
hub = Hub()
# Run operations
hub.fetch_public_files()
hub.process_documents()
hub.generate_search_index()
# Run complete pipeline
results = hub.run_full_pipeline()
# Get system status
status = hub.get_status()
Central configuration management:
from epstein_files import ConfigManager
config = ConfigManager()
# Get configuration
data_dir = config.get("data_dir")
enable_ocr = config.get("enable_ocr")
# Set configuration
config.set("debug_mode", True)
# Get all paths
paths = config.get_paths()
Central data operations:
from epstein_files import DataManager
data = DataManager(config)
# Save files
data.save_file(content, filepath, metadata)
# Load files
content = data.load_file(filepath)
# List files
files = data.list_files(directory, "*.pdf")
# Get statistics
stats = data.get_statistics()
Intelligent caching system:
from epstein_files import CacheManager
cache = CacheManager(config)
# Cache operations
cache.set("key", value, namespace="data")
value = cache.get("key", namespace="data")
# Cache decorator
@cache.cached(namespace="processing", ttl=24)
def expensive_operation():
return result
# Cache statistics
stats = cache.get_stats()
epstein_files/
├── __init__.py # Main package
├── core/ # Core functionality
│ ├── hub.py # Central Hub
│ ├── config_manager.py
│ ├── data_manager.py
│ └── cache_manager.py
├── data/ # Data handling
│ ├── public_files.py
│ └── wikipedia.py
├── search/ # Search and indexing
│ └── indexer.py
├── processing/ # Document processing
│ └── pdf_processor.py
├── agents/ # AI agent coordination
│ └── agent_manager.py
└── utils/ # Utility functions
# Install the package
pip install -e .
# Or with setup.py
python setup.py install
from epstein_files import Hub
# Create hub instance
with Hub() as hub:
# Fetch public files
results = hub.fetch_public_files()
print(f"Fetched {results['total_files']} files")
# Process documents
results = hub.process_documents()
print(f"Processed {results['total_processed']} documents")
# Generate search index
results = hub.generate_search_index()
print(f"Indexed {results['total_documents']} documents")
# Get system status
status = hub.get_status()
print(f"System status: {status}")
from epstein_files import Hub
hub = Hub()
# Force refresh all data
results = hub.run_full_pipeline(force_refresh=True)
# Access subsystems directly
hub.public_files.fetch_fbi_vault()
hub.wikipedia.fetch_character_data("John Doe")
hub.pdf_processor.process_file("document.pdf")
hub.search_indexer.search("query text")
hub.agents.run_agent("pdf_analysis", {"file": "doc.pdf"})
# Cleanup
hub.cleanup()
The library uses environment variables and .env files for configuration:
# .env file
DATA_DIR=data
CACHE_DIR=cache
LOGS_DIR=logs
ENABLE_OCR=true
MAX_WORKERS=4
DEBUG=false
Or configure programmatically:
from epstein_files import ConfigManager
config = ConfigManager()
config.set("enable_ocr", True)
config.set("max_workers", 8)
config.ensure_directories()
Methods:
- fetch_public_files(sources=None, force_refresh=False) - Fetch public files
- fetch_wikipedia_data(force_refresh=False) - Fetch Wikipedia data
- process_documents(input_dir=None, enable_ocr=None) - Process PDFs
- generate_search_index(force_rebuild=False) - Generate search index
- run_full_pipeline(force_refresh=False) - Run complete pipeline
- get_status() - Get system status
- cleanup() - Clean up temporary files

Methods:
- get(key, default=None) - Get configuration value
- set(key, value) - Set configuration value
- get_paths() - Get all directory paths
- ensure_directories() - Create all required directories
- validate() - Validate configuration

Methods:
- save_file(content, filepath, metadata=None) - Save file with metadata
- load_file(filepath, binary=False) - Load file
- list_files(directory, pattern="*", recursive=True) - List files
- save_json(data, filepath) - Save JSON data
- load_json(filepath) - Load JSON data
- get_statistics() - Get data statistics
- cleanup_temp_files() - Clean up temporary files

Methods:
- get(key, namespace="default", default=None) - Get from cache
- set(key, value, namespace="default", ttl=None) - Set in cache
- delete(key, namespace="default") - Delete from cache
- clear(namespace=None) - Clear cache
- cleanup_expired() - Clean up expired entries
- get_stats() - Get cache statistics
- cached(namespace="default", ttl=None) - Cache decorator

from epstein_files import Hub
hub = Hub()
# Fetch only FBI Vault files
results = hub.fetch_public_files(sources=["fbi_vault"])
print(f"Fetched {results['total_files']} FBI Vault files")
from epstein_files import Hub
from pathlib import Path
hub = Hub()
# Process PDFs from specific directory
input_dir = Path("data/custom_pdfs")
results = hub.process_documents(input_dir=input_dir, enable_ocr=True)
print(f"Processed: {results['total_processed']}")
print(f"Failed: {results['total_failed']}")
from epstein_files import Hub
hub = Hub()
# Use cache decorator
@hub.cache.cached(namespace="custom", ttl=48)
def expensive_computation():
# Do expensive work
return result
# Get cache statistics
stats = hub.cache.get_stats()
print(f"Cache size: {stats['total_size_mb']} MB")
from epstein_files import Hub
hub = Hub()
# Run specific agent
task = {"file": "document.pdf", "operation": "analyze"}
result = hub.agents.run_agent("pdf_analysis", task)
# Get agent status
status = hub.agents.get_status()
print(f"Active agents: {status['active_agents']}")
# Run tests
pytest tests/
# Run with coverage
pytest --cov=epstein_files tests/
# Run specific test
pytest tests/test_hub.py
MIT License - See LICENSE file for details
See CONTRIBUTING.md for guidelines on contributing to this library.