from upsonic import Agent, Task, KnowledgeBase
from upsonic.loaders import HTMLLoader, HTMLLoaderConfig
from upsonic.embeddings import OpenAIEmbedding, OpenAIEmbeddingConfig
from upsonic.text_splitter import HTMLChunker, HTMLChunkingConfig
from upsonic.vectordb import ChromaProvider, ChromaConfig, ConnectionConfig, Mode
# Chunking options for the HTML splitter, gathered in one mapping so they can
# be reviewed at a glance before being handed to the config object.
# NOTE(review): option semantics (size units, tag handling) are defined by
# upsonic's HTMLChunkingConfig — confirm against its documentation.
chunking_options = {
    "chunk_size": 512,
    "chunk_overlap": 50,
    "split_on_tags": ["h1", "h2", "h3", "p"],
    "preserve_whole_tags": ["table", "pre"],
}
splitter_config = HTMLChunkingConfig(**chunking_options)

# The chunker is built from the config above and later passed to KnowledgeBase.
splitter = HTMLChunker(splitter_config)
# Build the KnowledgeBase pieces one at a time: loader, embedding provider,
# vector store, then the KnowledgeBase that ties them together.

# Loader and embedding provider both use their default configurations.
loader_config = HTMLLoaderConfig()
loader = HTMLLoader(loader_config)

embedding_config = OpenAIEmbeddingConfig()
embedding = OpenAIEmbedding(embedding_config)

# Chroma vector store using the IN_MEMORY connection mode.
# NOTE(review): vector_size=1536 presumably has to match the dimension of the
# embeddings produced by `embedding` — confirm for the configured model.
chroma_connection = ConnectionConfig(mode=Mode.IN_MEMORY)
chroma_config = ChromaConfig(
    collection_name="html_docs",
    vector_size=1536,
    connection=chroma_connection,
)
vectordb = ChromaProvider(chroma_config)

# Wire the single source URL together with the components created above.
kb = KnowledgeBase(
    sources=["https://example.com/article"],
    embedding_provider=embedding,
    vectordb=vectordb,
    loaders=[loader],
    splitters=[splitter],
)
# Run a single task against the knowledge base and print whatever the agent
# returns.
model_name = "openai/gpt-4o"
task_description = "Extract main content"

agent = Agent(model_name)

# The knowledge base is supplied to the task through its `context` list.
task = Task(task_description, context=[kb])

result = agent.do(task)
print(result)