Technology • February 28, 2024
GenAI Data Ingestion Just Got Easier with Unstructured.io and Astra DB
# Install Unstructured with all document-type extras.
pip install "unstructured[all-docs]"
# Install the Astra DB extra for Unstructured (one command per line; chaining
# "pip install" on the same line would make pip treat "pip" and "install" as
# package names).
pip install "unstructured[astra]"
# Install the Hugging Face embeddings integration for LlamaIndex.
pip install llama-index-embeddings-huggingface
# NOTE(review): the snippets below also import `dotenv`, `llama_index.core` and
# `llama_index.vector_stores.astra`; confirm those packages are installed too.
# Fetch the Astra DB pricing page, partition it with Unstructured, and print
# the resulting elements.
from unstructured.partition.html import partition_html

# Page that is downloaded and partitioned.
url = "https://www.datastax.com/pricing/astra-db"
elements = partition_html(url=url)

# Print the string form of every element, separated by a blank line.
print("\n\n".join(str(el) for el in elements))
# Ingest the Astra DB pricing page into an Astra DB collection:
#   1. partition the page and write each element's text to a local .txt file,
#   2. run Unstructured's LocalRunner over those files with chunking and
#      embedding enabled,
#   3. hand the results to an AstraWriter, which writes them to Astra DB.
import os

from dotenv import load_dotenv
from unstructured.ingest.connector.astra import (
    AstraAccessConfig,
    AstraWriteConfig,
    SimpleAstraConfig,
)
from unstructured.ingest.connector.local import SimpleLocalConfig
from unstructured.ingest.interfaces import (
    ChunkingConfig,
    EmbeddingConfig,
    PartitionConfig,
    ProcessorConfig,
    ReadConfig,
)
from unstructured.ingest.runner import LocalRunner
from unstructured.ingest.runner.writers.astra import AstraWriter
from unstructured.ingest.runner.writers.base_writer import Writer
from unstructured.partition.html import partition_html

# Make the ASTRA_DB_* settings from a local .env file visible to os.getenv().
load_dotenv()

url = "https://www.datastax.com/pricing/astra-db"
elements = partition_html(url=url)

# Directory that the local source connector reads from.
input_dir = "local-input-to-astra"
os.makedirs(input_dir, exist_ok=True)

for elem in elements:
    # Write the text to local txt files (one file per element, named by its id).
    # An explicit encoding keeps the output independent of the platform default.
    with open(f"{input_dir}/{elem.id}.txt", "w", encoding="utf-8") as f:
        f.write(elem.text)


def get_writer() -> Writer:
    """Build the Astra DB destination writer from environment variables.

    Reads ASTRA_DB_API_ENDPOINT and ASTRA_DB_APPLICATION_TOKEN for access,
    ASTRA_DB_COLLECTION_NAME (default ``"unstructured"``) for the target
    collection, and ASTRA_DB_EMBEDDING_DIMENSION (default ``384``) for the
    vector size.

    Returns:
        An ``AstraWriter`` configured with a write batch size of 20.

    Raises:
        ValueError: If ASTRA_DB_EMBEDDING_DIMENSION is set to a non-integer.
    """
    return AstraWriter(
        connector_config=SimpleAstraConfig(
            access_config=AstraAccessConfig(
                api_endpoint=os.getenv("ASTRA_DB_API_ENDPOINT"),
                token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"),
            ),
            collection_name=os.getenv("ASTRA_DB_COLLECTION_NAME", "unstructured"),
            # os.getenv() returns a str when the variable is set; coerce so the
            # dimension is always an int, whether it comes from the env or the
            # default.
            embedding_dimension=int(
                os.getenv("ASTRA_DB_EMBEDDING_DIMENSION", "384")
            ),
        ),
        write_config=AstraWriteConfig(batch_size=20),
    )


writer = get_writer()

runner = LocalRunner(
    processor_config=ProcessorConfig(
        verbose=True,
        output_dir="local-output-to-astra",
        num_processes=2,
    ),
    connector_config=SimpleLocalConfig(
        input_path=input_dir,
    ),
    read_config=ReadConfig(),
    partition_config=PartitionConfig(),
    chunking_config=ChunkingConfig(chunk_elements=True),
    embedding_config=EmbeddingConfig(
        provider="langchain-huggingface",
    ),
    writer=writer,
    writer_kwargs={},
)
runner.run()
# Query the ingested Astra DB collection through LlamaIndex.
import os

from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.astra import AstraDBVectorStore

# Connect to the collection using the same ASTRA_DB_* environment variables
# that the ingestion step reads.
astra_db_store = AstraDBVectorStore(
    token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"),
    api_endpoint=os.getenv("ASTRA_DB_API_ENDPOINT"),
    collection_name=os.getenv("ASTRA_DB_COLLECTION_NAME", "unstructured"),
    # os.getenv() returns a str when the variable is set; coerce so the
    # dimension is always an int.
    embedding_dimension=int(os.getenv("ASTRA_DB_EMBEDDING_DIMENSION", "384")),
)

# NOTE(review): the query-time embedding model should be the same model that
# produced the stored vectors; confirm that the "langchain-huggingface"
# provider used during ingestion embeds with BAAI/bge-small-en-v1.5 as well.
index = VectorStoreIndex.from_vector_store(
    vector_store=astra_db_store,
    embed_model=HuggingFaceEmbedding(
        model_name="BAAI/bge-small-en-v1.5"
    ),
)

query_engine = index.as_query_engine()
response = query_engine.query(
    "how much is the astra db free tier?"
)
print(response.response)
The Astra DB free tier provides $25 monthly credit for the first three months, allowing users to explore the service without incurring costs during this initial period.