Skip to main content

DataStore Factory Methods

DataStore provides over 20 factory methods to create instances from various data sources including local files, databases, cloud storage, and data lakes.

Universal URI Interface

The uri() method is the recommended universal entry point that auto-detects the source type:

from chdb.datastore import DataStore

# Local files
ds = DataStore.uri("data.csv")
ds = DataStore.uri("/path/to/data.parquet")

# Cloud storage
ds = DataStore.uri("s3://bucket/data.parquet?nosign=true")
ds = DataStore.uri("https://example.com/data.csv")

# Databases
ds = DataStore.uri("mysql://user:pass@host:3306/db/table")
ds = DataStore.uri("postgresql://user:pass@host:5432/db/table")

URI Syntax Reference

| Source Type | URI Format | Example |
| --- | --- | --- |
| Local file | path/to/file | data.csv, /abs/path/data.parquet |
| S3 | s3://bucket/path | s3://mybucket/data.parquet?nosign=true |
| GCS | gs://bucket/path | gs://mybucket/data.csv |
| Azure | az://container/path | az://mycontainer/data.parquet |
| HTTP/HTTPS | https://url | https://example.com/data.csv |
| MySQL | mysql://user:pass@host:port/db/table | mysql://root:pass@localhost:3306/mydb/users |
| PostgreSQL | postgresql://user:pass@host:port/db/table | postgresql://postgres:pass@localhost:5432/mydb/users |
| SQLite | sqlite:///path?table=name | sqlite:///data.db?table=users |
| ClickHouse | clickhouse://host:port/db/table | clickhouse://localhost:9000/default/hits |

File Sources

from_file

Create DataStore from a local or remote file with automatic format detection.

DataStore.from_file(path, format=None, compression=None, **kwargs)

Parameters:

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| path | str | required | File path (local or URL) |
| format | str | None | File format (auto-detected if None) |
| compression | str | None | Compression type (auto-detected if None) |

Supported formats: CSV, TSV, Parquet, JSON, JSONLines, ORC, Avro, Arrow

Examples:

from chdb.datastore import DataStore

# Auto-detect format from extension
ds = DataStore.from_file("data.csv")
ds = DataStore.from_file("data.parquet")
ds = DataStore.from_file("data.json")

# Explicit format
ds = DataStore.from_file("data.txt", format="CSV")

# With compression
ds = DataStore.from_file("data.csv.gz", compression="gzip")

Pandas-Compatible Read Functions

from chdb import datastore as pd

# CSV files
ds = pd.read_csv("data.csv")
ds = pd.read_csv("data.csv", sep=";", header=0, nrows=1000)

# Parquet files (recommended for large datasets)
ds = pd.read_parquet("data.parquet")
ds = pd.read_parquet("data.parquet", columns=['col1', 'col2'])

# JSON files
ds = pd.read_json("data.json")
ds = pd.read_json("data.jsonl", lines=True)

# Excel files
ds = pd.read_excel("data.xlsx", sheet_name="Sheet1")

Cloud Storage

from_s3

Create DataStore from Amazon S3.

DataStore.from_s3(url, access_key_id=None, secret_access_key=None, format=None, **kwargs)

Parameters:

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| url | str | required | S3 URL (s3://bucket/path) |
| access_key_id | str | None | AWS access key ID |
| secret_access_key | str | None | AWS secret access key |
| format | str | None | File format (auto-detected if None) |

Examples:

from chdb.datastore import DataStore

# Anonymous access (public bucket)
ds = DataStore.from_s3("s3://bucket/data.parquet")

# With credentials
ds = DataStore.from_s3(
    "s3://bucket/data.parquet",
    access_key_id="AKIAIOSFODNN7EXAMPLE",
    secret_access_key="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"
)

# Using URI with query parameters
ds = DataStore.uri("s3://bucket/data.parquet?nosign=true")
ds = DataStore.uri("s3://bucket/data.parquet?access_key_id=KEY&secret_access_key=SECRET")

from_gcs

Create DataStore from Google Cloud Storage.

DataStore.from_gcs(url, credentials_path=None, **kwargs)

Examples:

ds = DataStore.from_gcs("gs://bucket/data.parquet")
ds = DataStore.from_gcs("gs://bucket/data.parquet", credentials_path="/path/to/creds.json")

from_azure

Create DataStore from Azure Blob Storage.

DataStore.from_azure(url, account_name=None, account_key=None, **kwargs)

Examples:

ds = DataStore.from_azure(
    "az://container/data.parquet",
    account_name="myaccount",
    account_key="mykey"
)

from_hdfs

Create DataStore from HDFS.

DataStore.from_hdfs(url, **kwargs)

Examples:

ds = DataStore.from_hdfs("hdfs://namenode:8020/path/data.parquet")

from_url

Create DataStore from HTTP/HTTPS URL.

DataStore.from_url(url, format=None, **kwargs)

Examples:

ds = DataStore.from_url("https://example.com/data.csv")
ds = DataStore.from_url("https://raw.githubusercontent.com/user/repo/main/data.parquet")

Databases

from_mysql

Create DataStore from MySQL database.

DataStore.from_mysql(host, database, table, user, password, port=3306, **kwargs)

Parameters:

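| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| host | str | required | MySQL host |
| database | str | required | Database name |
| table | str | required | Table name |
| user | str | required | Username |
| password | str | required | Password |
| port | int | 3306 | Port number |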

Examples:

ds = DataStore.from_mysql(
    host="localhost",
    database="mydb",
    table="users",
    user="root",
    password="password"
)

# Using URI
ds = DataStore.uri("mysql://root:password@localhost:3306/mydb/users")

from_postgresql

Create DataStore from PostgreSQL database.

DataStore.from_postgresql(host, database, table, user, password, port=5432, **kwargs)

Examples:

ds = DataStore.from_postgresql(
    host="localhost",
    database="mydb",
    table="users",
    user="postgres",
    password="password"
)

# Using URI
ds = DataStore.uri("postgresql://postgres:password@localhost:5432/mydb/users")

from_clickhouse

Create DataStore from ClickHouse server.

DataStore.from_clickhouse(host, database, table, user=None, password=None, port=9000, **kwargs)

Examples:

ds = DataStore.from_clickhouse(
    host="localhost",
    database="default",
    table="hits",
    user="default",
    password=""
)

# Connection-level mode (explore databases)
ds = DataStore.from_clickhouse(
    host="analytics.company.com",
    user="analyst",
    password="secret"
)
ds.databases()                  # List databases
ds.tables("production")         # List tables
result = ds.sql("SELECT * FROM production.users LIMIT 10")

from_mongodb

Create DataStore from MongoDB.

DataStore.from_mongodb(uri, database, collection, **kwargs)

Examples:

ds = DataStore.from_mongodb(
    uri="mongodb://localhost:27017",
    database="mydb",
    collection="users"
)

from_sqlite

Create DataStore from SQLite database.

DataStore.from_sqlite(database_path, table, **kwargs)

Examples:

ds = DataStore.from_sqlite("data.db", table="users")

# Using URI
ds = DataStore.uri("sqlite:///data.db?table=users")

Data Lakes

from_iceberg

Create DataStore from Apache Iceberg table.

DataStore.from_iceberg(path, **kwargs)

Examples:

ds = DataStore.from_iceberg("/path/to/iceberg_table")
ds = DataStore.uri("iceberg://catalog/namespace/table")

from_delta

Create DataStore from Delta Lake table.

DataStore.from_delta(path, **kwargs)

Examples:

ds = DataStore.from_delta("/path/to/delta_table")
ds = DataStore.uri("deltalake:///path/to/delta_table")

from_hudi

Create DataStore from Apache Hudi table.

DataStore.from_hudi(path, **kwargs)

Examples:

ds = DataStore.from_hudi("/path/to/hudi_table")
ds = DataStore.uri("hudi:///path/to/hudi_table")

In-Memory Sources

from_df / from_dataframe

Create DataStore from pandas DataFrame.

DataStore.from_df(df, name=None)
DataStore.from_dataframe(df, name=None)  # alias

Examples:

import pandas
from chdb.datastore import DataStore

pdf = pandas.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
ds = DataStore.from_df(pdf)

DataFrame Constructor

Create DataStore using pandas-like constructor.

from chdb import datastore as pd

# From dictionary
ds = pd.DataFrame({
    'name': ['Alice', 'Bob'],
    'age': [25, 30]
})

# From pandas DataFrame
import pandas
pdf = pandas.DataFrame({'a': [1, 2, 3]})
ds = pd.DataFrame(pdf)

Special Sources

from_numbers

Create DataStore with sequential numbers (useful for testing).

DataStore.from_numbers(count, **kwargs)

Examples:

ds = DataStore.from_numbers(1000000)  # 1M rows with 'number' column
result = ds.filter(ds['number'] % 2 == 0).head(10)  # Even numbers

from_random

Create DataStore with random data.

DataStore.from_random(rows, columns, **kwargs)

Examples:

ds = DataStore.from_random(rows=1000, columns=5)

run_sql

Create DataStore from raw SQL query.

DataStore.run_sql(query)

Examples:

ds = DataStore.run_sql("""
    SELECT number, number * 2 as doubled
    FROM numbers(100)
    WHERE number % 10 = 0
""")

Summary Table

| Method | Source Type | Example |
| --- | --- | --- |
| uri() | Universal | DataStore.uri("s3://bucket/data.parquet") |
| from_file() | Local/Remote files | DataStore.from_file("data.csv") |
| read_csv() | CSV files | pd.read_csv("data.csv") |
| read_parquet() | Parquet files | pd.read_parquet("data.parquet") |
| from_s3() | Amazon S3 | DataStore.from_s3("s3://bucket/path") |
| from_gcs() | Google Cloud Storage | DataStore.from_gcs("gs://bucket/path") |
| from_azure() | Azure Blob | DataStore.from_azure("az://container/path") |
| from_hdfs() | HDFS | DataStore.from_hdfs("hdfs://host/path") |
| from_url() | HTTP/HTTPS | DataStore.from_url("https://example.com/data.csv") |
| from_mysql() | MySQL | DataStore.from_mysql(host, db, table, user, pass) |
| from_postgresql() | PostgreSQL | DataStore.from_postgresql(host, db, table, user, pass) |
| from_clickhouse() | ClickHouse | DataStore.from_clickhouse(host, db, table) |
| from_mongodb() | MongoDB | DataStore.from_mongodb(uri, db, collection) |
| from_sqlite() | SQLite | DataStore.from_sqlite("data.db", table) |
| from_iceberg() | Apache Iceberg | DataStore.from_iceberg("/path/to/table") |
| from_delta() | Delta Lake | DataStore.from_delta("/path/to/table") |
| from_hudi() | Apache Hudi | DataStore.from_hudi("/path/to/table") |
| from_df() | pandas DataFrame | DataStore.from_df(pandas_df) |
| DataFrame() | Dictionary/DataFrame | pd.DataFrame({'a': [1, 2, 3]}) |
| from_numbers() | Sequential numbers | DataStore.from_numbers(1000000) |
| from_random() | Random data | DataStore.from_random(rows=1000, columns=5) |
| run_sql() | Raw SQL | DataStore.run_sql("SELECT * FROM ...") |