import narwhals as nw
import pandas as pd
import polars as pl
# Create sample data in both formats
# Sample employee records as a pandas DataFrame.
pandas_df = pd.DataFrame(
    {
        'name': ['Alice', 'Bob', 'Charlie', 'David'],
        'age': [25, 30, 35, 40],
        'salary': [50000, 60000, 75000, 90000],
        'department': ['IT', 'HR', 'IT', 'Finance'],
    }
)
# The same employee records as a Polars DataFrame.
polars_df = pl.DataFrame(
    {
        'name': ['Alice', 'Bob', 'Charlie', 'David'],
        'age': [25, 30, 35, 40],
        'salary': [50000, 60000, 75000, 90000],
        'department': ['IT', 'HR', 'IT', 'Finance'],
    }
)
# Single function works with both!
def compute_stats(df):
    """Aggregate projected salaries per department for staff aged 30+.

    Backend-agnostic: the native frame (pandas, Polars, ...) is wrapped
    with ``nw.from_native``, and the result is converted back to the
    caller's native frame type before returning.
    """
    frame = nw.from_native(df)
    thirty_plus = frame.filter(nw.col('age') >= 30)
    projected = thirty_plus.with_columns(
        [(nw.col('salary') * 1.15).alias('salary_projected')]
    )
    per_department = projected.group_by('department').agg(
        [
            nw.col('salary_projected').mean().alias('avg_salary'),
            nw.col('age').min().alias('min_age'),
            nw.len().alias('count'),
        ]
    )
    return nw.to_native(per_department)
# Demonstrate that one function handles both backends.
# Works with pandas
print("Pandas result:")
print(compute_stats(pandas_df))
# Works with polars
print("\nPolars result:")
print(compute_stats(polars_df))

# Narwhals and Ibis Examples
# Narwhals: Cross-Backend DataFrame Processing
# Basic Operations
# String Operations
import narwhals as nw
def clean_names(df):
    """Add normalized name/department helper columns, backend-agnostically.

    Produces three derived columns: a lowercased name, the name's
    character count, and the department with 'IT' expanded to
    'Technology'. Returns the same native frame type that was passed in.
    """
    df = nw.from_native(df)
    result = df.with_columns([
        nw.col('name').str.to_lowercase().alias('name_lower'),
        # str.len_chars() is the Narwhals spelling of string length;
        # the expression API has no str.len() method.
        nw.col('name').str.len_chars().alias('name_length'),
        nw.col('department').str.replace('IT', 'Technology').alias('dept_full')
    ])
    return nw.to_native(result)
# Works with any backend
cleaned_pandas = clean_names(pandas_df)
cleaned_polars = clean_names(polars_df)

# Date Operations
import narwhals as nw
import pandas as pd
from datetime import datetime, timedelta
# Ten consecutive days of sales figures for the date examples.
df = pd.DataFrame(
    {
        'date': pd.date_range('2024-01-01', periods=10, freq='D'),
        'sales': [100, 150, 200, 175, 225, 250, 300, 275, 325, 350],
    }
)
def analyze_dates(df):
    """Split a 'date' column into year/month/day/weekday columns.

    Backend-agnostic via Narwhals; returns the caller's native frame type.
    """
    frame = nw.from_native(df)
    date_parts = [
        nw.col('date').dt.year().alias('year'),
        nw.col('date').dt.month().alias('month'),
        nw.col('date').dt.day().alias('day'),
        nw.col('date').dt.weekday().alias('weekday'),
    ]
    return nw.to_native(frame.with_columns(date_parts))
result = analyze_dates(df)

# Ibis: Cross-Database Queries
# Connecting to Different Backends
import ibis
# DuckDB (in-memory or file-backed)
duckdb_con = ibis.duckdb.connect()
duckdb_file = ibis.duckdb.connect('analytics.ddb')

# SQLite
sqlite_con = ibis.sqlite.connect('data.db')

# PostgreSQL
postgres_con = ibis.postgres.connect(
    host='localhost',
    port=5432,
    database='mydb',
    user='user',
    password='password',
)

# Read from files directly (via DuckDB): Parquet/CSV are scanned lazily,
# no explicit load step required.
con = ibis.duckdb.connect()
table = con.read_parquet('data.parquet')
csv_table = con.read_csv('data.csv')

# Basic Queries
import ibis
from ibis import _
con = ibis.duckdb.connect()

# Create a table from a DataFrame
import pandas as pd

df = pd.DataFrame({
    'product': ['A', 'B', 'A', 'C', 'B', 'A'],
    'quantity': [10, 20, 15, 30, 25, 12],
    'price': [100, 200, 100, 300, 200, 100],
})
con.create_table('sales', df)
sales = con.table('sales')

# Query with Ibis: the expression is built lazily and only compiled
# to SQL when executed.
result = (
    sales
    .mutate(revenue=_.quantity * _.price)
    .group_by('product')
    .agg(
        total_quantity=_.quantity.sum(),
        total_revenue=_.revenue.sum(),
        avg_price=_.price.mean(),
    )
    .order_by(ibis.desc('total_revenue'))
)

# Execute and get pandas DataFrame
result.execute()

# Complex Aggregations
import ibis
from ibis import _
con = ibis.duckdb.connect()

# Sample e-commerce data, scanned straight from CSV via DuckDB.
orders = con.read_csv('orders.csv')

analysis = (
    orders
    .filter(_.order_date >= '2024-01-01')
    .mutate(
        month=_.order_date.month(),
        revenue=_.quantity * _.unit_price,
        is_bulk=_.quantity > 100,
    )
    .group_by(['month', 'category', 'is_bulk'])
    .agg(
        order_count=_.count(),
        total_revenue=_.revenue.sum(),
        avg_order_value=_.revenue.mean(),
        unique_customers=_.customer_id.nunique(),
    )
    # Post-aggregation filter: compiles to a SQL HAVING clause.
    .filter(_.total_revenue > 10000)
    .order_by([_.month, ibis.desc(_.total_revenue)])
)

# View the generated SQL
print(analysis.compile())

# Execute
result = analysis.execute()

# Window Functions
import ibis
from ibis import _
con = ibis.duckdb.connect()
sales = con.table('daily_sales')

# Calculate running totals, per-category rankings, and share-of-category.
result = (
    sales
    .mutate(
        # Cumulative sum ordered by date: unbounded preceding -> current row.
        running_total=_.amount.sum().over(
            ibis.window(order_by=_.date, rows=(None, 0))
        ),
        rank=ibis.rank().over(
            ibis.window(partition_by=_.category, order_by=ibis.desc(_.amount))
        ),
        pct_of_category=_.amount / _.amount.sum().over(
            ibis.window(partition_by=_.category)
        ),
    )
    # ibis.rank() is 0-based (unlike SQL's 1-based RANK()), so a strict
    # "< 10" keeps exactly the top 10 rows per category.
    .filter(_.rank < 10)
)
result.execute()

# Joins Across Databases
import ibis
# Connect to different sources
duckdb_con = ibis.duckdb.connect('warehouse.ddb')
sqlite_con = ibis.sqlite.connect('operational.db')

# Get tables from different databases
customers = duckdb_con.table('customers')
orders = sqlite_con.table('orders')

# Join works across connections (data moved to DuckDB)
result = (
    orders
    .join(customers, orders.customer_id == customers.id)
    .group_by(customers.country)
    .agg(
        total_orders=orders.count(),
        total_revenue=orders.amount.sum(),
    )
)
result.execute()

# Combining Narwhals and Ibis
# Full ETL Pipeline
import ibis
import narwhals as nw
import polars as pl
def etl_pipeline(source_path, dest_db, use_polars=True):
    """Extract with Ibis, transform with Narwhals, load with Ibis.

    Parameters
    ----------
    source_path : str
        Path to the source Parquet file.
    dest_db : str
        Connection string/path for the destination database.
    use_polars : bool
        When True, run the transform step on a Polars frame;
        otherwise stay in pandas.
    """
    # EXTRACT: pull the Parquet file into memory through DuckDB.
    con = ibis.duckdb.connect()
    raw_data = con.read_parquet(source_path).execute()
    if use_polars:
        raw_data = pl.from_pandas(raw_data)

    # TRANSFORM: backend-portable logic expressed via Narwhals.
    def transform(frame):
        wrapped = nw.from_native(frame)
        monthly = (
            wrapped
            .filter(nw.col('status') == 'active')
            .with_columns([
                nw.col('created_at').dt.year().alias('year'),
                nw.col('created_at').dt.month().alias('month'),
                (nw.col('revenue') - nw.col('cost')).alias('profit')
            ])
            .group_by(['year', 'month', 'category'])
            .agg([
                nw.col('profit').sum().alias('total_profit'),
                nw.col('revenue').sum().alias('total_revenue'),
                nw.len().alias('transaction_count')
            ])
            .filter(nw.col('total_profit') > 0)
        )
        return monthly.to_native()

    transformed = transform(raw_data)

    # LOAD: write the monthly summary back out with Ibis.
    dest_con = ibis.connect(dest_db)
    dest_con.create_table('monthly_summary', transformed, overwrite=True)
    return f"Pipeline complete. Processed {len(transformed)} rows."
# Run with Polars backend
result = etl_pipeline('raw_data.parquet', 'analytics.ddb', use_polars=True)
print(result)

# Testing Across Backends
import narwhals as nw
import pandas as pd
import polars as pl
import pytest
def business_logic(df):
    """Filter, aggregate per category, and sort — works with any backend.

    Amounts of 100 or less are dropped, the rest are summed and averaged
    per category, and categories are ordered by descending total. The
    result is returned as the caller's native frame type.
    """
    frame = nw.from_native(df)
    large_amounts = frame.filter(nw.col('amount') > 100)
    per_category = large_amounts.group_by('category').agg(
        [
            nw.col('amount').sum().alias('total'),
            nw.col('amount').mean().alias('average'),
        ]
    )
    return nw.to_native(per_category.sort('total', descending=True))
# Test data
data = {
    'category': ['A', 'B', 'A', 'C', 'B'],
    'amount': [150, 200, 120, 90, 180]
}

# Test with Pandas
pandas_result = business_logic(pd.DataFrame(data))
assert isinstance(pandas_result, pd.DataFrame)

# Test with Polars
polars_result = business_logic(pl.DataFrame(data))
assert isinstance(polars_result, pl.DataFrame)

# Results should be equivalent
pd.testing.assert_frame_equal(
    pandas_result,
    polars_result.to_pandas()
)
print("✓ Tests passed across all backends!")

# Performance Comparison
import narwhals as nw
import pandas as pd
import polars as pl
import time
# Build a one-million-row dataset shared by both backends.
n = 1_000_000
data = {
    'id': range(n),
    'value': [i * 2 for i in range(n)],
    'category': ['A', 'B', 'C', 'D'] * (n // 4),
}
pandas_df = pd.DataFrame(data)
polars_df = pl.DataFrame(data)
def benchmark_operation(df, backend_name):
    """Time a filter + group-by aggregation executed through Narwhals.

    Parameters
    ----------
    df : native DataFrame (pandas or Polars)
        Frame with 'value' and 'category' columns.
    backend_name : str
        Label printed next to the elapsed time.

    Returns the aggregated result in the input's native frame type.
    """
    # time.perf_counter() is monotonic and high-resolution, unlike
    # time.time(), so it is the correct clock for elapsed-time benchmarks.
    start = time.perf_counter()
    df = nw.from_native(df)
    result = (
        df
        .filter(nw.col('value') > 500000)
        .group_by('category')
        .agg([
            nw.col('value').sum().alias('total'),
            nw.col('value').mean().alias('average')
        ])
    )
    result = nw.to_native(result)
    elapsed = time.perf_counter() - start
    print(f"{backend_name}: {elapsed:.3f} seconds")
    return result
print("Benchmark: Filter and aggregate 1M rows")
pandas_result = benchmark_operation(pandas_df, "Pandas")
polars_result = benchmark_operation(polars_df, "Polars")

# Narwhals adds minimal overhead, while Polars itself is significantly
# faster than pandas for most operations. The abstraction layer does not
# prevent you from getting the performance benefits of modern backends.