Leveraging External Query Engines
Leaving Python land
DuckDB
-- saved as gga_hex.sql
-- Install and load the delta extension
INSTALL delta;
LOAD delta;
-- MP's data is in a public bucket, create
-- simple secret w/ the correct region
CREATE SECRET (
TYPE s3,
REGION 'us-east-1'
);
-- Create some views for ease of reference later
CREATE VIEW mp_tasks AS
SELECT * FROM delta_scan('s3://materialsproject-parsed/core/tasks/');
CREATE VIEW mp_dos AS
SELECT * FROM delta_scan('s3://materialsproject-parsed/core/electronic-structure/total-dos/');
-- Store all hexagonal GGA+U task_ids
SET VARIABLE gga_hex_task_ids = (
SELECT list(task_id)
FROM (
SELECT task_id,
run_type,
symmetry.crystal_system AS crystal_system
FROM mp_tasks
WHERE run_type = 'GGA+U'
AND crystal_system = 'Hexagonal'
) AS hex_gga
);
-- Query DOS table, store results in local parquet file
COPY (
SELECT *
FROM mp_dos
WHERE identifier IN getvariable('gga_hex_task_ids')
) TO 'gga_hex_dos.parquet.zstd' (FORMAT parquet, COMPRESSION zstd);More Options
Last updated
Was this helpful?