import duckdb
Example of Jupyter “magic command”:
%pwd
'/Users/gjanee-local/Desktop/meds/bren-meds213-spring-2024-class-data/week3'
To install DuckDB Python module:
# %pip install duckdb
- Create a connection and a cursor
= duckdb.connect("database.db") conn
conn
<duckdb.duckdb.DuckDBPyConnection at 0x1040abb70>
= conn.cursor() cur
Now let’s do something with our cursor
"SELECT * FROM Site LIMIT 5") cur.execute(
<duckdb.duckdb.DuckDBPyConnection at 0x10406b6b0>
Now we want results… three ways of getting them. 1. All results at once
cur.fetchall()
[('barr',
'Barrow',
'Alaska, USA',
71.30000305175781,
-156.60000610351562,
220.39999389648438),
('burn',
'Burntpoint Creek',
'Ontario, Canada',
55.20000076293945,
-84.30000305175781,
63.0),
('bylo',
'Bylot Island',
'Nunavut, Canada',
73.19999694824219,
-80.0,
723.5999755859375),
('cakr',
'Cape Krusenstern',
'Alaska, USA',
67.0999984741211,
-163.5,
54.099998474121094),
('cari',
'Canning River Delta',
'Alaska, USA',
70.0999984741211,
-145.8000030517578,
722.0)]
Cursors don’t store anything, they just transfer queries to the database and get results back.
cur.fetchall()
[]
Always get tuples, even if you only request one column
"SELECT Nest_ID FROM Bird_nests LIMIT 10") cur.execute(
<duckdb.duckdb.DuckDBPyConnection at 0x10406b6b0>
cur.fetchall()
[('14HPE1',),
('11eaba',),
('11eabaagc01',),
('11eabaagv01',),
('11eababbc02',),
('11eababsv01',),
('11eabaduh01',),
('11eabaduv01',),
('11eabarpc01',),
('11eabarpc02',)]
"SELECT Nest_ID FROM Bird_nests LIMIT 10")
cur.execute(0] for t in cur.fetchall()] [t[
['14HPE1',
'11eaba',
'11eabaagc01',
'11eabaagv01',
'11eababbc02',
'11eababsv01',
'11eabaduh01',
'11eabaduv01',
'11eabarpc01',
'11eabarpc02']
- Get the one result, or the next result
"SELECT COUNT(*) FROM Bird_nests")
cur.execute( cur.fetchall()
[(1547,)]
"SELECT COUNT(*) FROM Bird_nests")
cur.execute( cur.fetchone()
(1547,)
"SELECT COUNT(*) FROM Bird_nests")
cur.execute(0] cur.fetchone()[
1547
- Using an iterator - but DuckDB doesn’t support iterators :(
"SELECT Nest_ID FROM Bird_nests LIMIT 10")
cur.execute(for row in cur:
print(f"got {row[0]}")
TypeError: 'duckdb.duckdb.DuckDBPyConnection' object is not iterable
A workaround:
"SELECT Nest_ID FROM Bird_nests LIMIT 10")
cur.execute(while True:
= cur.fetchone()
row if row == None:
break
# do something with row
print(f"got nest ID {row[0]}")
got nest ID 14HPE1
got nest ID 11eaba
got nest ID 11eabaagc01
got nest ID 11eabaagv01
got nest ID 11eababbc02
got nest ID 11eababsv01
got nest ID 11eabaduh01
got nest ID 11eabaduv01
got nest ID 11eabarpc01
got nest ID 11eabarpc02
Can do things other than SELECT!
"CREATE TEMP TABLE temp_table AS
cur.execute( SELECT * FROM Bird_nests LIMIT 10")
SyntaxError: unterminated string literal (detected at line 1) (1747419494.py, line 1)
"""
cur.execute( CREATE TEMP TABLE temp_table AS
SELECT * FROM Bird_nests LIMIT 10
""")
<duckdb.duckdb.DuckDBPyConnection at 0x10406b6b0>
"SELECT * FROM temp_table") cur.execute(
<duckdb.duckdb.DuckDBPyConnection at 0x10406b6b0>
cur.fetchall()
[('b14.6',
2014,
'chur',
'14HPE1',
'sepl',
'vloverti',
datetime.date(2014, 6, 14),
None,
3,
None,
None),
('b11.7',
2011,
'eaba',
'11eaba',
'wrsa',
'bhill',
datetime.date(2011, 7, 10),
'searcher',
4,
None,
None),
('b11.6',
2011,
'eaba',
'11eabaagc01',
'amgp',
'dkessler',
datetime.date(2011, 6, 24),
'searcher',
4,
6.0,
'float'),
('b11.6',
2011,
'eaba',
'11eabaagv01',
'amgp',
'dkessler',
datetime.date(2011, 6, 25),
'searcher',
3,
3.0,
'float'),
('b11.6',
2011,
'eaba',
'11eababbc02',
'bbpl',
'dkessler',
datetime.date(2011, 6, 24),
'searcher',
4,
4.0,
'float'),
('b11.7',
2011,
'eaba',
'11eababsv01',
'wrsa',
'bhill',
datetime.date(2011, 7, 7),
'searcher',
4,
2.0,
'float'),
('b11.6',
2011,
'eaba',
'11eabaduh01',
'dunl',
'dkessler',
datetime.date(2011, 6, 28),
'searcher',
3,
2.0,
'float'),
('b11.6',
2011,
'eaba',
'11eabaduv01',
'dunl',
'dkessler',
datetime.date(2011, 6, 29),
'searcher',
4,
5.0,
'float'),
('b11.7',
2011,
'eaba',
'11eabarpc01',
'reph',
'bhill',
datetime.date(2011, 7, 8),
'searcher',
4,
4.0,
'float'),
('b11.7',
2011,
'eaba',
'11eabarpc02',
'reph',
'bhill',
datetime.date(2011, 7, 8),
'searcher',
3,
4.0,
'float')]
A note on fragility
For example: INSERT INTO Site VALUES (“abcd”, “Foo”, 35.7, 42.3, “?”)
A less fragile way of expressing the same thing: INSERT INTO Site (Code, Site_name, Latitude, Longitude, Something_else) VALUES (“abcd”, “Foo”, 35.7, 42.3, “?”)
In the same vein: SELECT * is fragile
"SELECT * FROM Site LIMIT 3")
cur.execute( cur.fetchall()
[('barr',
'Barrow',
'Alaska, USA',
71.30000305175781,
-156.60000610351562,
220.39999389648438),
('burn',
'Burntpoint Creek',
'Ontario, Canada',
55.20000076293945,
-84.30000305175781,
63.0),
('bylo',
'Bylot Island',
'Nunavut, Canada',
73.19999694824219,
-80.0,
723.5999755859375)]
A better, more robust way of coding the same thing:
"SELECT Site_name, Code, Latitude, Longitude FROM Site LIMIT 3")
cur.execute( cur.fetchall()
[('Barrow', 'barr', 71.30000305175781, -156.60000610351562),
('Burntpoint Creek', 'burn', 55.20000076293945, -84.30000305175781),
('Bylot Island', 'bylo', 73.19999694824219, -80.0)]
An extended example: Question we’re trying to answer: How many nests do we have for each species?
Approach: first get all species. Then execute a count query for each species.
A digression: string interpolation in Python
# The % method
= "My name is %s"
s print(s % "Greg")
= "My name is %s and the other teacher's name is %s"
s print(s % ("Greg", "Julien"))
# The new f-string method
= "Greg"
name print(f"My name is {name}")
# Third way
print("My name is {}".format("Greg"))
My name is Greg
My name is Greg and the other teacher's name is Julien
My name is Greg
My name is Greg
= """
query SELECT COUNT(*) FROM Bird_nests
WHERE Species = '%s'
"""
"SELECT Code FROM Species LIMIT 3")
cur.execute(for row in cur.fetchall(): # DuckDB workaround
= row[0]
code = query % code
prepared_query #print(prepared_query)
= conn.cursor()
cur2
cur2.execute(prepared_query)print(f"Species {code} has {cur2.fetchone()[0]} nests")
cur2.close()
Species agsq has 0 nests
Species amcr has 0 nests
Species amgp has 29 nests
The above Python interpolation is dangerous and has caused many database hacks! There’s a better way
= """
query SELECT COUNT(*) FROM Bird_nests
WHERE Species = ?
"""
"SELECT Code FROM Species LIMIT 3")
cur.execute(for row in cur.fetchall(): # DuckDB workaround
= row[0]
code # NOT NEEDED! prepared_query = query % code
#print(prepared_query)
= conn.cursor()
cur2 # <-- added argument here
cur2.execute(query, [code]) print(f"Species {code} has {cur2.fetchone()[0]} nests")
cur2.close()
Species agsq has 0 nests
Species amcr has 0 nests
Species amgp has 29 nests
Let’s illustrate the danger with a different example
= "TS"
abbrev = "Taylor Swift"
name """
cur.execute( INSERT INTO Personnel (Abbreviation, Name)
VALUES ('%s', '%s')
""" % (abbrev, name)
)
<duckdb.duckdb.DuckDBPyConnection at 0x10406b6b0>
"SELECT * FROM Personnel")
cur.execute(-3:] cur.fetchall()[
[('emagnuson', 'Emily Magnuson'),
('mcorrell', 'Maureen Correll'),
('TS', 'Taylor Swift')]
= "CO"
abbrev = "Conan O'Brien"
name """
cur.execute( INSERT INTO Personnel (Abbreviation, Name)
VALUES ('%s', '%s')
""" % (abbrev, name)
)
ParserException: Parser Error: syntax error at or near "Brien"
"""
INSERT INTO Personnel (Abbreviation, Name)
VALUES ('%s', '%s')
""" % (abbrev, name)
"\n INSERT INTO Personnel (Abbreviation, Name)\n VALUES ('CO', 'Conan O'Brien')\n "
= "CO"
abbrev = "Conan O'Brien"
name """
cur.execute( INSERT INTO Personnel (Abbreviation, Name)
VALUES (?, ?)
""",
[abbrev, name])
<duckdb.duckdb.DuckDBPyConnection at 0x10406b6b0>
"SELECT * FROM Personnel")
cur.execute(-3:] cur.fetchall()[
[('mcorrell', 'Maureen Correll'),
('TS', 'Taylor Swift'),
('CO', "Conan O'Brien")]