Python databases instead of small files

Updated: 2025-01-04
Created: 2025-01

Small files vs. databases

Three main types: key-value, relational, tabular

References

Example of dbm.gnu

# No error handling...
import re
import dbm.gnu

cols_re = re.compile('([-a-z0-9]+)[ \t	]+([0-9]+).*')

dbname = 'database.gdbm'
commint = 19

with dbm.gnu.open(dbname,'nf') as d:
    t = 0
    with open('/etc/protocols','r') as p:
        for line in p:
            cols_ma = cols_re.match(line)
            if cols_ma is not None:
                pname = cols_ma.group(1)
                pnumber = cols_ma.group(2)
                d[pname.encode()] = pnumber.encode()
                t = t+1
                if t == commint:
                    d.sync()
                    t = 0
    if t > 0:
        d.sync()

with dbm.gnu.open(dbname,'ru') as d:
    pname = d.firstkey()
    while pname is not None:
        pnumber = d[pname]
        print("{}: {}".format(pname.decode(),pnumber.decode()))
        pname = d.nextkey(pname)

Example of sqlite3

# No error handling...
import re
import sqlite3

cols_re = re.compile('([-a-z0-9]+)[ \t	]+([0-9]+).*')

dbname = 'database.sqlite3'
commint = 19

d = sqlite3.connect(dbname, autocommit=False)
c = d.cursor()

r = c.execute("""DROP TABLE IF EXISTS protocols""")
r = c.execute("""CREATE TABLE protocols(name,number)""")

t  = 0
with open('/etc/protocols','r') as p:
    for line in p:
        cols_ma = cols_re.match(line)
        if cols_ma is not None:
            pname = cols_ma.group(1)
            pnumber = cols_ma.group(2)
            r = c.execute("""INSERT INTO protocols VALUES (?,?)""",(pname,pnumber))
            t = t+1
            if t == commint:
                r = d.commit()
                t = 0
if t > 0:
    r = d.commit()

c.close()
d.close()

d = sqlite3.connect(dbname, autocommit=False)
c = d.cursor()

r = c.execute("""SELECT p.name,p.number FROM protocols AS p ORDER BY p.name""")
for (pname,pnumber) in r:
    print("{}: {}".format(pname,pnumber))

c.close()
d.close()

Example of sqlite-utils

# No error handling...
DBNAME='database.sqlu'
DTABLE='protocols'

sqlite-utils create-database "$DBNAME"

(
  echo 'name'"$TAB"'number'

  grep -v -E '^$|^#' /etc/protocols \
    | sed 's/'"$TAB"'\+\|  \+/'"$TAB"'/g' | cut -d"$TAB" -f 1,2 \
) \
| sqlite-utils insert "$DBNAME" "$DTABLE" --tsv -

sqlite-utils rows "$DBNAME" "$DTABLE" --tsv

Example of h5py #1

# No error handling...
import re
import h5py

dbname = 'database.h5'
dsname = 'protocols'
cols_re = re.compile('([-a-z0-9]+)[ \t  ]+([0-9]+).*')

l = []
n = 0

with open('/etc/protocols','r') as p:
    for line in p:
        cols_ma = cols_re.match(line)
        if cols_ma is not None:
            pname = cols_ma.group(1)
            pnumber = cols_ma.group(2)
            l.append([pname.encode(),pnumber.encode()])
            n = n+1

with h5py.File(dbname,"w") as d:
    s = d.create_dataset(dsname,dtype=h5py.string_dtype(),shape=(n,2))
    s[:] = l
    d.flush()

with h5py.File(dbname,"r") as d:
    s = d[dsname]
    for (pname,pnumber) in s:
        print("{}: {}".format(pname.decode(),pnumber.decode()))

Example of h5py #2

# No error handling...
import re
import h5py

cols_re = re.compile('([-a-z0-9]+)[ \t	]+([0-9]+).*')
commint = 19

dbname = 'database_2.h5'
grname = 'protocols'

with h5py.File(dbname,"w") as d:
    g = d.create_group(grname)
    n = 0
    t = 0
    with open('/etc/protocols','r') as p:
        for line in p:
            cols_ma = cols_re.match(line)
            if cols_ma is not None:
                s = g.create_dataset("line{:08}".format(n),dtype=h5py.string_dtype(),shape=(2,))
                pname = cols_ma.group(1)
                pnumber = cols_ma.group(2)
                s[:] = [pname.encode(),pnumber.encode()]
                n = n+1
                t = t+1
                if t == commint:
                    d.flush()
                    t = 0  
    if t > 0:
        d.flush()

with h5py.File(dbname,"r") as d:
    g = d[grname]
    for dsname in g:
        (pname,pnumber) = g[dsname]
        print("{}/{} -> {}: {}".format(grname,dsname, pname.decode(),pnumber.decode()))

Sizes of databases