Updated: 2025-01-04
Created: 2025-01
I have a little script, the job of which is to create a lot of very small files (~1 million files, typically ~50-100bytes each). [ ... ] It's a bit of a one-off (or twice, maybe) script, and currently due to finish in about 15 hours. [...] Would rather take the chance to maybe learn something useful about tuning [the filesystem]...
First, I have appended two little Perl scripts (each rather small), one creates a Berkeley DB database of K records of random length varying between I and J bytes, the second does N accesses at random in that database. [...]
---------------------------------------------------------------- $ time perl megamake.pl /var/tmp/db 1000000 50 100 real 6m28.947s user 0m35.860s sys 0m45.530s ---------------------------------------------------------------- $ ls -sd /var/tmp/db* 130604 /var/tmp/db ----------------------------------------------------------------
dbm, sqlite3 (also shelve which is a type
of dbm), h5py.# No error handling... import re import dbm.gnu cols_re = re.compile('([-a-z0-9]+)[ \t ]+([0-9]+).*') dbname = 'database.gdbm' commint = 19 with dbm.gnu.open(dbname,'nf') as d: t = 0 with open('/etc/protocols','r') as p: for line in p: cols_ma = cols_re.match(line) if cols_ma is not None: pname = cols_ma.group(1) pnumber = cols_ma.group(2) d[pname.encode()] = pnumber.encode() t = t+1 if t == commint: d.sync() t = 0 if t > 0: d.sync() with dbm.gnu.open(dbname,'ru') as d: pname = d.firstkey() while pname is not None: pnumber = d[pname] print("{}: {}".format(pname.decode(),pnumber.decode())) pname = d.nextkey(pname)
# No error handling...
#
# Load (name, number) rows from /etc/protocols into an SQLite table,
# committing in small batches, then reopen the database and print the rows
# ordered by protocol name.
import re
import sqlite3

# A protocol name, a run of blanks/tabs, then the decimal protocol number.
cols_re = re.compile('([-a-z0-9]+)[ \t ]+([0-9]+).*')
dbname = 'database.sqlite3'
# Number of inserted rows between explicit commits.
commint = 19

# autocommit=False: transactions stay open until commit() is called
# explicitly (the keyword is accepted by sqlite3.connect() in Python 3.12+).
conn = sqlite3.connect(dbname, autocommit=False)
cur = conn.cursor()
cur.execute("""DROP TABLE IF EXISTS protocols""")
cur.execute("""CREATE TABLE protocols(name,number)""")

pending = 0
with open('/etc/protocols', 'r') as src:
    for line in src:
        found = cols_re.match(line)
        if found is None:
            # Comment, blank or otherwise non-matching line.
            continue
        name, number = found.group(1, 2)
        cur.execute("""INSERT INTO protocols VALUES (?,?)""", (name, number))
        pending += 1
        if pending == commint:
            conn.commit()
            pending = 0

# Commit whatever is left of the last, partial batch.
if pending > 0:
    conn.commit()
cur.close()
conn.close()

# Second pass: reopen the file and list the stored rows.
conn = sqlite3.connect(dbname, autocommit=False)
cur = conn.cursor()
rows = cur.execute("""SELECT p.name,p.number FROM protocols AS p ORDER BY p.name""")
for name, number in rows:
    print(f"{name}: {number}")
cur.close()
conn.close()
# No error handling...
#
# Load the first two columns of /etc/protocols into an SQLite table using
# the sqlite-utils command line tool, then dump the table back as TSV.

DBNAME='database.sqlu'
DTABLE='protocols'

# The pipeline below relies on $TAB holding one literal tab character.
# Define it here unless the caller has already set it, so that the snippet
# works standalone (an empty $TAB breaks both the header and 'cut -d').
: "${TAB:=$(printf '\t')}"

sqlite-utils create-database "$DBNAME"

# Emit a TSV header line followed by the data lines:
#   grep  drops empty lines and '#' comment lines;
#   sed   collapses each run of tabs or spaces into a single tab;
#   cut   keeps only the first two tab-separated fields (name, number).
(
  echo 'name'"$TAB"'number'
  grep -v -E '^$|^#' /etc/protocols \
  | sed 's/'"$TAB"'\+\| \+/'"$TAB"'/g' | cut -d"$TAB" -f 1,2 \
) \
| sqlite-utils insert "$DBNAME" "$DTABLE" --tsv -

sqlite-utils rows "$DBNAME" "$DTABLE" --tsv
# No error handling...
#
# Collect (name, number) pairs from /etc/protocols in memory, store them as
# one two-column HDF5 dataset of variable-length strings, then reopen the
# file and print every row.
import re
import h5py

dbname = 'database.h5'
dsname = 'protocols'
# A protocol name, a run of blanks/tabs, then the decimal protocol number.
cols_re = re.compile('([-a-z0-9]+)[ \t ]+([0-9]+).*')

# The dataset is created with a fixed shape, so all rows are gathered
# first and the row count is taken from the collected list.
rows = []
with open('/etc/protocols', 'r') as src:
    for line in src:
        found = cols_re.match(line)
        if found is None:
            # Comment, blank or otherwise non-matching line.
            continue
        name, number = found.group(1, 2)
        rows.append([name.encode(), number.encode()])
count = len(rows)

with h5py.File(dbname, "w") as h5:
    table = h5.create_dataset(dsname, dtype=h5py.string_dtype(), shape=(count, 2))
    # Write all rows in a single assignment.
    table[:] = rows
    h5.flush()

with h5py.File(dbname, "r") as h5:
    table = h5[dsname]
    for name, number in table:
        print(f"{name.decode()}: {number.decode()}")
# No error handling...
#
# Variant that stores each /etc/protocols record as its own two-element
# HDF5 dataset inside one group, flushing the file in small batches, then
# reopens the file and prints every record.
import re
import h5py

# A protocol name, a run of blanks/tabs, then the decimal protocol number.
cols_re = re.compile('([-a-z0-9]+)[ \t ]+([0-9]+).*')
# Number of records written between explicit flush() calls.
commint = 19
dbname = 'database_2.h5'
grname = 'protocols'

with h5py.File(dbname, "w") as h5:
    group = h5.create_group(grname)
    index = 0      # running record number, used to name the datasets
    pending = 0    # records written since the last flush
    with open('/etc/protocols', 'r') as src:
        for line in src:
            found = cols_re.match(line)
            if found is None:
                # Comment, blank or otherwise non-matching line.
                continue
            record = group.create_dataset(
                f"line{index:08}", dtype=h5py.string_dtype(), shape=(2,)
            )
            name, number = found.group(1, 2)
            record[:] = [name.encode(), number.encode()]
            index += 1
            pending += 1
            if pending == commint:
                h5.flush()
                pending = 0
    # Flush whatever is left of the last, partial batch.
    if pending > 0:
        h5.flush()

with h5py.File(dbname, "r") as h5:
    group = h5[grname]
    for dsname in group:
        name, number = group[dsname]
        print(f"{grname}/{dsname} -> {name.decode()}: {number.decode()}")
$ ls -ld /etc/protocols database* -rw-r--r-- 2 root root 7413 2007-09-08 17:43 /etc/protocols -rw-r--r-- 1 pg pg 16384 2025-02-05 13:28 database.gdbm -rw-r--r-- 1 pg pg 14720 2025-02-05 13:28 database.h5 -rw-r--r-- 1 pg pg 8192 2025-02-05 18:52 database.sqlite3 -rw-r--r-- 1 pg pg 16384 2025-02-05 18:48 database.sqlu -rw-r--r-- 1 pg pg 71168 2025-02-05 13:29 database_2.h5
$ h5dump database.h5 | head -n 15 HDF5 "database.h5" { GROUP "/" { DATASET "protocols" { DATATYPE H5T_STRING { STRSIZE H5T_VARIABLE; STRPAD H5T_STR_NULLTERM; CSET H5T_CSET_UTF8; CTYPE H5T_C_S1; } DATASPACE SIMPLE { ( 140, 2 ) / ( 140, 2 ) } DATA { (0,0): "ip", "0", (1,0): "hopopt", "0", (2,0): "icmp", "1", (3,0): "igmp", "2",
$ h5dump database_2.h5 | head -n 20 HDF5 "database_2.h5" { GROUP "/" { GROUP "protocols" { DATASET "line00000000" { DATATYPE H5T_STRING { STRSIZE H5T_VARIABLE; STRPAD H5T_STR_NULLTERM; CSET H5T_CSET_UTF8; CTYPE H5T_C_S1; } DATASPACE SIMPLE { ( 2 ) / ( 2 ) } DATA { (0): "ip", "0" } } DATASET "line00000001" { DATATYPE H5T_STRING { STRSIZE H5T_VARIABLE; STRPAD H5T_STR_NULLTERM; CSET H5T_CSET_UTF8;