Updated: 2025-01-04
Created: 2025-01
I have a little script, the job of which is to create a lot of very small files (~1 million files, typically ~50-100bytes each). [ ... ] It's a bit of a one-off (or twice, maybe) script, and currently due to finish in about 15 hours. [...] Would rather take the chance to maybe learn something useful about tuning [the filesystem]...
First, I have appended two little Perl scripts (each rather small), one creates a Berkeley DB database of K records of random length varying between I and J bytes, the second does N accesses at random in that database. [...]
----------------------------------------------------------------
$ time perl megamake.pl /var/tmp/db 1000000 50 100
real 6m28.947s
user 0m35.860s
sys 0m45.530s
----------------------------------------------------------------
$ ls -sd /var/tmp/db*
130604 /var/tmp/db
----------------------------------------------------------------
dbm, sqlite3 (also shelve, which is a type of dbm), h5py.
# No error handling...
import re
import dbm.gnu

# A usable line starts with a protocol name, then whitespace, then its
# number; anything after the number is ignored.
cols_re = re.compile('([-a-z0-9]+)[ \t ]+([0-9]+).*')
dbname = 'database.gdbm'
commint = 19  # number of stored records between explicit sync() calls

# Load phase: store name -> number for every matching line, syncing the
# database to disk after each full batch of `commint` records.
with dbm.gnu.open(dbname, 'nf') as d:
    t = 0  # records stored since the last sync
    with open('/etc/protocols', 'r') as p:
        for line in p:
            cols_ma = cols_re.match(line)
            if cols_ma is None:
                # Line does not begin with a name/number pair; skip it.
                continue
            pname, pnumber = cols_ma.groups()
            d[pname.encode()] = pnumber.encode()
            t += 1
            if t == commint:
                d.sync()
                t = 0
    # Sync whatever is left of the last, partially filled batch.
    if t > 0:
        d.sync()

# Dump phase: walk the keys with firstkey()/nextkey() and print each record.
with dbm.gnu.open(dbname, 'ru') as d:
    key = d.firstkey()
    while key is not None:
        value = d[key]
        print("{}: {}".format(key.decode(), value.decode()))
        key = d.nextkey(key)
# No error handling...
import re
import sqlite3
from contextlib import closing

# A usable line starts with a protocol name, then whitespace, then its
# number; anything after the number is ignored.
cols_re = re.compile('([-a-z0-9]+)[ \t ]+([0-9]+).*')
dbname = 'database.sqlite3'
commint = 19  # number of inserted rows between commits

# Load phase: (re)create the table and insert one row per matching line,
# committing after each full batch of `commint` rows.
# closing() guarantees the cursor and then the connection are closed even if
# something raises part-way through.
with (
    closing(sqlite3.connect(dbname, autocommit=False)) as d,
    closing(d.cursor()) as c,
):
    c.execute("""DROP TABLE IF EXISTS protocols""")
    c.execute("""CREATE TABLE protocols(name,number)""")
    # With autocommit=False the DDL above is part of the open transaction.
    # Commit it now so the table exists for the dump phase even when no
    # input line matches (otherwise close() would roll it back).
    d.commit()
    t = 0  # rows inserted since the last commit
    with open('/etc/protocols', 'r') as p:
        for line in p:
            cols_ma = cols_re.match(line)
            if cols_ma is None:
                # Line does not begin with a name/number pair; skip it.
                continue
            pname, pnumber = cols_ma.groups()
            c.execute("""INSERT INTO protocols VALUES (?,?)""", (pname, pnumber))
            t += 1
            if t == commint:
                d.commit()
                t = 0
    # Commit whatever is left of the last, partially filled batch.
    if t > 0:
        d.commit()

# Dump phase: reopen the database and print every row ordered by name.
with (
    closing(sqlite3.connect(dbname, autocommit=False)) as d,
    closing(d.cursor()) as c,
):
    r = c.execute("""SELECT p.name,p.number FROM protocols AS p ORDER BY p.name""")
    for (pname, pnumber) in r:
        print("{}: {}".format(pname, pnumber))
# No error handling...
# Load the name/number columns of /etc/protocols into an SQLite database
# with sqlite-utils, then print the resulting table as TSV.
DBNAME='database.sqlu'
DTABLE='protocols'
# TAB is the field separator used below but is not set anywhere in this
# snippet; default it to a literal tab unless the caller already defined it.
TAB="${TAB:-$(printf '\t')}"

sqlite-utils create-database "$DBNAME"
(
  # Header row naming the two columns.
  echo 'name'"$TAB"'number'
  # Drop empty and comment lines, turn each run of tabs or of spaces into a
  # single tab, and keep only the first two fields.
  grep -v -E '^$|^#' /etc/protocols \
  | sed 's/'"$TAB"'\+\| \+/'"$TAB"'/g' | cut -d"$TAB" -f 1,2
) \
| sqlite-utils insert "$DBNAME" "$DTABLE" --tsv -

sqlite-utils rows "$DBNAME" "$DTABLE" --tsv
# No error handling...
import re
import h5py

dbname = 'database.h5'
dsname = 'protocols'
# A usable line starts with a protocol name, then whitespace, then its
# number; anything after the number is ignored.
cols_re = re.compile('([-a-z0-9]+)[ \t ]+([0-9]+).*')

# Collect every [name, number] pair in memory first, because the dataset is
# created below with a fixed shape that needs the record count up front.
with open('/etc/protocols', 'r') as p:
    matches = (cols_re.match(line) for line in p)
    pairs = [
        [m.group(1).encode(), m.group(2).encode()]
        for m in matches
        if m is not None
    ]
count = len(pairs)

# Write phase: one (count, 2) string dataset holding all records.
with h5py.File(dbname, "w") as d:
    s = d.create_dataset(dsname, dtype=h5py.string_dtype(), shape=(count, 2))
    s[:] = pairs
    d.flush()

# Dump phase: each dataset row unpacks into the name and number fields.
with h5py.File(dbname, "r") as d:
    for (pname, pnumber) in d[dsname]:
        print("{}: {}".format(pname.decode(), pnumber.decode()))
# No error handling...
import re
import h5py

# A usable line starts with a protocol name, then whitespace, then its
# number; anything after the number is ignored.
cols_re = re.compile('([-a-z0-9]+)[ \t ]+([0-9]+).*')
commint = 19  # number of created datasets between explicit flush() calls
dbname = 'database_2.h5'
grname = 'protocols'

# Write phase: one two-element string dataset per matching line, all inside
# a single group, flushing the file after each full batch of `commint`.
with h5py.File(dbname, "w") as d:
    g = d.create_group(grname)
    n = 0  # running record index, used to name the datasets
    t = 0  # datasets created since the last flush
    with open('/etc/protocols', 'r') as p:
        for line in p:
            cols_ma = cols_re.match(line)
            if cols_ma is None:
                # Line does not begin with a name/number pair; skip it.
                continue
            s = g.create_dataset(
                "line{:08}".format(n),
                dtype=h5py.string_dtype(),
                shape=(2,),
            )
            # The pattern has exactly two groups: the name and the number.
            s[:] = [field.encode() for field in cols_ma.groups()]
            n += 1
            t += 1
            if t == commint:
                d.flush()
                t = 0
    # Flush whatever is left of the last, partially filled batch.
    if t > 0:
        d.flush()

# Dump phase: iterate the dataset names in the group and print each record.
with h5py.File(dbname, "r") as d:
    g = d[grname]
    for dsname in g:
        (pname, pnumber) = g[dsname]
        print("{}/{} -> {}: {}".format(grname, dsname, pname.decode(), pnumber.decode()))
$ ls -ld /etc/protocols database*
-rw-r--r-- 2 root root 7413 2007-09-08 17:43 /etc/protocols
-rw-r--r-- 1 pg pg 16384 2025-02-05 13:28 database.gdbm
-rw-r--r-- 1 pg pg 14720 2025-02-05 13:28 database.h5
-rw-r--r-- 1 pg pg 8192 2025-02-05 18:52 database.sqlite3
-rw-r--r-- 1 pg pg 16384 2025-02-05 18:48 database.sqlu
-rw-r--r-- 1 pg pg 71168 2025-02-05 13:29 database_2.h5
$ h5dump database.h5 | head -n 15
HDF5 "database.h5" {
GROUP "/" {
DATASET "protocols" {
DATATYPE H5T_STRING {
STRSIZE H5T_VARIABLE;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_UTF8;
CTYPE H5T_C_S1;
}
DATASPACE SIMPLE { ( 140, 2 ) / ( 140, 2 ) }
DATA {
(0,0): "ip", "0",
(1,0): "hopopt", "0",
(2,0): "icmp", "1",
(3,0): "igmp", "2",
$ h5dump database_2.h5 | head -n 20
HDF5 "database_2.h5" {
GROUP "/" {
GROUP "protocols" {
DATASET "line00000000" {
DATATYPE H5T_STRING {
STRSIZE H5T_VARIABLE;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_UTF8;
CTYPE H5T_C_S1;
}
DATASPACE SIMPLE { ( 2 ) / ( 2 ) }
DATA {
(0): "ip", "0"
}
}
DATASET "line00000001" {
DATATYPE H5T_STRING {
STRSIZE H5T_VARIABLE;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_UTF8;