Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | """A dumb and slow but simple dbm clone. |
2 | ||
3 | For database spam, spam.dir contains the index (a text file), | |
4 | spam.bak *may* contain a backup of the index (also a text file), | |
5 | while spam.dat contains the data (a binary file). | |
6 | ||
7 | XXX TO DO: | |
8 | ||
9 | - seems to contain a bug when updating... | |
10 | ||
11 | - reclaim free space (currently, space once occupied by deleted or expanded | |
12 | items is never reused) | |
13 | ||
14 | - support concurrent access (currently, if two processes take turns making | |
15 | updates, they can mess up the index) | |
16 | ||
17 | - support efficient access to large databases (currently, the whole index | |
18 | is read when the database is opened, and some updates rewrite the whole index) | |
19 | ||
20 | - support opening for read-only (flag = 'm') | |
21 | ||
22 | """ | |
23 | ||
24 | import os as _os | |
25 | import __builtin__ | |
26 | import UserDict | |
27 | ||
# Keep a reference to the builtin open(): this module defines its own
# open() function, which shadows the builtin at module level.
_open = __builtin__.open

# Each value stored in the data file begins at a byte offset that is a
# multiple of _BLOCKSIZE.
_BLOCKSIZE = 512

error = IOError                         # For anydbm
33 | ||
class _Database(UserDict.DictMixin):
    """Dictionary-like database stored in three files on disk.

    For a base name "spam" the files are spam.dir (a text index holding
    one "%r, %r" line per key), spam.dat (the binary values, each one
    starting at a _BLOCKSIZE-aligned offset) and spam.bak (the previous
    index, produced by _commit()).  Keys and values must both be plain
    str objects.
    """

    # The on-disk directory and data files can remain in mutually
    # inconsistent states for an arbitrarily long time (see comments
    # at the end of __setitem__).  This is only repaired when _commit()
    # gets called.  One place _commit() gets called is from __del__(),
    # and if that occurs at program shutdown time, module globals may
    # already have gotten rebound to None.  Since it's crucial that
    # _commit() finish successfully, we can't ignore shutdown races
    # here, and _commit() must not reference any globals.
    _os = _os       # for _commit()
    _open = _open   # for _commit()

    def __init__(self, filebasename, mode):
        """Open (creating it if needed) the database named filebasename.

        mode is the permission mode given to files this object creates.
        Raises IOError if the data file can neither be opened nor
        created.
        """
        # Files are created with the builtin open() and chmod()ed
        # afterwards.  chmod() does not consult the umask, so the umask
        # bits are stripped from the requested mode here.  The only way
        # to read the umask is to set it and restore it immediately.
        if hasattr(_os, 'umask'):
            um = _os.umask(0)
            _os.umask(um)
            mode = mode & ~um
        self._mode = mode

        # The directory file is a text file.  Each line looks like
        #    "%r, (%d, %d)\n" % (key, pos, siz)
        # where key is the string key, pos is the offset into the dat
        # file of the associated value's first byte, and siz is the number
        # of bytes in the associated value.
        self._dirfile = filebasename + _os.extsep + 'dir'

        # The data file is a binary file pointed into by the directory
        # file, and holds the values associated with keys.  Each value
        # begins at a _BLOCKSIZE-aligned byte offset, and is a raw
        # binary 8-bit string value.
        self._datfile = filebasename + _os.extsep + 'dat'
        self._bakfile = filebasename + _os.extsep + 'bak'

        # The index is an in-memory dict, mirroring the directory file.
        self._index = None  # maps keys to (pos, siz) pairs

        # Mod by Jack: create data file if needed
        try:
            f = _open(self._datfile, 'r')
        except IOError:
            # The third argument of the builtin open() is a buffer size,
            # not a permission mode, so the mode is applied separately.
            f = _open(self._datfile, 'w')
            try:
                self._chmod(self._datfile)
            finally:
                f.close()
        else:
            f.close()
        self._update()

    # Give path the permission bits requested when the database was
    # opened.  Called from _commit(), so it must not reference globals.
    def _chmod(self, path):
        if hasattr(self._os, 'chmod'):
            self._os.chmod(path, self._mode)

    # Read directory file into the in-memory index dict.
    def _update(self):
        self._index = {}
        try:
            f = _open(self._dirfile)
        except IOError:
            # No directory file yet: the database starts out empty.
            return
        try:
            for line in f:
                line = line.rstrip()
                # NOTE(security): eval() executes whatever the directory
                # file contains, so a database from an untrusted source
                # can run arbitrary code when it is opened.
                key, pos_and_siz_pair = eval(line)
                self._index[key] = pos_and_siz_pair
        finally:
            f.close()

    # Write the index dict to the directory file.  The original directory
    # file (if any) is renamed with a .bak extension first.  If a .bak
    # file currently exists, it's deleted.
    def _commit(self):
        # CAUTION:  It's vital that _commit() succeed, and _commit() can
        # be called from __del__().  Therefore we must never reference a
        # global in this routine.
        if self._index is None:
            return  # nothing to do

        try:
            self._os.unlink(self._bakfile)
        except self._os.error:
            pass

        try:
            self._os.rename(self._dirfile, self._bakfile)
        except self._os.error:
            pass

        f = self._open(self._dirfile, 'w')
        try:
            self._chmod(self._dirfile)
            for key, pos_and_siz_pair in self._index.iteritems():
                f.write("%r, %r\n" % (key, pos_and_siz_pair))
        finally:
            f.close()

    sync = _commit

    def __getitem__(self, key):
        """Return the value stored under key; raise KeyError if absent."""
        pos, siz = self._index[key]     # may raise KeyError
        f = _open(self._datfile, 'rb')
        try:
            f.seek(pos)
            return f.read(siz)
        finally:
            f.close()

    # Append val to the data file, starting at a _BLOCKSIZE-aligned
    # offset.  The data file is first padded with NUL bytes (if needed)
    # to get to an aligned offset.  Return pair
    #     (starting offset of val, len(val))
    def _addval(self, val):
        f = _open(self._datfile, 'rb+')
        try:
            f.seek(0, 2)
            pos = int(f.tell())
            npos = ((pos + _BLOCKSIZE - 1) // _BLOCKSIZE) * _BLOCKSIZE
            f.write('\0'*(npos-pos))
            f.write(val)
        finally:
            f.close()
        return (npos, len(val))

    # Write val to the data file, starting at offset pos.  The caller
    # is responsible for ensuring that there's enough room starting at
    # pos to hold val, without overwriting some other value.  Return
    # pair (pos, len(val)).
    def _setval(self, pos, val):
        f = _open(self._datfile, 'rb+')
        try:
            f.seek(pos)
            f.write(val)
        finally:
            f.close()
        return (pos, len(val))

    # key is a new key whose associated value starts in the data file
    # at offset pos and with length siz.  Add an index record to
    # the in-memory index dict, and append one to the directory file.
    def _addkey(self, key, pos_and_siz_pair):
        self._index[key] = pos_and_siz_pair
        f = _open(self._dirfile, 'a')
        try:
            self._chmod(self._dirfile)
            f.write("%r, %r\n" % (key, pos_and_siz_pair))
        finally:
            f.close()

    def __setitem__(self, key, val):
        """Store val under key; both must be plain str, else TypeError."""
        if not type(key) == type('') == type(val):
            raise TypeError("keys and values must be strings")
        if key not in self._index:
            self._addkey(key, self._addval(val))
        else:
            # See whether the new value is small enough to fit in the
            # (padded) space currently occupied by the old value.
            pos, siz = self._index[key]
            oldblocks = (siz + _BLOCKSIZE - 1) // _BLOCKSIZE
            newblocks = (len(val) + _BLOCKSIZE - 1) // _BLOCKSIZE
            if newblocks <= oldblocks:
                self._index[key] = self._setval(pos, val)
            else:
                # The new value doesn't fit in the (padded) space used
                # by the old value.  The blocks used by the old value are
                # forever lost.
                self._index[key] = self._addval(val)

            # Note that _index may be out of synch with the directory
            # file now:  _setval() and _addval() don't update the directory
            # file.  This also means that the on-disk directory and data
            # files are in a mutually inconsistent state, and they'll
            # remain that way until _commit() is called.  Note that this
            # is a disaster (for the database) if the program crashes
            # (so that _commit() never gets called).

    def __delitem__(self, key):
        """Remove key from the index; raise KeyError if it is absent."""
        # The blocks used by the associated value are lost.
        del self._index[key]
        # XXX It's unclear why we do a _commit() here (the code always
        # XXX has, so I'm not changing it).  __setitem__ doesn't try to
        # XXX keep the directory file in synch.  Why should we?  Or
        # XXX why shouldn't __setitem__?
        self._commit()

    def keys(self):
        return self._index.keys()

    def has_key(self, key):
        return key in self._index

    def __contains__(self, key):
        return key in self._index

    def iterkeys(self):
        return self._index.iterkeys()
    __iter__ = iterkeys

    def __len__(self):
        return len(self._index)

    def close(self):
        """Write the index to disk and drop all references to the files."""
        self._commit()
        # With _index set to None a later _commit() (for example the one
        # triggered by __del__) returns without touching the disk.
        self._index = self._datfile = self._dirfile = self._bakfile = None

    __del__ = close
216 | ||
217 | ||
218 | ||
219 | def open(file, flag=None, mode=0666): | |
220 | """Open the database file, filename, and return corresponding object. | |
221 | ||
222 | The flag argument, used to control how the database is opened in the | |
223 | other DBM implementations, is ignored in the dumbdbm module; the | |
224 | database is always opened for update, and will be created if it does | |
225 | not exist. | |
226 | ||
227 | The optional mode argument is the UNIX mode of the file, used only when | |
228 | the database has to be created. It defaults to octal code 0666 (and | |
229 | will be modified by the prevailing umask). | |
230 | ||
231 | """ | |
232 | # flag argument is currently ignored | |
233 | return _Database(file, mode) |