Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | """Strptime-related classes and functions. |
2 | ||
3 | CLASSES: | |
4 | LocaleTime -- Discovers and stores locale-specific time information | |
5 | TimeRE -- Creates regexes for pattern matching a string of text containing | |
6 | time information | |
7 | ||
8 | FUNCTIONS: | |
9 | _getlang -- Figure out what language is being used for the locale | |
10 | strptime -- Calculates the time struct represented by the passed-in string | |
11 | ||
12 | """ | |
13 | import time | |
14 | import locale | |
15 | import calendar | |
16 | from re import compile as re_compile | |
17 | from re import IGNORECASE | |
18 | from re import escape as re_escape | |
19 | from datetime import date as datetime_date | |
20 | try: | |
21 | from thread import allocate_lock as _thread_allocate_lock | |
22 | except: | |
23 | from dummy_thread import allocate_lock as _thread_allocate_lock | |
24 | ||
25 | __author__ = "Brett Cannon" | |
26 | __email__ = "brett@python.org" | |
27 | ||
28 | __all__ = ['strptime'] | |
29 | ||
def _getlang():
    """Return the locale setting currently in effect for LC_TIME.

    The value is exactly what locale.getlocale(locale.LC_TIME) reports;
    callers in this module compare two such values for equality to detect
    a locale change.
    """
    current_setting = locale.getlocale(locale.LC_TIME)
    return current_setting
33 | ||
class LocaleTime(object):
    """Stores and handles locale-specific information related to time.

    ATTRIBUTES:
        f_weekday -- full weekday names (7-item list)
        a_weekday -- abbreviated weekday names (7-item list)
        f_month -- full month names (13-item list; dummy value in [0], which
                    comes from the calendar module)
        a_month -- abbreviated month names (13-item list, dummy value in
                    [0], which comes from the calendar module)
        am_pm -- AM/PM representation (2-item list)
        LC_date_time -- format string for date/time representation (string)
        LC_date -- format string for date representation (string)
        LC_time -- format string for time representation (string)
        timezone -- non-daylight- and daylight-savings timezone names
                    (2-item tuple of frozensets, in that order)
        lang -- Language used by instance (2-item tuple)

    All stored names are lower-cased.
    """

    def __init__(self):
        """Set all attributes.

        Order of methods called matters for dependency reasons
        (__calc_date_time reads the weekday, month, AM/PM and timezone
        attributes).

        The locale language is recorded at the outset and then checked again
        before exiting.  This is to make sure that the attributes were not
        set with a mix of information from more than one locale.  This would
        most likely happen when using threads where one thread calls a
        locale-dependent function while another thread changes the locale
        while the function in the other thread is still running.  Proper
        coding would call for locks to prevent changing the locale while
        locale-dependent code is running.  The check here is done in case
        someone does not think about doing this.

        Only other possible issue is if someone changed the timezone and did
        not call time.tzset .  That is an issue for the programmer, though,
        since changing the timezone is worthless without that call.

        Raises ValueError if the LC_TIME locale differs between the start and
        the end of initialization.

        """
        self.lang = _getlang()
        self.__calc_weekday()
        self.__calc_month()
        self.__calc_am_pm()
        self.__calc_timezone()
        self.__calc_date_time()
        if _getlang() != self.lang:
            raise ValueError("locale changed during initialization")

    def __pad(self, seq, front):
        # Return a list copy of seq with '' added at the front (front is
        # true) or at the back (front is false).
        seq = list(seq)
        if front:
            seq.insert(0, '')
        else:
            seq.append('')
        return seq

    def __calc_weekday(self):
        # Set self.a_weekday and self.f_weekday using the calendar
        # module.
        self.a_weekday = [calendar.day_abbr[i].lower() for i in range(7)]
        self.f_weekday = [calendar.day_name[i].lower() for i in range(7)]

    def __calc_month(self):
        # Set self.f_month and self.a_month using the calendar module.
        # Index 0 is the calendar module's placeholder entry.
        self.a_month = [calendar.month_abbr[i].lower() for i in range(13)]
        self.f_month = [calendar.month_name[i].lower() for i in range(13)]

    def __calc_am_pm(self):
        # Set self.am_pm by using time.strftime().

        # The magic date (1999,3,17,hour,44,55,2,76,0) is not really that
        # magical; just happened to have used it everywhere else where a
        # static date was needed.
        am_pm = []
        # Hour 1 yields the AM marker, hour 22 the PM marker.  Plain decimal
        # literals are used: a leading-zero literal such as 01 is a syntax
        # error on Python 3.
        for hour in (1, 22):
            time_tuple = time.struct_time((1999,3,17,hour,44,55,2,76,0))
            am_pm.append(time.strftime("%p", time_tuple).lower())
        self.am_pm = am_pm

    def __calc_date_time(self):
        # Set self.LC_date_time, self.LC_date, & self.LC_time by using
        # time.strftime().

        # Use (1999,3,17,22,44,55,2,76,0) for magic date because the amount of
        # overloaded numbers is minimized.  The order in which searches for
        # values within the format string is very important; it eliminates
        # possible ambiguity for what something represents.
        time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0))
        date_time = [time.strftime(directive, time_tuple).lower()
                     for directive in ("%c", "%x", "%X")]
        replacement_pairs = [('%', '%%'), (self.f_weekday[2], '%A'),
                    (self.f_month[3], '%B'), (self.a_weekday[2], '%a'),
                    (self.a_month[3], '%b'), (self.am_pm[1], '%p'),
                    ('1999', '%Y'), ('99', '%y'), ('22', '%H'),
                    ('44', '%M'), ('55', '%S'), ('76', '%j'),
                    ('17', '%d'), ('03', '%m'), ('3', '%m'),
                    # '3' needed for when no leading zero.
                    ('2', '%w'), ('10', '%I')]
        replacement_pairs.extend([(tz, "%Z") for tz_values in self.timezone
                                                for tz in tz_values])
        # Probe date used to tell %U from %W; it does not depend on the loop
        # variable, so it is built once.
        week_probe = time.struct_time((1999,1,3,1,1,1,6,3,0))
        for offset, directive in ((0,'%c'), (1,'%x'), (2,'%X')):
            current_format = date_time[offset]
            for old, new in replacement_pairs:
                # Must deal with possible lack of locale info
                # manifesting itself as the empty string (e.g., Swedish's
                # lack of AM/PM info) or a platform returning a tuple of empty
                # strings (e.g., MacOS 9 having timezone as ('','')).
                if old:
                    current_format = current_format.replace(old, new)
            # If the probe date renders with a '00' in it, the week number
            # shown by this representation is treated as %W, otherwise %U.
            if '00' in time.strftime(directive, week_probe):
                U_W = '%W'
            else:
                U_W = '%U'
            # '11' is what is left of the magic date's week number.
            date_time[offset] = current_format.replace('11', U_W)
        self.LC_date_time = date_time[0]
        self.LC_date = date_time[1]
        self.LC_time = date_time[2]

    def __calc_timezone(self):
        # Set self.timezone by using time.tzname.
        # Do not worry about possibility of time.tzname[0] == time.tzname[1]
        # and time.daylight; handle that in strptime .
        try:
            time.tzset()
        except AttributeError:
            # time.tzset is not available on every platform.
            pass
        no_saving = frozenset(["utc", "gmt", time.tzname[0].lower()])
        if time.daylight:
            has_saving = frozenset([time.tzname[1].lower()])
        else:
            has_saving = frozenset()
        self.timezone = (no_saving, has_saving)
174 | ||
175 | ||
class TimeRE(dict):
    """Handle conversion from format directives to regexes.

    The mapping's keys are single-character directive names (without the
    leading '%'); the values are regex source strings, most of which wrap
    their match in a named group carrying the directive's name.
    """

    def __init__(self, locale_time=None):
        """Create keys/values.

        locale_time -- object supplying the locale-specific names and format
            strings (f_weekday, a_weekday, f_month, a_month, am_pm, timezone,
            LC_date_time, LC_date, LC_time); a new LocaleTime is created
            when it is omitted or false.

        Order of execution is important for dependency reasons: the
        compound directives (c, x, X) are expanded through pattern(), which
        looks up the simple directives stored first.

        """
        if locale_time:
            self.locale_time = locale_time
        else:
            self.locale_time = LocaleTime()
        base = super(TimeRE, self)
        base.__init__({
            # The " \d" part of the regex is to make %c from ANSI C work
            'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
            'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
            'I': r"(?P<I>1[0-2]|0[1-9]|[1-9])",
            'j': r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",
            'm': r"(?P<m>1[0-2]|0[1-9]|[1-9])",
            'M': r"(?P<M>[0-5]\d|\d)",
            'S': r"(?P<S>6[0-1]|[0-5]\d|\d)",
            'U': r"(?P<U>5[0-3]|[0-4]\d|\d)",
            'w': r"(?P<w>[0-6])",
            # W is set below by using 'U'
            'y': r"(?P<y>\d\d)",
            #XXX: Does 'Y' need to worry about having less or more than
            #     4 digits?
            'Y': r"(?P<Y>\d\d\d\d)",
            'A': self.__seqToRE(self.locale_time.f_weekday, 'A'),
            'a': self.__seqToRE(self.locale_time.a_weekday, 'a'),
            'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'),
            'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'),
            'p': self.__seqToRE(self.locale_time.am_pm, 'p'),
            'Z': self.__seqToRE((tz for tz_names in self.locale_time.timezone
                                        for tz in tz_names),
                                'Z'),
            '%': '%'})
        base.__setitem__('W', base.__getitem__('U').replace('U', 'W'))
        base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
        base.__setitem__('x', self.pattern(self.locale_time.LC_date))
        base.__setitem__('X', self.pattern(self.locale_time.LC_time))

    def __seqToRE(self, to_convert, directive):
        """Convert a list to a regex string for matching a directive.

        Want possible matching values to be from longest to shortest.  This
        prevents the possibility of a match occurring for a value that is
        also a substring of a larger value that should have matched (e.g.,
        'abc' matching when 'abcdef' should have been the match).

        Returns '' when every value is the empty string (or there are no
        values), otherwise a named group '(?P<directive>v1|v2|...)' with each
        value regex-escaped.

        """
        to_convert = sorted(to_convert, key=len, reverse=True)
        for value in to_convert:
            if value != '':
                break
        else:
            # Nothing usable to match against.
            return ''
        regex = '|'.join(re_escape(stuff) for stuff in to_convert)
        regex = '(?P<%s>%s' % (directive, regex)
        return '%s)' % regex

    def pattern(self, format):
        """Return regex pattern for the format string.

        Need to make sure that any characters that might be interpreted as
        regex syntax are escaped.  Each run of whitespace in the format is
        turned into '\\s*', and each '%X' directive is replaced by the regex
        stored under key 'X'.

        """
        processed_format = ''
        # The sub() call escapes all characters that might be misconstrued
        # as regex syntax.  Cannot use re.escape since we have to deal with
        # format directives (%m, etc.).
        regex_chars = re_compile(r"([\\.^$*+?\(\){}\[\]|])")
        format = regex_chars.sub(r"\\\1", format)
        whitespace_replacement = re_compile(r'\s+')
        # The replacement template must spell the backslash as an escaped
        # backslash: a bare '\s' in a template is rejected by newer versions
        # of the re module.  The text inserted is still '\s*'.
        format = whitespace_replacement.sub(r'\\s*', format)
        while '%' in format:
            directive_index = format.index('%')+1
            processed_format = "%s%s%s" % (processed_format,
                                           format[:directive_index-1],
                                           self[format[directive_index]])
            format = format[directive_index+1:]
        return "%s%s" % (processed_format, format)

    def compile(self, format):
        """Return a compiled, case-insensitive re object for the format
        string."""
        return re_compile(self.pattern(format), IGNORECASE)
265 | ||
# Shared state used by strptime() to avoid rebuilding regexes on every call.
# DO NOT modify _TimeRE_cache or _regex_cache without acquiring _cache_lock
# first!
_CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache
_cache_lock = _thread_allocate_lock()
_regex_cache = {}
_TimeRE_cache = TimeRE()
272 | ||
def _calc_julian_from_U_or_W(year, week_of_year, day_of_week, week_starts_Mon):
    """Calculate the day of the year from a week-of-year and a weekday.

    year -- year the week number belongs to
    week_of_year -- value of the %U or %W directive (week 0 is the partial
        week before the first full week)
    day_of_week -- weekday with Monday == 0
    week_starts_Mon -- true for %W (weeks start on Monday), false for %U
        (weeks start on Sunday)

    The result is 1-based; it is zero or negative when the requested day
    falls before January 1 of the given year.
    """
    first_weekday = datetime_date(year, 1, 1).weekday()
    # For %U, shift both weekdays so that the first day of the week is 0;
    # the rest of the calculation is then the same for both directives.
    if not week_starts_Mon:
        first_weekday = (first_weekday + 1) % 7
        day_of_week = (day_of_week + 1) % 7
    # Number of days of the year that fall in week 0.
    week_0_length = (7 - first_weekday) % 7
    if week_of_year == 0:
        return 1 + day_of_week - first_weekday
    days_to_week = week_0_length + (7 * (week_of_year - 1))
    return 1 + days_to_week + day_of_week


def strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
    """Return a time struct based on the input string and the format string.

    data_string -- text to parse
    format -- format string built from %-directives; defaults to
        "%a %b %d %H:%M:%S %Y"

    Returns a time.struct_time.  Values not supplied by the input default to
    year 1900, month 1, day 1, hour/minute/second 0 and a last field of -1;
    the weekday and day of the year are calculated when not given.

    Raises ValueError when data_string does not match format or when text is
    left over after the match.
    """
    global _TimeRE_cache, _regex_cache
    _cache_lock.acquire()
    try:
        if _getlang() != _TimeRE_cache.locale_time.lang:
            _TimeRE_cache = TimeRE()
            _regex_cache = {}
        if len(_regex_cache) > _CACHE_MAX_SIZE:
            _regex_cache.clear()
        # Bind these only after the locale check so that a freshly rebuilt
        # TimeRE (and its locale data) is the one actually used below.
        time_re = _TimeRE_cache
        locale_time = time_re.locale_time
        format_regex = _regex_cache.get(format)
        if not format_regex:
            format_regex = time_re.compile(format)
            _regex_cache[format] = format_regex
    finally:
        _cache_lock.release()
    found = format_regex.match(data_string)
    if not found:
        raise ValueError("time data did not match format:  data=%s  fmt=%s" %
                         (data_string, format))
    if len(data_string) != found.end():
        raise ValueError("unconverted data remains: %s" %
                          data_string[found.end():])
    year = 1900
    month = day = 1
    hour = minute = second = 0
    tz = -1
    # Default to -1 to signify that values not known; not critical to have,
    # though
    week_of_year = -1
    week_of_year_start = -1
    # weekday and julian defaulted to -1 so as to signal need to calculate
    # values
    weekday = julian = -1
    found_dict = found.groupdict()
    for group_key in found_dict:
        # Directives not explicitly handled below:
        #   c, x, X
        #      handled by making out of other directives
        #   U, W
        #      worthless without day of the week
        if group_key == 'y':
            year = int(found_dict['y'])
            # Open Group specification for strptime() states that a %y
            # value in the range of [00, 68] is in the century 2000, while
            # [69,99] is in the century 1900
            if year <= 68:
                year += 2000
            else:
                year += 1900
        elif group_key == 'Y':
            year = int(found_dict['Y'])
        elif group_key == 'm':
            month = int(found_dict['m'])
        elif group_key == 'B':
            month = locale_time.f_month.index(found_dict['B'].lower())
        elif group_key == 'b':
            month = locale_time.a_month.index(found_dict['b'].lower())
        elif group_key == 'd':
            day = int(found_dict['d'])
        elif group_key == 'H':
            hour = int(found_dict['H'])
        elif group_key == 'I':
            hour = int(found_dict['I'])
            ampm = found_dict.get('p', '').lower()
            # If there was no AM/PM indicator, we'll treat this like AM
            if ampm in ('', locale_time.am_pm[0]):
                # We're in AM so the hour is correct unless we're
                # looking at 12 midnight.
                # 12 midnight == 12 AM == hour 0
                if hour == 12:
                    hour = 0
            elif ampm == locale_time.am_pm[1]:
                # We're in PM so we need to add 12 to the hour unless
                # we're looking at 12 noon.
                # 12 noon == 12 PM == hour 12
                if hour != 12:
                    hour += 12
        elif group_key == 'M':
            minute = int(found_dict['M'])
        elif group_key == 'S':
            second = int(found_dict['S'])
        elif group_key == 'A':
            weekday = locale_time.f_weekday.index(found_dict['A'].lower())
        elif group_key == 'a':
            weekday = locale_time.a_weekday.index(found_dict['a'].lower())
        elif group_key == 'w':
            # %w counts from Sunday == 0; convert to Monday == 0.
            weekday = int(found_dict['w'])
            if weekday == 0:
                weekday = 6
            else:
                weekday -= 1
        elif group_key == 'j':
            julian = int(found_dict['j'])
        elif group_key in ('U', 'W'):
            week_of_year = int(found_dict[group_key])
            if group_key == 'U':
                # U starts week on Sunday
                week_of_year_start = 6
            else:
                # W starts week on Monday
                week_of_year_start = 0
        elif group_key == 'Z':
            # Since -1 is default value only need to worry about setting tz if
            # it can be something other than -1.
            found_zone = found_dict['Z'].lower()
            for value, tz_values in enumerate(locale_time.timezone):
                if found_zone in tz_values:
                    # Deal with bad locale setup where timezone names are the
                    # same and yet time.daylight is true; too ambiguous to
                    # be able to tell what timezone has daylight savings
                    if (time.tzname[0] == time.tzname[1] and
                       time.daylight and found_zone not in ("utc", "gmt")):
                        break
                    else:
                        tz = value
                        break
    # If we know the week of the year and what day of that week, we can figure
    # out the Julian day of the year.
    if julian == -1 and week_of_year != -1 and weekday != -1:
        week_starts_Mon = (week_of_year_start == 0)
        julian = _calc_julian_from_U_or_W(year, week_of_year, weekday,
                                          week_starts_Mon)
        if julian <= 0:
            # The day lies before January 1, i.e. in the previous year;
            # re-express it as a day of that year.  This also keeps a
            # calculated value from being mistaken for the -1 sentinel.
            year -= 1
            if calendar.isleap(year):
                julian += 366
            else:
                julian += 365
    # Cannot pre-calculate datetime_date() since can change in Julian
    # calculation and thus could have different value for the day of the week
    # calculation
    if julian == -1:
        # Need to add 1 to result since first day of the year is 1, not 0.
        julian = datetime_date(year, month, day).toordinal() - \
                  datetime_date(year, 1, 1).toordinal() + 1
    else:  # Assume that if they bothered to include Julian day it will
           # be accurate
        datetime_result = datetime_date.fromordinal(
            (julian - 1) + datetime_date(year, 1, 1).toordinal())
        year = datetime_result.year
        month = datetime_result.month
        day = datetime_result.day
    if weekday == -1:
        weekday = datetime_date(year, month, day).weekday()
    return time.struct_time((year, month, day,
                             hour, minute, second,
                             weekday, julian, tz))