Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | """Regexp-based split and replace using the obsolete regex module. |
2 | ||
3 | This module is only for backward compatibility. These operations | |
4 | are now provided by the new regular expression module, "re". | |
5 | ||
6 | sub(pat, repl, str): replace first occurrence of pattern in string | |
7 | gsub(pat, repl, str): replace all occurrences of pattern in string | |
8 | split(str, pat, maxsplit): split string using pattern as delimiter | |
9 | splitx(str, pat, maxsplit): split string using pattern as delimiter plus | |
10 | return delimiters | |
11 | """ | |
12 | ||
13 | import warnings | |
# Tell importers, once at import time, that this module is obsolete.
warnings.warn(
    message="the regsub module is deprecated; please use re.sub()",
    category=DeprecationWarning,
)

# Ignore further deprecation warnings about this module: the empty
# message pattern matches any message, and the filter is restricted to
# warnings attributed to the module named by __name__.
warnings.filterwarnings(
    action="ignore",
    message="",
    category=DeprecationWarning,
    module=__name__,
)
19 | ||
20 | import regex | |
21 | ||
22 | __all__ = ["sub","gsub","split","splitx","capwords"] | |
23 | ||
24 | # Replace first occurrence of pattern pat in string str by replacement | |
25 | # repl. If the pattern isn't found, the string is returned unchanged. | |
26 | # The replacement may contain references \digit to subpatterns and | |
27 | # escaped backslashes. The pattern may be a string or an already | |
28 | # compiled pattern. | |
29 | ||
def sub(pat, repl, str):
    """Replace the first match of pat in str by the expansion of repl.

    pat is a pattern string or an already compiled pattern.  repl may
    contain backslash-digit references to subpatterns and doubled
    backslashes (see expand()).  When nothing matches, str is returned
    unchanged.
    """
    matcher = compile(pat)
    if matcher.search(str) < 0:
        # No match anywhere: hand the input back untouched.
        return str
    groups = matcher.regs
    begin, end = groups[0]          # span of the whole match
    replacement = expand(repl, groups, str)
    return str[:begin] + replacement + str[end:]
37 | ||
38 | ||
39 | # Replace all (non-overlapping) occurrences of pattern pat in string | |
40 | # str by replacement repl. The same rules as for sub() apply. | |
41 | # Empty matches for the pattern are replaced only when not adjacent to | |
42 | # a previous match, so e.g. gsub('', '-', 'abc') returns '-a-b-c-'. | |
43 | ||
def gsub(pat, repl, str):
    """Replace all non-overlapping matches of pat in str by repl.

    pat is a pattern string or an already compiled pattern; repl is
    expanded for every match by expand().  An empty match is replaced
    only when it is not adjacent to the previous match, e.g.
    gsub('', '-', 'abc') returns '-a-b-c-'.

    Returns the new string; str itself is not modified.
    """
    prog = compile(pat)
    # Collect the output fragments and join them once at the end; growing
    # a string by repeated concatenation is quadratic in the match count.
    pieces = []
    start = 0
    first = 1
    while prog.search(str, start) >= 0:
        regs = prog.regs
        a, b = regs[0]
        if a == b == start and not first:
            # Empty match right where the previous match ended: skip it
            # and look again one character further on (if there is one).
            if start >= len(str) or prog.search(str, start+1) < 0:
                break
            regs = prog.regs
            a, b = regs[0]
        pieces.append(str[start:a])
        pieces.append(expand(repl, regs, str))
        start = b
        first = 0
    pieces.append(str[start:])
    return ''.join(pieces)
62 | ||
63 | ||
64 | # Split string str in fields separated by delimiters matching pattern | |
65 | # pat. Only non-empty matches for the pattern are considered, so e.g. | |
66 | # split('abc', '') returns ['abc']. | |
67 | # The optional 3rd argument sets the number of splits that are performed. | |
68 | ||
def split(str, pat, maxsplit = 0):
    """Split str into fields separated by non-empty matches of pat.

    maxsplit, when non-zero, limits the number of splits performed.
    The delimiters themselves are dropped from the returned list.
    """
    fields = intsplit(str, pat, maxsplit, retain=0)
    return fields
71 | ||
72 | # Split string str in fields separated by delimiters matching pattern | |
73 | # pat. Only non-empty matches for the pattern are considered, so e.g. | |
74 | # splitx('abc', '') returns ['abc']. The delimiters are also included | |
75 | # in the list. | |
76 | # The optional 3rd argument sets the number of splits that are performed. | |
77 | ||
78 | ||
def splitx(str, pat, maxsplit = 0):
    """Split str like split(), but keep the delimiters in the result.

    The returned list alternates field, delimiter, field, ...; maxsplit,
    when non-zero, limits the number of splits performed.
    """
    fields = intsplit(str, pat, maxsplit, retain=1)
    return fields
81 | ||
82 | # Internal function used to implement split() and splitx(). | |
83 | ||
def intsplit(str, pat, maxsplit, retain):
    """Shared implementation of split() and splitx().

    Splits str at every non-empty match of pat.  A non-zero maxsplit
    stops after that many splits.  When retain is true the matched
    delimiter text is added to the result after each field.  The text
    after the last split is always appended as the final element.
    """
    matcher = compile(pat)
    fields = []
    field_start = 0     # where the field currently being collected begins
    scan_from = 0       # where the next search starts
    splits_done = 0
    while matcher.search(str, scan_from) >= 0:
        match_start, match_end = matcher.regs[0]
        if match_start == match_end:
            # Empty matches never split; step one character forward and
            # give up once the end of the string is reached.
            scan_from += 1
            if scan_from >= len(str):
                break
            continue
        fields.append(str[field_start:match_start])
        if retain:
            fields.append(str[match_start:match_end])
        field_start = scan_from = match_end
        splits_done += 1
        if maxsplit and splits_done >= maxsplit:
            break
    fields.append(str[field_start:])
    return fields
106 | ||
107 | ||
108 | # Capitalize words split using a pattern | |
109 | ||
def capwords(str, pat='[^a-zA-Z0-9_]+'):
    """Capitalize each word of str, where pat matches the separators.

    The string is split with splitx(), so the separators are kept and
    the pieces alternate word, separator, word, ...; only the words are
    capitalized before everything is joined back together.
    """
    rebuilt = []
    is_word = 1                     # the first piece is always a word
    for piece in splitx(str, pat):
        if is_word:
            piece = piece.capitalize()
        rebuilt.append(piece)
        is_word = not is_word
    return "".join(rebuilt)
115 | ||
116 | ||
117 | # Internal subroutines: | |
118 | # compile(pat): compile a pattern, caching already compiled patterns | |
119 | # expand(repl, regs, str): expand \digit escapes in replacement string | |
120 | ||
121 | ||
122 | # Manage a cache of compiled regular expressions. | |
123 | # | |
124 | # If the pattern is a string a compiled version of it is returned. If | |
125 | # the pattern has been used before we return an already compiled | |
126 | # version from the cache; otherwise we compile it now and save the | |
127 | # compiled version in the cache, along with the syntax it was compiled | |
128 | # with. Instead of a string, a compiled regular expression can also | |
129 | # be passed. | |
130 | ||
# Maps (pattern string, regex syntax in effect at compile time) to the
# compiled pattern object.
cache = {}

def compile(pat):
    """Return a compiled pattern for pat, using the module-level cache.

    If pat is not a string it is assumed to be an already compiled
    pattern and is returned as is.  Otherwise the cache is consulted
    under the key (pat, regex.get_syntax()); on a miss the pattern is
    compiled with regex.compile() and stored under that key.
    """
    # isinstance() rather than an exact type comparison, so that
    # instances of str subclasses are compiled as patterns too.
    if not isinstance(pat, str):
        return pat              # Assume it is a compiled regex
    key = (pat, regex.get_syntax())
    try:
        return cache[key]       # Get it from the cache
    except KeyError:
        prog = cache[key] = regex.compile(pat)
        return prog
142 | ||
143 | ||
def clear_cache():
    """Forget all cached compiled patterns.

    The module-level name `cache` is rebound to a brand-new empty
    dictionary; the previous dictionary object is not modified.
    """
    global cache
    cache = dict()
147 | ||
148 | ||
149 | # Expand \digit in the replacement. | |
150 | # Each occurrence of \digit is replaced by the substring of str | |
151 | # indicated by regs[digit]. To include a literal \ in the | |
152 | # replacement, double it; other \ escapes are left unchanged (i.e. | |
153 | # the \ and the following character are both copied). | |
154 | ||
def expand(repl, regs, str):
    """Expand backslash escapes in the replacement string repl.

    repl -- replacement template
    regs -- sequence of (start, end) index pairs, one per group, with
            regs[0] being the whole match
    str  -- the string the match was made against

    A backslash followed by a digit d is replaced by str[a:b] where
    (a, b) = regs[d].  A doubled backslash yields one backslash.  Any
    other backslash escape is copied unchanged (backslash and following
    character), and a lone backslash at the very end is kept.  If repl
    contains no backslash it is returned as is.
    """
    if '\\' not in repl:
        return repl
    # Gather the fragments and join once instead of growing a string
    # character by character.
    pieces = []
    i = 0
    n = len(repl)
    while i < n:
        c = repl[i]
        i = i + 1
        if c != '\\' or i >= n:
            # Ordinary character, or a backslash with nothing after it.
            pieces.append(c)
            continue
        c = repl[i]
        i = i + 1
        if '0' <= c <= '9':
            a, b = regs[int(c)]
            pieces.append(str[a:b])
        elif c == '\\':
            pieces.append(c)
        else:
            pieces.append('\\' + c)
    return ''.join(pieces)
175 | ||
176 | ||
177 | # Test program, reads sequences "pat repl str" from stdin. | |
178 | # Optional argument specifies pattern used to split lines. | |
179 | ||
def test():
    """Interactive test driver for sub() and gsub().

    Reads lines from stdin until end of file.  Each line is split into
    fields with split(), using the first command-line argument as the
    delimiter pattern (default: runs of blanks, tabs and newlines).  A
    line must yield exactly three fields "pat repl str"; the results of
    sub() and gsub() on them are written to stdout.  Other lines are
    reported together with the fields found, and skipped.  A '--> '
    prompt is written to stderr when stdin is a terminal.
    """
    import sys
    if sys.argv[1:]:
        delpat = sys.argv[1]
    else:
        delpat = '[ \t\n]+'
    # Output goes through sys.stdout.write with explicit separators and
    # newlines; the text produced is the same as with print statements.
    write = sys.stdout.write
    while 1:
        if sys.stdin.isatty():
            sys.stderr.write('--> ')
        line = sys.stdin.readline()
        if not line:
            break
        if line[-1] == '\n':
            line = line[:-1]
        fields = split(line, delpat)
        if len(fields) != 3:
            write('Sorry, not three fields\n')
            write('split: ' + repr(fields) + '\n')
            continue
        # Reuse the fields computed above instead of splitting the same
        # line a second time.
        pat, repl, text = fields
        write('sub : ' + repr(sub(pat, repl, text)) + '\n')
        write('gsub: ' + repr(gsub(pat, repl, text)) + '\n')