Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | #! /usr/bin/env python |
2 | ||
3 | r"""Convert old ("regex") regular expressions to new syntax ("re"). | |
4 | ||
5 | When imported as a module, there are two functions, each with its own | |
6 | docstring: | |
7 | ||
8 | convert(s, syntax=None) -- convert a regex regular expression to re syntax | |
9 | ||
10 | quote(s, quote=None) -- return a quoted string literal | |
11 | ||
12 | When used as a script, read a Python string literal (or any other | |
13 | expression evaluating to a string) from stdin, and write the | |
14 | translated expression to stdout as a string literal. Unless stdout is | |
15 | a tty, no trailing \n is written to stdout. This is done so that it | |
16 | can be used with Emacs C-U M-| (shell-command-on-region with argument | |
17 | which filters the region through the shell command). | |
18 | ||
19 | No attempt has been made at coding for performance. | |
20 | ||
21 | Translation table... | |
22 | ||
23 | \( ( (unless RE_NO_BK_PARENS set) | |
24 | \) ) (unless RE_NO_BK_PARENS set) | |
25 | \| | (unless RE_NO_BK_VBAR set) | |
26 | \< \b (not quite the same, but alla...) | |
27 | \> \b (not quite the same, but alla...) | |
28 | \` \A | |
29 | \' \Z | |
30 | ||
31 | Not translated... | |
32 | ||
33 | . | |
34 | ^ | |
35 | $ | |
36 | * | |
37 | + (unless RE_BK_PLUS_QM set, then to \+) | |
38 | ? (unless RE_BK_PLUS_QM set, then to \?) | |
39 | \ | |
40 | \b | |
41 | \B | |
42 | \w | |
43 | \W | |
44 | \1 ... \9 | |
45 | ||
46 | Special cases... | |
47 | ||
48 | Non-printable characters are always replaced by their 3-digit | |
49 | escape code (except \t, \n, \r, which use mnemonic escapes) | |
50 | ||
51 | Newline is turned into | when RE_NEWLINE_OR is set | |
52 | ||
53 | XXX To be done... | |
54 | ||
55 | [...] (different treatment of backslashed items?) | |
56 | [^...] (different treatment of backslashed items?) | |
57 | ^ $ * + ? (in some error contexts these are probably treated differently) | |
58 | \vDD \DD (in the regex docs but only works when RE_ANSI_HEX set) | |
59 | ||
60 | """ | |
61 | ||
62 | ||
63 | import warnings | |
64 | warnings.filterwarnings("ignore", ".* regex .*", DeprecationWarning, __name__, | |
65 | append=1) | |
66 | ||
67 | import regex | |
68 | from regex_syntax import * # RE_* | |
69 | ||
70 | __all__ = ["convert","quote"] | |
71 | ||
# Default translation table: maps a token of the old "regex" syntax (either
# a single character or a backslash followed by one character) to the text
# that replaces it in "re" syntax.  convert() works on a copy and adjusts it
# according to the active syntax flags.
mastertable = {
    # Word-boundary and buffer-boundary anchors.
    r"\<": r"\b",
    r"\>": r"\b",
    r"\`": r"\A",
    r"\'": r"\Z",
    # Backslashed grouping/alternation operators become bare operators ...
    r"\(": "(",
    r"\)": ")",
    r"\|": "|",
    # ... and the bare characters, literal in the old syntax, get escaped.
    "(": r"\(",
    ")": r"\)",
    "|": r"\|",
    # Whitespace control characters are spelled with mnemonic escapes.
    "\t": r"\t",
    "\n": r"\n",
    "\r": r"\r",
}
88 | ||
89 | ||
def convert(s, syntax=None):
    """Convert a regex regular expression to re syntax.

    The first argument is the regular expression, as a string object,
    just like it would be passed to regex.compile().  (I.e., pass the
    actual string object -- string quotes must already have been
    removed and the standard escape processing has already been done,
    e.g. by eval().)

    The optional second argument is the regex syntax variant to be
    used.  This is an integer mask as passed to regex.set_syntax();
    the flag bits are defined in regex_syntax.  When not specified, or
    when None is given, the current regex syntax mask (as retrieved by
    regex.get_syntax()) is used -- which is 0 by default.

    The return value is a regular expression, as a string object that
    could be passed to re.compile().  (I.e., no string quotes have
    been added -- use quote() below, or repr().)

    The conversion is not always guaranteed to be correct.  More
    syntactical analysis should be performed to detect borderline
    cases and decide what to do with them.  For example, 'x*?' is not
    translated correctly.

    A backslash that is the very last character of the input has nothing
    to escape; it is copied to the output unchanged instead of raising
    IndexError.

    """
    # Work on a copy so the per-call adjustments below never leak into
    # the shared default table.
    table = mastertable.copy()
    if syntax is None:
        syntax = regex.get_syntax()
    if syntax & RE_NO_BK_PARENS:
        # Bare parentheses already group: neither form needs translating.
        del table[r'\('], table[r'\)']
        del table['('], table[')']
    if syntax & RE_NO_BK_VBAR:
        # Bare '|' already means alternation.
        del table[r'\|']
        del table['|']
    if syntax & RE_BK_PLUS_QM:
        # Operators are spelled \+ and \?; bare + and ? are literals.
        table['+'] = r'\+'
        table['?'] = r'\?'
        table[r'\+'] = '+'
        table[r'\?'] = '?'
    if syntax & RE_NEWLINE_OR:
        table['\n'] = '|'

    pieces = []
    i = 0
    end = len(s)
    while i < end:
        token = s[i]
        i += 1
        # A backslash forms a two-character token with the character that
        # follows it -- unless it is the last character of the pattern.
        if token == '\\' and i < end:
            token = token + s[i]
            i += 1
        # Tokens without a table entry are passed through untouched.
        pieces.append(table.get(token, token))
    return "".join(pieces)
148 | ||
149 | ||
def quote(s, quote=None):
    """Convert a string object to a quoted string literal.

    This is similar to repr() but will return a "raw" string (r'...'
    or r"...") when the string contains backslashes, instead of
    doubling all backslashes.  The resulting string does *not* always
    evaluate to the same string as the original; however it will do
    just the right thing when passed into re.compile().

    The optional second argument forces the string quote; it must be
    a single character which is a valid Python string quote.  When it
    is omitted, single quotes are used unless the string contains a
    single quote and no double quote, in which case double quotes are
    used so that nothing needs escaping.

    """
    if quote is None:
        q = "'"
        altq = '"'
        # Switch to the alternate quote only when that avoids escaping.
        if q in s and altq not in s:
            q = altq
    else:
        assert quote in ('"', "'")
        q = quote
    pieces = [q]
    for c in s:
        if c == q:
            # Escape occurrences of the delimiter itself.
            c = '\\' + c
        elif c < ' ' or c > '~':
            # Outside printable ASCII: emit an octal escape.
            c = "\\%03o" % ord(c)
        pieces.append(c)
    pieces.append(q)
    res = "".join(pieces)
    # Any backslash in the literal means it must be a raw string so the
    # escapes reach the regular expression compiler intact.
    if '\\' in res:
        res = 'r' + res
    return res
180 | ||
181 | ||
def main():
    """Main program -- called when run as a script.

    Reads an expression from stdin, evaluates it to obtain the old-style
    pattern, and writes the converted pattern to stdout as a string
    literal.
    """
    import sys
    # NOTE(review): eval() runs whatever expression arrives on stdin, so
    # this script must only be fed trusted input.
    source = sys.stdin.read()
    pattern = eval(source)
    out = sys.stdout
    out.write(quote(convert(pattern)))
    # The newline is added only for a terminal; when stdout is redirected
    # the bare literal is written with no trailing newline.
    if out.isatty():
        out.write("\n")


if __name__ == '__main__':
    main()