Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | package Encode::Encoding; |
2 | # Base class for classes which implement encodings | |
3 | use strict; | |
4 | our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; | |
5 | ||
6 | require Encode; | |
7 | ||
8 | sub DEBUG { 0 } | |
9 | sub Define | |
10 | { | |
11 | my $obj = shift; | |
12 | my $canonical = shift; | |
13 | $obj = bless { Name => $canonical },$obj unless ref $obj; | |
14 | # warn "$canonical => $obj\n"; | |
15 | Encode::define_encoding($obj, $canonical, @_); | |
16 | } | |
17 | ||
18 | sub name { return shift->{'Name'} } | |
19 | ||
20 | # sub renew { return $_[0] } | |
21 | ||
22 | sub renew { | |
23 | my $self = shift; | |
24 | my $clone = bless { %$self } => ref($self); | |
25 | $clone->{renewed}++; # so the caller can see it | |
26 | DEBUG and warn $clone->{renewed}; | |
27 | return $clone; | |
28 | } | |
29 | ||
30 | sub renewed{ return $_[0]->{renewed} || 0 } | |
31 | ||
32 | *new_sequence = \&renew; | |
33 | ||
34 | sub needs_lines { 0 }; | |
35 | ||
36 | sub perlio_ok { | |
37 | eval{ require PerlIO::encoding }; | |
38 | return $@ ? 0 : 1; | |
39 | } | |
40 | ||
41 | # (Temporary|legacy) methods | |
42 | ||
43 | sub toUnicode { shift->decode(@_) } | |
44 | sub fromUnicode { shift->encode(@_) } | |
45 | ||
46 | # | |
47 | # Needs to be overloaded or just croak | |
48 | # | |
49 | ||
50 | sub encode { | |
51 | require Carp; | |
52 | my $obj = shift; | |
53 | my $class = ref($obj) ? ref($obj) : $obj; | |
54 | Carp::croak($class . "->encode() not defined!"); | |
55 | } | |
56 | ||
57 | sub decode{ | |
58 | require Carp; | |
59 | my $obj = shift; | |
60 | my $class = ref($obj) ? ref($obj) : $obj; | |
61 | Carp::croak($class . "->encode() not defined!"); | |
62 | } | |
63 | ||
64 | sub DESTROY {} | |
65 | ||
66 | 1; | |
67 | __END__ | |
68 | ||
69 | =head1 NAME | |
70 | ||
71 | Encode::Encoding - Encode Implementation Base Class | |
72 | ||
73 | =head1 SYNOPSIS | |
74 | ||
75 | package Encode::MyEncoding; | |
76 | use base qw(Encode::Encoding); | |
77 | ||
78 | __PACKAGE__->Define(qw(myCanonical myAlias)); | |
79 | ||
80 | =head1 DESCRIPTION | |
81 | ||
82 | As mentioned in L<Encode>, encodings are (in the current | |
83 | implementation at least) defined as objects. The mapping of encoding | |
84 | name to object is via the C<%Encode::Encoding> hash. Though you can | |
85 | directly manipulate this hash, it is strongly encouraged to use this | |
86 | base class module and add encode() and decode() methods. | |
87 | ||
88 | =head2 Methods you should implement | |
89 | ||
90 | You are strongly encouraged to implement methods below, at least | |
91 | either encode() or decode(). | |
92 | ||
93 | =over 4 | |
94 | ||
95 | =item -E<gt>encode($string [,$check]) | |
96 | ||
97 | MUST return the octet sequence representing I<$string>. | |
98 | ||
99 | =over 2 | |
100 | ||
101 | =item * | |
102 | ||
103 | If I<$check> is true, it SHOULD modify I<$string> in place to remove | |
104 | the converted part (i.e. the whole string unless there is an error). | |
105 | If perlio_ok() is true, SHOULD becomes MUST. | |
106 | ||
107 | =item * | |
108 | ||
109 | If an error occurs, it SHOULD return the octet sequence for the | |
110 | fragment of string that has been converted and modify $string in-place | |
111 | to remove the converted part leaving it starting with the problem | |
112 | fragment. If perlio_ok() is true, SHOULD becomes MUST. | |
113 | ||
114 | =item * | |
115 | ||
116 | If I<$check> is is false then C<encode> MUST make a "best effort" to | |
117 | convert the string - for example, by using a replacement character. | |
118 | ||
119 | =back | |
120 | ||
121 | =item -E<gt>decode($octets [,$check]) | |
122 | ||
123 | MUST return the string that I<$octets> represents. | |
124 | ||
125 | =over 2 | |
126 | ||
127 | =item * | |
128 | ||
129 | If I<$check> is true, it SHOULD modify I<$octets> in place to remove | |
130 | the converted part (i.e. the whole sequence unless there is an | |
131 | error). If perlio_ok() is true, SHOULD becomes MUST. | |
132 | ||
133 | =item * | |
134 | ||
135 | If an error occurs, it SHOULD return the fragment of string that has | |
136 | been converted and modify $octets in-place to remove the converted | |
137 | part leaving it starting with the problem fragment. If perlio_ok() is | |
138 | true, SHOULD becomes MUST. | |
139 | ||
140 | =item * | |
141 | ||
142 | If I<$check> is false then C<decode> should make a "best effort" to | |
143 | convert the string - for example by using Unicode's "\x{FFFD}" as a | |
144 | replacement character. | |
145 | ||
146 | =back | |
147 | ||
148 | =back | |
149 | ||
150 | If you want your encoding to work with L<encoding> pragma, you should | |
151 | also implement the method below. | |
152 | ||
153 | =over 4 | |
154 | ||
155 | =item -E<gt>cat_decode($destination, $octets, $offset, $terminator [,$check]) | |
156 | ||
157 | MUST decode I<$octets> with I<$offset> and concatenate it to I<$destination>. | |
158 | Decoding will terminate when $terminator (a string) appears in output. | |
159 | I<$offset> will be modified to the last $octets position at end of decode. | |
160 | Returns true if $terminator appears output, else returns false. | |
161 | ||
162 | =back | |
163 | ||
164 | =head2 Other methods defined in Encode::Encodings | |
165 | ||
166 | You do not have to override methods shown below unless you have to. | |
167 | ||
168 | =over 4 | |
169 | ||
170 | =item -E<gt>name | |
171 | ||
172 | Predefined As: | |
173 | ||
174 | sub name { return shift->{'Name'} } | |
175 | ||
176 | MUST return the string representing the canonical name of the encoding. | |
177 | ||
178 | =item -E<gt>renew | |
179 | ||
180 | Predefined As: | |
181 | ||
182 | sub renew { | |
183 | my $self = shift; | |
184 | my $clone = bless { %$self } => ref($self); | |
185 | $clone->{renewed}++; | |
186 | return $clone; | |
187 | } | |
188 | ||
189 | This method reconstructs the encoding object if necessary. If you need | |
190 | to store the state during encoding, this is where you clone your object. | |
191 | ||
192 | PerlIO ALWAYS calls this method to make sure it has its own private | |
193 | encoding object. | |
194 | ||
195 | =item -E<gt>renewed | |
196 | ||
197 | Predefined As: | |
198 | ||
199 | sub renewed { $_[0]->{renewed} || 0 } | |
200 | ||
201 | Tells whether the object is renewed (and how many times). Some | |
202 | modules emit C<Use of uninitialized value in null operation> warning | |
203 | unless the value is numeric so return 0 for false. | |
204 | ||
205 | =item -E<gt>perlio_ok() | |
206 | ||
207 | Predefined As: | |
208 | ||
209 | sub perlio_ok { | |
210 | eval{ require PerlIO::encoding }; | |
211 | return $@ ? 0 : 1; | |
212 | } | |
213 | ||
214 | If your encoding does not support PerlIO for some reasons, just; | |
215 | ||
216 | sub perlio_ok { 0 } | |
217 | ||
218 | =item -E<gt>needs_lines() | |
219 | ||
220 | Predefined As: | |
221 | ||
222 | sub needs_lines { 0 }; | |
223 | ||
224 | If your encoding can work with PerlIO but needs line buffering, you | |
225 | MUST define this method so it returns true. 7bit ISO-2022 encodings | |
226 | are one example that needs this. When this method is missing, false | |
227 | is assumed. | |
228 | ||
229 | =back | |
230 | ||
231 | =head2 Example: Encode::ROT13 | |
232 | ||
233 | package Encode::ROT13; | |
234 | use strict; | |
235 | use base qw(Encode::Encoding); | |
236 | ||
237 | __PACKAGE__->Define('rot13'); | |
238 | ||
239 | sub encode($$;$){ | |
240 | my ($obj, $str, $chk) = @_; | |
241 | $str =~ tr/A-Za-z/N-ZA-Mn-za-m/; | |
242 | $_[1] = '' if $chk; # this is what in-place edit means | |
243 | return $str; | |
244 | } | |
245 | ||
246 | # Jr pna or ynml yvxr guvf; | |
247 | *decode = \&encode; | |
248 | ||
249 | 1; | |
250 | ||
251 | =head1 Why the heck Encode API is different? | |
252 | ||
253 | It should be noted that the I<$check> behaviour is different from the | |
254 | outer public API. The logic is that the "unchecked" case is useful | |
255 | when the encoding is part of a stream which may be reporting errors | |
256 | (e.g. STDERR). In such cases, it is desirable to get everything | |
257 | through somehow without causing additional errors which obscure the | |
258 | original one. Also, the encoding is best placed to know what the | |
259 | correct replacement character is, so if that is the desired behaviour | |
260 | then letting low level code do it is the most efficient. | |
261 | ||
262 | By contrast, if I<$check> is true, the scheme above allows the | |
263 | encoding to do as much as it can and tell the layer above how much | |
264 | that was. What is lacking at present is a mechanism to report what | |
265 | went wrong. The most likely interface will be an additional method | |
266 | call to the object, or perhaps (to avoid forcing per-stream objects | |
267 | on otherwise stateless encodings) an additional parameter. | |
268 | ||
269 | It is also highly desirable that encoding classes inherit from | |
270 | C<Encode::Encoding> as a base class. This allows that class to define | |
271 | additional behaviour for all encoding objects. | |
272 | ||
273 | package Encode::MyEncoding; | |
274 | use base qw(Encode::Encoding); | |
275 | ||
276 | __PACKAGE__->Define(qw(myCanonical myAlias)); | |
277 | ||
278 | to create an object with C<< bless {Name => ...}, $class >>, and call | |
279 | define_encoding. They inherit their C<name> method from | |
280 | C<Encode::Encoding>. | |
281 | ||
282 | =head2 Compiled Encodings | |
283 | ||
284 | For the sake of speed and efficiency, most of the encodings are now | |
285 | supported via a I<compiled form>: XS modules generated from UCM | |
286 | files. Encode provides the enc2xs tool to achieve that. Please see | |
287 | L<enc2xs> for more details. | |
288 | ||
289 | =head1 SEE ALSO | |
290 | ||
291 | L<perlmod>, L<enc2xs> | |
292 | ||
293 | =begin future | |
294 | ||
295 | =over 4 | |
296 | ||
297 | =item Scheme 1 | |
298 | ||
299 | The fixup routine gets passed the remaining fragment of string being | |
300 | processed. It modifies it in place to remove bytes/characters it can | |
301 | understand and returns a string used to represent them. For example: | |
302 | ||
303 | sub fixup { | |
304 | my $ch = substr($_[0],0,1,''); | |
305 | return sprintf("\x{%02X}",ord($ch); | |
306 | } | |
307 | ||
308 | This scheme is close to how the underlying C code for Encode works, | |
309 | but gives the fixup routine very little context. | |
310 | ||
311 | =item Scheme 2 | |
312 | ||
313 | The fixup routine gets passed the original string, an index into | |
314 | it of the problem area, and the output string so far. It appends | |
315 | what it wants to the output string and returns a new index into the | |
316 | original string. For example: | |
317 | ||
318 | sub fixup { | |
319 | # my ($s,$i,$d) = @_; | |
320 | my $ch = substr($_[0],$_[1],1); | |
321 | $_[2] .= sprintf("\x{%02X}",ord($ch); | |
322 | return $_[1]+1; | |
323 | } | |
324 | ||
325 | This scheme gives maximal control to the fixup routine but is more | |
326 | complicated to code, and may require that the internals of Encode be tweaked to | |
327 | keep the original string intact. | |
328 | ||
329 | =item Other Schemes | |
330 | ||
331 | Hybrids of the above. | |
332 | ||
333 | Multiple return values rather than in-place modifications. | |
334 | ||
335 | Index into the string could be C<pos($str)> allowing C<s/\G...//>. | |
336 | ||
337 | =back | |
338 | ||
339 | =end future | |
340 | ||
341 | =cut |