Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | package Encode::Encoding; |
2 | # Base class for classes which implement encodings | |
3 | use strict; | |
4 | our $VERSION = do { my @r = (q$Revision: 1.30 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; | |
5 | ||
6 | require Encode; | |
7 | ||
# Define($canonical, ...)
#
# Registers an encoding with Encode under its canonical name.
#
#   invocant    - a class name or an existing object.  A class name is
#                 turned into a fresh object blessed into that class and
#                 holding { Name => $canonical }; an object is used as is.
#   $canonical  - the canonical encoding name.
#   ...         - any further arguments (e.g. alias names, as in the
#                 SYNOPSIS) are handed on untouched to
#                 Encode::define_encoding().
#
# Returns whatever Encode::define_encoding() returns.
sub Define {
    # Peel off the first two arguments; whatever remains in @_ is forwarded.
    my ( $invocant, $canonical ) = splice @_, 0, 2;

    my $encoding = ref $invocant
        ? $invocant
        : bless( { Name => $canonical }, $invocant );

    return Encode::define_encoding( $encoding, $canonical, @_ );
}
16 | ||
# name()
#
# Accessor: returns the value stored under the 'Name' key of the
# invocant's hash (the key that Define() fills with the canonical name).
sub name {
    my ($self) = @_;
    return $self->{'Name'};
}
# new_sequence()
#
# Placeholder hook for encodings that carry state: this base
# implementation simply hands back its invocant.
sub new_sequence {
    my ($self) = @_;
    return $self;
}
19 | ||
# needs_lines()
#
# Base-class default: always returns 0 (false).  Per this module's POD,
# an encoding that needs line buffering under PerlIO overrides this to
# return true.
sub needs_lines {
    return 0;
}
21 | ||
# perlio_ok()
#
# Reports whether the PerlIO::encoding module can be loaded.
# Returns 1 when the require succeeds and 0 when it fails.
#
# The verdict is taken from the eval block's own return value rather
# than from $@ afterwards: $@ is a global that can be reset between the
# failure and the check (e.g. by code run while the eval unwinds on
# older perls), which would turn a failed load into a false "ok".
# $@ is still left as the eval set it, as before.
sub perlio_ok {
    return eval { require PerlIO::encoding; 1 } ? 1 : 0;
}
26 | ||
27 | # Temporary legacy methods | |
# toUnicode(...)
#
# Legacy alias: forwards every argument to the invocant's decode()
# method and returns its result.
sub toUnicode {
    my $self = shift;

    # Pass @_ itself, not a copy, so decode() still receives aliases to
    # the caller's variables and can edit them in place.
    return $self->decode(@_);
}
# fromUnicode(...)
#
# Legacy alias: forwards every argument to the invocant's encode()
# method and returns its result.
sub fromUnicode {
    my $self = shift;

    # Pass @_ itself, not a copy, so encode() still receives aliases to
    # the caller's variables and can edit them in place.
    return $self->encode(@_);
}
30 | ||
31 | # | |
32 | # Needs to be overloaded or just croak | |
33 | # | |
34 | ||
# encode(...)
#
# Base-class fallback for subclasses that did not supply their own
# encode().  It never returns: it croaks with
# "<class>->encode() not defined!", where <class> is the invocant's
# class (or the invocant itself when called on a class name).
sub encode {
    my $invocant = shift;
    my $class    = ref($invocant) || $invocant;

    require Carp;    # loaded only on this error path
    Carp::croak( $class, "->encode() not defined!" );
}
41 | ||
# decode(...)
#
# Base-class fallback for subclasses that did not supply their own
# decode().  It never returns: it croaks with
# "<class>->decode() not defined!", where <class> is the invocant's
# class (or the invocant itself when called on a class name).
#
# The message previously named encode(), pointing anyone debugging a
# missing decode() at the wrong method.
sub decode {
    require Carp;    # loaded only on this error path
    my $obj   = shift;
    my $class = ref($obj) ? ref($obj) : $obj;
    Carp::croak( $class, "->decode() not defined!" );
}
48 | ||
# DESTROY()
#
# Deliberately empty destructor: nothing is done when an encoding object
# is destroyed.
# NOTE(review): presumably defined so that destruction never falls
# through to an AUTOLOAD in a subclass -- confirm before removing.
sub DESTROY {
    return;
}
50 | ||
51 | 1; | |
52 | __END__ | |
53 | ||
54 | =head1 NAME | |
55 | ||
56 | Encode::Encoding - Encode Implementation Base Class | |
57 | ||
58 | =head1 SYNOPSIS | |
59 | ||
60 | package Encode::MyEncoding; | |
61 | use base qw(Encode::Encoding); | |
62 | ||
63 | __PACKAGE__->Define(qw(myCanonical myAlias)); | |
64 | ||
65 | =head1 DESCRIPTION | |
66 | ||
67 | As mentioned in L<Encode>, encodings are (in the current | |
68 | implementation at least) defined as objects. The mapping of encoding | |
69 | name to object is via the C<%Encode::Encoding> hash. Though you can | |
70 | directly manipulate this hash, it is strongly encouraged to use this | |
71 | base class module and add encode() and decode() methods. | |
72 | ||
73 | =head2 Methods you should implement | |
74 | ||
75 | You are strongly encouraged to implement methods below, at least | |
76 | either encode() or decode(). | |
77 | ||
78 | =over 4 | |
79 | ||
80 | =item -E<gt>encode($string [,$check]) | |
81 | ||
82 | MUST return the octet sequence representing I<$string>. | |
83 | ||
84 | =over 2 | |
85 | ||
86 | =item * | |
87 | ||
88 | If I<$check> is true, it SHOULD modify I<$string> in place to remove | |
89 | the converted part (i.e. the whole string unless there is an error). | |
90 | If perlio_ok() is true, SHOULD becomes MUST. | |
91 | ||
92 | =item * | |
93 | ||
94 | If an error occurs, it SHOULD return the octet sequence for the | |
95 | fragment of string that has been converted and modify $string in-place | |
96 | to remove the converted part leaving it starting with the problem | |
97 | fragment. If perlio_ok() is true, SHOULD becomes MUST. | |
98 | ||
99 | =item * | |
100 | ||
101 | If I<$check> is false then C<encode> MUST make a "best effort" to | |
102 | convert the string - for example, by using a replacement character. | |
103 | ||
104 | =back | |
105 | ||
106 | =item -E<gt>decode($octets [,$check]) | |
107 | ||
108 | MUST return the string that I<$octets> represents. | |
109 | ||
110 | =over 2 | |
111 | ||
112 | =item * | |
113 | ||
114 | If I<$check> is true, it SHOULD modify I<$octets> in place to remove | |
115 | the converted part (i.e. the whole sequence unless there is an | |
116 | error). If perlio_ok() is true, SHOULD becomes MUST. | |
117 | ||
118 | =item * | |
119 | ||
120 | If an error occurs, it SHOULD return the fragment of string that has | |
121 | been converted and modify $octets in-place to remove the converted | |
122 | part leaving it starting with the problem fragment. If perlio_ok() is | |
123 | true, SHOULD becomes MUST. | |
124 | ||
125 | =item * | |
126 | ||
127 | If I<$check> is false then C<decode> should make a "best effort" to | |
128 | convert the string - for example by using Unicode's "\x{FFFD}" as a | |
129 | replacement character. | |
130 | ||
131 | =back | |
132 | ||
133 | =head2 Other methods defined in Encode::Encoding | |
134 | ||
135 | You do not need to override the methods shown below unless your encoding requires it. | |
136 | ||
137 | =over 4 | |
138 | ||
139 | =item -E<gt>name | |
140 | ||
141 | Predefined As: | |
142 | ||
143 | sub name { return shift->{'Name'} } | |
144 | ||
145 | MUST return the string representing the canonical name of the encoding. | |
146 | ||
147 | =item -E<gt>new_sequence | |
148 | ||
149 | Predefined As: | |
150 | ||
151 | sub new_sequence { return $_[0] } | |
152 | ||
153 | This is a placeholder for encodings with state. It should return an | |
154 | object which implements this interface. All current implementations | |
155 | return the original object. | |
156 | ||
157 | =item -E<gt>perlio_ok() | |
158 | ||
159 | Predefined As: | |
160 | ||
161 | sub perlio_ok { | |
162 | eval{ require PerlIO::encoding }; | |
163 | return $@ ? 0 : 1; | |
164 | } | |
165 | ||
166 | If your encoding does not support PerlIO for some reason, just define: | |
167 | ||
168 | sub perlio_ok { 0 } | |
169 | ||
170 | =item -E<gt>needs_lines() | |
171 | ||
172 | Predefined As: | |
173 | ||
174 | sub needs_lines { 0 }; | |
175 | ||
176 | If your encoding can work with PerlIO but needs line buffering, you | |
177 | MUST define this method so it returns true. 7bit ISO-2022 encodings | |
178 | are one example that needs this. When this method is missing, false | |
179 | is assumed. | |
180 | ||
181 | =back | |
182 | ||
183 | =head2 Example: Encode::ROT13 | |
184 | ||
185 | package Encode::ROT13; | |
186 | use strict; | |
187 | use base qw(Encode::Encoding); | |
188 | ||
189 | __PACKAGE__->Define('rot13'); | |
190 | ||
191 | sub encode($$;$){ | |
192 | my ($obj, $str, $chk) = @_; | |
193 | $str =~ tr/A-Za-z/N-ZA-Mn-za-m/; | |
194 | $_[1] = '' if $chk; # this is what in-place edit means | |
195 | return $str; | |
196 | } | |
197 | ||
198 | # Jr pna or ynml yvxr guvf; | |
199 | *decode = \&encode; | |
200 | ||
201 | 1; | |
202 | ||
203 | =head1 Why the heck Encode API is different? | |
204 | ||
205 | It should be noted that the I<$check> behaviour is different from the | |
206 | outer public API. The logic is that the "unchecked" case is useful | |
207 | when the encoding is part of a stream which may be reporting errors | |
208 | (e.g. STDERR). In such cases, it is desirable to get everything | |
209 | through somehow without causing additional errors which obscure the | |
210 | original one. Also, the encoding is best placed to know what the | |
211 | correct replacement character is, so if that is the desired behaviour | |
212 | then letting low level code do it is the most efficient. | |
213 | ||
214 | By contrast, if I<$check> is true, the scheme above allows the | |
215 | encoding to do as much as it can and tell the layer above how much | |
216 | that was. What is lacking at present is a mechanism to report what | |
217 | went wrong. The most likely interface will be an additional method | |
218 | call to the object, or perhaps (to avoid forcing per-stream objects | |
219 | on otherwise stateless encodings) an additional parameter. | |
220 | ||
221 | It is also highly desirable that encoding classes inherit from | |
222 | C<Encode::Encoding> as a base class. This allows that class to define | |
223 | additional behaviour for all encoding objects. | |
224 | ||
225 | package Encode::MyEncoding; | |
226 | use base qw(Encode::Encoding); | |
227 | ||
228 | __PACKAGE__->Define(qw(myCanonical myAlias)); | |
229 | ||
230 | This creates an object with C<< bless {Name => ...}, $class >> and calls | |
231 | C<Encode::define_encoding> on it. Such classes inherit their C<name> method from | |
232 | C<Encode::Encoding>. | |
233 | ||
234 | =head2 Compiled Encodings | |
235 | ||
236 | For the sake of speed and efficiency, most of the encodings are now | |
237 | supported via a I<compiled form>: XS modules generated from UCM | |
238 | files. Encode provides the enc2xs tool to achieve that. Please see | |
239 | L<enc2xs> for more details. | |
240 | ||
241 | =head1 SEE ALSO | |
242 | ||
243 | L<perlmod>, L<enc2xs> | |
244 | ||
245 | =begin future | |
246 | ||
247 | =over 4 | |
248 | ||
249 | =item Scheme 1 | |
250 | ||
251 | The fixup routine gets passed the remaining fragment of string being | |
252 | processed. It modifies it in place to remove bytes/characters it can | |
253 | understand and returns a string used to represent them. For example: | |
254 | ||
255 | sub fixup { | |
256 | my $ch = substr($_[0],0,1,''); | |
257 | return sprintf('\x{%02X}',ord($ch)); | |
258 | } | |
259 | ||
260 | This scheme is close to how the underlying C code for Encode works, | |
261 | but gives the fixup routine very little context. | |
262 | ||
263 | =item Scheme 2 | |
264 | ||
265 | The fixup routine gets passed the original string, an index into | |
266 | it of the problem area, and the output string so far. It appends | |
267 | what it wants to the output string and returns a new index into the | |
268 | original string. For example: | |
269 | ||
270 | sub fixup { | |
271 | # my ($s,$i,$d) = @_; | |
272 | my $ch = substr($_[0],$_[1],1); | |
273 | $_[2] .= sprintf('\x{%02X}',ord($ch)); | |
274 | return $_[1]+1; | |
275 | } | |
276 | ||
277 | This scheme gives maximal control to the fixup routine but is more | |
278 | complicated to code, and may require that the internals of Encode be tweaked to | |
279 | keep the original string intact. | |
280 | ||
281 | =item Other Schemes | |
282 | ||
283 | Hybrids of the above. | |
284 | ||
285 | Multiple return values rather than in-place modifications. | |
286 | ||
287 | Index into the string could be C<pos($str)> allowing C<s/\G...//>. | |
288 | ||
289 | =back | |
290 | ||
291 | =end future | |
292 | ||
293 | =cut |