Initial commit of OpenSPARC T2 architecture model.
[OpenSPARC-T2-SAM] / sam-t2 / devtools / amd64 / man / man3 / Tcl_CreateEncoding.3
CommitLineData
920dae64
AT
1'\"
2'\" Copyright (c) 1997-1998 Sun Microsystems, Inc.
3'\"
4'\" See the file "license.terms" for information on usage and redistribution
5'\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
6'\"
7'\" RCS: @(#) $Id: Encoding.3,v 1.11.2.1 2003/07/18 16:56:24 dgp Exp $
8'\"
9'\" The definitions below are for supplemental macros used in Tcl/Tk
10'\" manual entries.
11'\"
12'\" .AP type name in/out ?indent?
13'\" Start paragraph describing an argument to a library procedure.
14'\" type is type of argument (int, etc.), in/out is either "in", "out",
15'\" or "in/out" to describe whether procedure reads or modifies arg,
16'\" and indent is equivalent to second arg of .IP (shouldn't ever be
17'\" needed; use .AS below instead)
18'\"
19'\" .AS ?type? ?name?
20'\" Give maximum sizes of arguments for setting tab stops. Type and
21'\" name are examples of largest possible arguments that will be passed
22'\" to .AP later. If args are omitted, default tab stops are used.
23'\"
24'\" .BS
25'\" Start box enclosure. From here until next .BE, everything will be
26'\" enclosed in one large box.
27'\"
28'\" .BE
29'\" End of box enclosure.
30'\"
31'\" .CS
32'\" Begin code excerpt.
33'\"
34'\" .CE
35'\" End code excerpt.
36'\"
37'\" .VS ?version? ?br?
38'\" Begin vertical sidebar, for use in marking newly-changed parts
39'\" of man pages. The first argument is ignored and used for recording
40'\" the version when the .VS was added, so that the sidebars can be
41'\" found and removed when they reach a certain age. If another argument
42'\" is present, then a line break is forced before starting the sidebar.
43'\"
44'\" .VE
45'\" End of vertical sidebar.
46'\"
47'\" .DS
48'\" Begin an indented unfilled display.
49'\"
50'\" .DE
51'\" End of indented unfilled display.
52'\"
53'\" .SO
54'\" Start of list of standard options for a Tk widget. The
55'\" options follow on successive lines, in four columns separated
56'\" by tabs.
57'\"
58'\" .SE
59'\" End of list of standard options for a Tk widget.
60'\"
61'\" .OP cmdName dbName dbClass
62'\" Start of description of a specific option. cmdName gives the
63'\" option's name as specified in the class command, dbName gives
64'\" the option's name in the option database, and dbClass gives
65'\" the option's class in the option database.
66'\"
67'\" .UL arg1 arg2
68'\" Print arg1 underlined, then print arg2 normally.
69'\"
70'\" RCS: @(#) $Id: man.macros,v 1.4 2000/08/25 06:18:32 ericm Exp $
71'\"
72'\" # Set up traps and other miscellaneous stuff for Tcl/Tk man pages.
73.if t .wh -1.3i ^B
74.nr ^l \n(.l
75.ad b
76'\" # Start an argument description
77.de AP
78.ie !"\\$4"" .TP \\$4
79.el \{\
80. ie !"\\$2"" .TP \\n()Cu
81. el .TP 15
82.\}
83.ta \\n()Au \\n()Bu
84.ie !"\\$3"" \{\
85\&\\$1 \\fI\\$2\\fP (\\$3)
86.\".b
87.\}
88.el \{\
89.br
90.ie !"\\$2"" \{\
91\&\\$1 \\fI\\$2\\fP
92.\}
93.el \{\
94\&\\fI\\$1\\fP
95.\}
96.\}
97..
98'\" # define tabbing values for .AP
99.de AS
100.nr )A 10n
101.if !"\\$1"" .nr )A \\w'\\$1'u+3n
102.nr )B \\n()Au+15n
103.\"
104.if !"\\$2"" .nr )B \\w'\\$2'u+\\n()Au+3n
105.nr )C \\n()Bu+\\w'(in/out)'u+2n
106..
107.AS Tcl_Interp Tcl_CreateInterp in/out
108'\" # BS - start boxed text
109'\" # ^y = starting y location
110'\" # ^b = 1
111.de BS
112.br
113.mk ^y
114.nr ^b 1u
115.if n .nf
116.if n .ti 0
117.if n \l'\\n(.lu\(ul'
118.if n .fi
119..
120'\" # BE - end boxed text (draw box now)
121.de BE
122.nf
123.ti 0
124.mk ^t
125.ie n \l'\\n(^lu\(ul'
126.el \{\
127.\" Draw four-sided box normally, but don't draw top of
128.\" box if the box started on an earlier page.
129.ie !\\n(^b-1 \{\
130\h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul'
131.\}
132.el \}\
133\h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul'
134.\}
135.\}
136.fi
137.br
138.nr ^b 0
139..
140'\" # VS - start vertical sidebar
141'\" # ^Y = starting y location
142'\" # ^v = 1 (for troff; for nroff this doesn't matter)
143.de VS
144.if !"\\$2"" .br
145.mk ^Y
146.ie n 'mc \s12\(br\s0
147.el .nr ^v 1u
148..
149'\" # VE - end of vertical sidebar
150.de VE
151.ie n 'mc
152.el \{\
153.ev 2
154.nf
155.ti 0
156.mk ^t
157\h'|\\n(^lu+3n'\L'|\\n(^Yu-1v\(bv'\v'\\n(^tu+1v-\\n(^Yu'\h'-|\\n(^lu+3n'
158.sp -1
159.fi
160.ev
161.\}
162.nr ^v 0
163..
164'\" # Special macro to handle page bottom: finish off current
165'\" # box/sidebar if in box/sidebar mode, then invoke the standard
166'\" # page bottom macro.
167.de ^B
168.ev 2
169'ti 0
170'nf
171.mk ^t
172.if \\n(^b \{\
173.\" Draw three-sided box if this is the box's first page,
174.\" draw two sides but no top otherwise.
175.ie !\\n(^b-1 \h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c
176.el \h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c
177.\}
178.if \\n(^v \{\
179.nr ^x \\n(^tu+1v-\\n(^Yu
180\kx\h'-\\nxu'\h'|\\n(^lu+3n'\ky\L'-\\n(^xu'\v'\\n(^xu'\h'|0u'\c
181.\}
182.bp
183'fi
184.ev
185.if \\n(^b \{\
186.mk ^y
187.nr ^b 2
188.\}
189.if \\n(^v \{\
190.mk ^Y
191.\}
192..
193'\" # DS - begin display
194.de DS
195.RS
196.nf
197.sp
198..
199'\" # DE - end display
200.de DE
201.fi
202.RE
203.sp
204..
205'\" # SO - start of list of standard options
206.de SO
207.SH "STANDARD OPTIONS"
208.LP
209.nf
210.ta 5.5c 11c
211.ft B
212..
213'\" # SE - end of list of standard options
214.de SE
215.fi
216.ft R
217.LP
218See the \\fBoptions\\fR manual entry for details on the standard options.
219..
220'\" # OP - start of full description for a single option
221.de OP
222.LP
223.nf
224.ta 4c
225Command-Line Name: \\fB\\$1\\fR
226Database Name: \\fB\\$2\\fR
227Database Class: \\fB\\$3\\fR
228.fi
229.IP
230..
231'\" # CS - begin code excerpt
232.de CS
233.RS
234.nf
235.ta .25i .5i .75i 1i
236..
237'\" # CE - end code excerpt
238.de CE
239.fi
240.RE
241..
242.de UL
243\\$1\l'|0\(ul'\\$2
244..
245.TH Tcl_GetEncoding 3 "8.1" Tcl "Tcl Library Procedures"
246.BS
247.SH NAME
248Tcl_GetEncoding, Tcl_FreeEncoding, Tcl_ExternalToUtfDString, Tcl_ExternalToUtf, Tcl_UtfToExternalDString, Tcl_UtfToExternal, Tcl_WinTCharToUtf, Tcl_WinUtfToTChar, Tcl_GetEncodingName, Tcl_SetSystemEncoding, Tcl_GetEncodingNames, Tcl_CreateEncoding, Tcl_GetDefaultEncodingDir, Tcl_SetDefaultEncodingDir \- procedures for creating and using encodings.
249.SH SYNOPSIS
250.nf
251\fB#include <tcl.h>\fR
252.sp
253Tcl_Encoding
254\fBTcl_GetEncoding\fR(\fIinterp, name\fR)
255.sp
256void
257\fBTcl_FreeEncoding\fR(\fIencoding\fR)
258.sp
259char *
260\fBTcl_ExternalToUtfDString\fR(\fIencoding, src, srcLen, dstPtr\fR)
261.sp
262int
263\fBTcl_ExternalToUtf\fR(\fIinterp, encoding, src, srcLen, flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
264 dstCharsPtr\fR)
265.sp
266char *
267\fBTcl_UtfToExternalDString\fR(\fIencoding, src, srcLen, dstPtr\fR)
268.sp
269int
270\fBTcl_UtfToExternal\fR(\fIinterp, encoding, src, srcLen, flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
271 dstCharsPtr\fR)
272.sp
273char *
274\fBTcl_WinTCharToUtf\fR(\fItsrc, srcLen, dstPtr\fR)
275.sp
276TCHAR *
277\fBTcl_WinUtfToTChar\fR(\fIsrc, srcLen, dstPtr\fR)
278.sp
279CONST char *
280\fBTcl_GetEncodingName\fR(\fIencoding\fR)
281.sp
282int
283\fBTcl_SetSystemEncoding\fR(\fIinterp, name\fR)
284.sp
285void
286\fBTcl_GetEncodingNames\fR(\fIinterp\fR)
287.sp
288Tcl_Encoding
289\fBTcl_CreateEncoding\fR(\fItypePtr\fR)
290.sp
291CONST char *
292\fBTcl_GetDefaultEncodingDir\fR(\fIvoid\fR)
293.sp
294void
295\fBTcl_SetDefaultEncodingDir\fR(\fIpath\fR)
296
297
298.SH ARGUMENTS
299.AS Tcl_EncodingState *dstWrotePtr
300.AP Tcl_Interp *interp in
301Interpreter to use for error reporting, or NULL if no error reporting is
302desired.
303.AP "CONST char" *name in
304Name of encoding to load.
305.AP Tcl_Encoding encoding in
306The encoding to query, free, or use for converting text. If \fIencoding\fR is
307NULL, the current system encoding is used.
308.AP "CONST char" *src in
309For the \fBTcl_ExternalToUtf\fR functions, an array of bytes in the
310specified encoding that are to be converted to UTF-8. For the
311\fBTcl_UtfToExternal\fR and \fBTcl_WinUtfToTChar\fR functions, an array of
312UTF-8 characters to be converted to the specified encoding.
313.AP "CONST TCHAR" *tsrc in
314An array of Windows TCHAR characters to convert to UTF-8.
315.AP int srcLen in
316Length of \fIsrc\fR or \fItsrc\fR in bytes. If the length is negative, the
317encoding-specific length of the string is used.
318.AP Tcl_DString *dstPtr out
319Pointer to an uninitialized or free \fBTcl_DString\fR in which the converted
320result will be stored.
321.AP int flags in
322Various flag bits OR-ed together.
323TCL_ENCODING_START signifies that the
324source buffer is the first block in a (potentially multi-block) input
325stream, telling the conversion routine to reset to an initial state and
326perform any initialization that needs to occur before the first byte is
327converted. TCL_ENCODING_END signifies that the source buffer is the last
328block in a (potentially multi-block) input stream, telling the conversion
329routine to perform any finalization that needs to occur after the last
330byte is converted and then to reset to an initial state.
331TCL_ENCODING_STOPONERROR signifies that the conversion routine should
332return immediately upon reading a source character that doesn't exist in
333the target encoding; otherwise a default fallback character will
334automatically be substituted.
335.AP Tcl_EncodingState *statePtr in/out
336Used when converting a (generally long or indefinite length) byte stream
337in a piece by piece fashion. The conversion routine stores its current
338state in \fI*statePtr\fR after \fIsrc\fR (the buffer containing the
339current piece) has been converted; that state information must be passed
340back when converting the next piece of the stream so the conversion
341routine knows what state it was in when it left off at the end of the
342last piece. May be NULL, in which case the value specified for \fIflags\fR
343is ignored and the source buffer is assumed to contain the complete string to
344convert.
345.AP char *dst out
346Buffer in which the converted result will be stored. No more than
347\fIdstLen\fR bytes will be stored in \fIdst\fR.
348.AP int dstLen in
349The maximum length of the output buffer \fIdst\fR in bytes.
350.AP int *srcReadPtr out
351Filled with the number of bytes from \fIsrc\fR that were actually
352converted. This may be less than the original source length if there was
353a problem converting some source characters. May be NULL.
354.AP int *dstWrotePtr out
355Filled with the number of bytes that were actually stored in the output
356buffer as a result of the conversion. May be NULL.
357.AP int *dstCharsPtr out
358Filled with the number of characters that correspond to the number of bytes
359stored in the output buffer. May be NULL.
360.AP Tcl_EncodingType *typePtr in
361Structure that defines a new type of encoding.
362.AP "CONST char" *path in
363A path to the location of the encoding file.
364.BE
365.SH INTRODUCTION
366.PP
367These routines convert between Tcl's internal character representation,
368UTF-8, and character representations used by various operating systems or
369file systems, such as Unicode, ASCII, or Shift-JIS. When operating on
370strings, such as obtaining the names of files or displaying
371characters using international fonts, the strings must be translated into
372one or possibly multiple formats that the various system calls can use. For
373instance, on a Japanese Unix workstation, a user might obtain a filename
374represented in the EUC-JP file encoding and then translate the characters to
375the jisx0208 font encoding in order to display the filename in a Tk widget.
376The purpose of the encoding package is to help bridge the translation gap.
377UTF-8 provides an intermediate staging ground for all the various
378encodings. In the example above, text would be translated into UTF-8 from
379whatever file encoding the operating system is using. Then it would be
380translated from UTF-8 into whatever font encoding the display routines
381require.
382.PP
383Some basic encodings are compiled into Tcl. Others can be defined by the
384user or dynamically loaded from encoding files in a
385platform-independent manner.
386.SH DESCRIPTION
387.PP
388\fBTcl_GetEncoding\fR finds an encoding given its \fIname\fR. The name may
389refer to a builtin Tcl encoding, a user-defined encoding registered by
390calling \fBTcl_CreateEncoding\fR, or a dynamically-loadable encoding
391file. The return value is a token that represents the encoding and can be
392used in subsequent calls to procedures such as \fBTcl_GetEncodingName\fR,
393\fBTcl_FreeEncoding\fR, and \fBTcl_UtfToExternal\fR. If the name did not
394refer to any known or loadable encoding, NULL is returned and an error
395message is returned in \fIinterp\fR.
396.PP
397The encoding package maintains a database of all encodings currently in use.
398The first time \fIname\fR is seen, \fBTcl_GetEncoding\fR returns an
399encoding with a reference count of 1. If the same \fIname\fR is requested
400further times, then the reference count for that encoding is incremented
401without the overhead of allocating a new encoding and all its associated
402data structures.
403.PP
404When an \fIencoding\fR is no longer needed, \fBTcl_FreeEncoding\fR
405should be called to release it. When an \fIencoding\fR is no longer in use
406anywhere (i.e., it has been freed as many times as it has been gotten)
407\fBTcl_FreeEncoding\fR will release all storage the encoding was using
408and delete it from the database.
409.PP
410\fBTcl_ExternalToUtfDString\fR converts a source buffer \fIsrc\fR from the
411specified \fIencoding\fR into UTF-8. The converted bytes are stored in
412\fIdstPtr\fR, which is then null-terminated. The caller should eventually
413call \fBTcl_DStringFree\fR to free any information stored in \fIdstPtr\fR.
414When converting, if any of the characters in the source buffer cannot be
415represented in the target encoding, a default fallback character will be
416used. The return value is a pointer to the value stored in the DString.
417.PP
418\fBTcl_ExternalToUtf\fR converts a source buffer \fIsrc\fR from the specified
419\fIencoding\fR into UTF-8. Up to \fIsrcLen\fR bytes are converted from the
420source buffer and up to \fIdstLen\fR converted bytes are stored in \fIdst\fR.
421In all cases, \fI*srcReadPtr\fR is filled with the number of bytes that were
422successfully converted from \fIsrc\fR and \fI*dstWrotePtr\fR is filled with
423the corresponding number of bytes that were stored in \fIdst\fR. The return
424value is one of the following:
425.RS
426.IP \fBTCL_OK\fR 29
427All bytes of \fIsrc\fR were converted.
428.IP \fBTCL_CONVERT_NOSPACE\fR 29
429The destination buffer was not large enough for all of the converted data; as
430many characters as could fit were converted though.
431.IP \fBTCL_CONVERT_MULTIBYTE\fR 29
432The last few bytes in the source buffer were the beginning of a multibyte
433sequence, but more bytes were needed to complete this sequence. A
434subsequent call to the conversion routine should pass a buffer containing
435the unconverted bytes that remained in \fIsrc\fR plus some further bytes
436from the source stream to properly convert the formerly split-up multibyte
437sequence.
438.IP \fBTCL_CONVERT_SYNTAX\fR 29
439The source buffer contained an invalid character sequence. This may occur
440if the input stream has been damaged or if the input encoding method was
441misidentified.
442.IP \fBTCL_CONVERT_UNKNOWN\fR 29
443The source buffer contained a character that could not be represented in
444the target encoding and TCL_ENCODING_STOPONERROR was specified.
445.RE
446.LP
447\fBTcl_UtfToExternalDString\fR converts a source buffer \fIsrc\fR from UTF-8
448into the specified \fIencoding\fR. The converted bytes are stored in
449\fIdstPtr\fR, which is then terminated with the appropriate encoding-specific
450null. The caller should eventually call \fBTcl_DStringFree\fR to free any
451information stored in \fIdstPtr\fR. When converting, if any of the
452characters in the source buffer cannot be represented in the target
453encoding, a default fallback character will be used. The return value is
454a pointer to the value stored in the DString.
455.PP
456\fBTcl_UtfToExternal\fR converts a source buffer \fIsrc\fR from UTF-8 into
457the specified \fIencoding\fR. Up to \fIsrcLen\fR bytes are converted from
458the source buffer and up to \fIdstLen\fR converted bytes are stored in
459\fIdst\fR. In all cases, \fI*srcReadPtr\fR is filled with the number of
460bytes that were successfully converted from \fIsrc\fR and \fI*dstWrotePtr\fR
461is filled with the corresponding number of bytes that were stored in
462\fIdst\fR. The return values are the same as the return values for
463\fBTcl_ExternalToUtf\fR.
464.PP
465\fBTcl_WinUtfToTChar\fR and \fBTcl_WinTCharToUtf\fR are
466Windows-only convenience
467functions for converting between UTF-8 and Windows strings. On Windows 95
468(as with the Macintosh and Unix operating systems),
469all strings exchanged between Tcl and the operating system are "char"
470based. On Windows NT, some strings exchanged between Tcl and the
471operating system are "char" oriented while others are in Unicode. By
472convention, in Windows a TCHAR is a character in the ANSI code page
473on Windows 95 and a Unicode character on Windows NT.
474.PP
475If you planned to use the same "char" based interfaces on both Windows
47695 and Windows NT, you could use \fBTcl_UtfToExternal\fR and
477\fBTcl_ExternalToUtf\fR (or their \fBTcl_DString\fR equivalents) with an
478encoding of NULL (the current system encoding). On the other hand,
479if you planned to use the Unicode interface when running on Windows NT
480and the "char" interfaces when running on Windows 95, you would have
481to perform the following type of test over and over in your program
482(as represented in pseudo-code):
483.CS
484if (running NT) {
485 encoding <- Tcl_GetEncoding("unicode");
486 nativeBuffer <- Tcl_UtfToExternal(encoding, utfBuffer);
487 Tcl_FreeEncoding(encoding);
488} else {
489 nativeBuffer <- Tcl_UtfToExternal(NULL, utfBuffer);
}
490.CE
491\fBTcl_WinUtfToTChar\fR and \fBTcl_WinTCharToUtf\fR automatically
492handle this test and use the proper encoding based on the current
493operating system. \fBTcl_WinUtfToTChar\fR returns a pointer to
494a TCHAR string, and \fBTcl_WinTCharToUtf\fR expects a TCHAR string
495pointer as the \fItsrc\fR string. Otherwise, these functions
496behave identically to \fBTcl_UtfToExternalDString\fR and
497\fBTcl_ExternalToUtfDString\fR.
498.PP
499\fBTcl_GetEncodingName\fR is roughly the inverse of \fBTcl_GetEncoding\fR.
500Given an \fIencoding\fR, the return value is the \fIname\fR argument that
501was used to create the encoding. The string returned by
502\fBTcl_GetEncodingName\fR is only guaranteed to persist until the
503\fIencoding\fR is deleted. The caller must not modify this string.
504.PP
505\fBTcl_SetSystemEncoding\fR sets the default encoding that should be used
506whenever the user passes a NULL value for the \fIencoding\fR argument to
507any of the other encoding functions. If \fIname\fR is NULL, the system
508encoding is reset to the default system encoding, \fBbinary\fR. If the
509name did not refer to any known or loadable encoding, TCL_ERROR is
510returned and an error message is left in \fIinterp\fR. Otherwise, this
511procedure increments the reference count of the new system encoding,
512decrements the reference count of the old system encoding, and returns
513TCL_OK.
514.PP
515\fBTcl_GetEncodingNames\fR sets the \fIinterp\fR result to a list
516consisting of the names of all the encodings that are currently defined
517or can be dynamically loaded, searching the encoding path specified by
518\fBTcl_SetDefaultEncodingDir\fR. This procedure does not ensure that the
519dynamically-loadable encoding files contain valid data, but merely that they
520exist.
521.PP
522\fBTcl_CreateEncoding\fR defines a new encoding and registers the C
523procedures that are called back to convert between the encoding and
524UTF-8. Encodings created by \fBTcl_CreateEncoding\fR are thereafter
525visible in the database used by \fBTcl_GetEncoding\fR. Just as with the
526\fBTcl_GetEncoding\fR procedure, the return value is a token that
527represents the encoding and can be used in subsequent calls to other
528encoding functions. \fBTcl_CreateEncoding\fR returns an encoding with a
529reference count of 1. If an encoding with the specified \fIname\fR
530already exists, then its entry in the database is replaced with the new
531encoding; the token for the old encoding will remain valid and continue
532to behave as before, but users of the new token will now call the new
533encoding procedures.
534.PP
535The \fItypePtr\fR argument to \fBTcl_CreateEncoding\fR contains information
536about the name of the encoding and the procedures that will be called to
537convert between this encoding and UTF-8. It is defined as follows:
538.PP
539.CS
540typedef struct Tcl_EncodingType {
541 CONST char *\fIencodingName\fR;
542 Tcl_EncodingConvertProc *\fItoUtfProc\fR;
543 Tcl_EncodingConvertProc *\fIfromUtfProc\fR;
544 Tcl_EncodingFreeProc *\fIfreeProc\fR;
545 ClientData \fIclientData\fR;
546 int \fInullSize\fR;
547} Tcl_EncodingType;
548.CE
549.PP
550The \fIencodingName\fR provides a string name for the encoding, by
551which it can be referred to in other procedures such as
552\fBTcl_GetEncoding\fR. The \fItoUtfProc\fR refers to a callback
553procedure to invoke to convert text from this encoding into UTF-8.
554The \fIfromUtfProc\fR refers to a callback procedure to invoke to
555convert text from UTF-8 into this encoding. The \fIfreeProc\fR refers
556to a callback procedure to invoke when this encoding is deleted. The
557\fIfreeProc\fR field may be NULL. The \fIclientData\fR contains an
558arbitrary one-word value passed to \fItoUtfProc\fR, \fIfromUtfProc\fR,
559and \fIfreeProc\fR whenever they are called. Typically, this is a
560pointer to a data structure containing encoding-specific information
561that can be used by the callback procedures. For instance, two very
562similar encodings such as \fBascii\fR and \fBmacRoman\fR may use the
563same callback procedure, but use different values of \fIclientData\fR
564to control its behavior. The \fInullSize\fR specifies the number of
565zero bytes that signify end-of-string in this encoding. It must be
566\fB1\fR (for single-byte or multi-byte encodings like ASCII or
567Shift-JIS) or \fB2\fR (for double-byte encodings like Unicode).
568Constant-sized encodings with 3 or more bytes per character (such as
569CNS11643) are not accepted.
570.PP
571The callback procedures \fItoUtfProc\fR and \fIfromUtfProc\fR should match the
572type \fBTcl_EncodingConvertProc\fR:
573.PP
574.CS
575typedef int Tcl_EncodingConvertProc(
576 ClientData \fIclientData\fR,
577 CONST char *\fIsrc\fR,
578 int \fIsrcLen\fR,
579 int \fIflags\fR,
580 Tcl_EncodingState *\fIstatePtr\fR,
581 char *\fIdst\fR,
582 int \fIdstLen\fR,
583 int *\fIsrcReadPtr\fR,
584 int *\fIdstWrotePtr\fR,
585 int *\fIdstCharsPtr\fR);
586.CE
587.PP
588The \fItoUtfProc\fR and \fIfromUtfProc\fR procedures are called by the
589\fBTcl_ExternalToUtf\fR or \fBTcl_UtfToExternal\fR family of functions to
590perform the actual conversion. The \fIclientData\fR parameter to these
591procedures is the same as the \fIclientData\fR field specified to
592\fBTcl_CreateEncoding\fR when the encoding was created. The remaining
593arguments to the callback procedures are the same as the arguments,
594documented at the top, to \fBTcl_ExternalToUtf\fR or
595\fBTcl_UtfToExternal\fR, with the following exceptions. If the
596\fIsrcLen\fR argument to one of those high-level functions is negative,
597the value passed to the callback procedure will be the appropriate
598encoding-specific string length of \fIsrc\fR. If any of the \fIsrcReadPtr\fR,
599\fIdstWrotePtr\fR, or \fIdstCharsPtr\fR arguments to one of the high-level
600functions is NULL, the corresponding value passed to the callback
601procedure will be a non-NULL location.
602.PP
603The callback procedure \fIfreeProc\fR, if non-NULL, should match the type
604\fBTcl_EncodingFreeProc\fR:
605.CS
606typedef void Tcl_EncodingFreeProc(
607 ClientData \fIclientData\fR);
608.CE
609.PP
610This \fIfreeProc\fR function is called when the encoding is deleted. The
611\fIclientData\fR parameter is the same as the \fIclientData\fR field
612specified to \fBTcl_CreateEncoding\fR when the encoding was created.
613.PP
614
615\fBTcl_GetDefaultEncodingDir\fR and \fBTcl_SetDefaultEncodingDir\fR
616access and set the directory to use when locating the default encoding
617files. If this value is not NULL, the \fBTclpInitLibraryPath\fR routine
618adds the path to the head of the search path, and uses this path as
619the first place to look into when trying to locate the encoding file.
620
621.SH "ENCODING FILES"
622Space would prohibit precompiling into Tcl every possible encoding
623algorithm, so many encodings are stored on disk as dynamically-loadable
624encoding files. This behavior also allows the user to create additional
625encoding files that can be loaded using the same mechanism. These
626encoding files contain information about the tables and/or escape
627sequences used to map between an external encoding and Unicode. The
628external encoding may consist of single-byte, multi-byte, or double-byte
629characters.
630.PP
631Each dynamically-loadable encoding is represented as a text file. The
632initial line of the file, beginning with a ``#'' symbol, is a comment
633that provides a human-readable description of the file. The next line
634identifies the type of encoding file. It can be one of the following
635letters:
636.IP "[1] \fBS\fR"
637A single-byte encoding, where one character is always one byte long in the
638encoding. An example is \fBiso8859-1\fR, used by many European languages.
639.IP "[2] \fBD\fR"
640A double-byte encoding, where one character is always two bytes long in the
641encoding. An example is \fBbig5\fR, used for Chinese text.
642.IP "[3] \fBM\fR"
643A multi-byte encoding, where one character may be either one or two bytes long.
644Certain bytes are lead bytes, indicating that another byte must follow
645and that together the two bytes represent one character. Other bytes are not
646lead bytes and represent themselves. An example is \fBshiftjis\fR, used by
647many Japanese computers.
648.IP "[4] \fBE\fR"
649An escape-sequence encoding, specifying that certain sequences of bytes
650do not represent characters, but commands that describe how following bytes
651should be interpreted.
652.PP
653The rest of the lines in the file depend on the type.
654.PP
655Cases [1], [2], and [3] are collectively referred to as table-based encoding
656files. The lines in a table-based encoding file are in the same
657format as this example taken from the \fBshiftjis\fR encoding (this is not
658the complete file):
659.CS
660# Encoding file: shiftjis, multi-byte
661M
662003F 0 40
66300
6640000000100020003000400050006000700080009000A000B000C000D000E000F
6650010001100120013001400150016001700180019001A001B001C001D001E001F
6660020002100220023002400250026002700280029002A002B002C002D002E002F
6670030003100320033003400350036003700380039003A003B003C003D003E003F
6680040004100420043004400450046004700480049004A004B004C004D004E004F
6690050005100520053005400550056005700580059005A005B005C005D005E005F
6700060006100620063006400650066006700680069006A006B006C006D006E006F
6710070007100720073007400750076007700780079007A007B007C007D203E007F
6720080000000000000000000000000000000000000000000000000000000000000
6730000000000000000000000000000000000000000000000000000000000000000
6740000FF61FF62FF63FF64FF65FF66FF67FF68FF69FF6AFF6BFF6CFF6DFF6EFF6F
675FF70FF71FF72FF73FF74FF75FF76FF77FF78FF79FF7AFF7BFF7CFF7DFF7EFF7F
676FF80FF81FF82FF83FF84FF85FF86FF87FF88FF89FF8AFF8BFF8CFF8DFF8EFF8F
677FF90FF91FF92FF93FF94FF95FF96FF97FF98FF99FF9AFF9BFF9CFF9DFF9EFF9F
6780000000000000000000000000000000000000000000000000000000000000000
6790000000000000000000000000000000000000000000000000000000000000000
68081
6810000000000000000000000000000000000000000000000000000000000000000
6820000000000000000000000000000000000000000000000000000000000000000
6830000000000000000000000000000000000000000000000000000000000000000
6840000000000000000000000000000000000000000000000000000000000000000
685300030013002FF0CFF0E30FBFF1AFF1BFF1FFF01309B309C00B4FF4000A8FF3E
686FFE3FF3F30FD30FE309D309E30034EDD30053006300730FC20152010FF0F005C
687301C2016FF5C2026202520182019201C201DFF08FF0930143015FF3BFF3DFF5B
688FF5D30083009300A300B300C300D300E300F30103011FF0B221200B100D70000
68900F7FF1D2260FF1CFF1E22662267221E22342642264000B0203220332103FFE5
690FF0400A200A3FF05FF03FF06FF0AFF2000A72606260525CB25CF25CE25C725C6
69125A125A025B325B225BD25BC203B301221922190219121933013000000000000
692000000000000000000000000000000002208220B2286228722822283222A2229
693000000000000000000000000000000002227222800AC21D221D4220022030000
6940000000000000000000000000000000000000000222022A52312220222072261
6952252226A226B221A223D221D2235222B222C0000000000000000000000000000
696212B2030266F266D266A2020202100B6000000000000000025EF000000000000
697.CE
698.PP
699The third line of the file is three numbers. The first number is the
700fallback character (in base 16) to use when converting from UTF-8 to this
701encoding. The second number is a \fB1\fR if this file represents the
702encoding for a symbol font, or \fB0\fR otherwise. The last number (in base
70310) is how many pages of data follow.
704.PP
705Subsequent lines in the example above are pages that describe how to map
706from the encoding into 2-byte Unicode. The first line in a page identifies
707the page number. Following it are 256 double-byte numbers, arranged as 16
708rows of 16 numbers. Given a character in the encoding, the high byte of
709that character is used to select which page, and the low byte of that
710character is used as an index to select one of the double-byte numbers in
711that page \- the value obtained being the corresponding Unicode character.
712By examination of the example above, one can see that the characters 0x7E
713and 0x8163 in \fBshiftjis\fR map to 203E and 2026 in Unicode, respectively.
714.PP
715Following the first page will be all the other pages, each in the same
716format as the first: one number identifying the page followed by 256
717double-byte Unicode characters. If a character in the encoding maps to the
718Unicode character 0000, it means that the character doesn't actually exist.
719If all characters on a page would map to 0000, that page can be omitted.
720.PP
721Case [4] is the escape-sequence encoding file. The lines in this type of
722file are in the same format as this example taken from the \fBiso2022-jp\fR
723encoding:
724.CS
725.ta 1.5i
726# Encoding file: iso2022-jp, escape-driven
727E
728init {}
729final {}
730iso8859-1 \\x1b(B
731jis0201 \\x1b(J
732jis0208 \\x1b$@
733jis0208 \\x1b$B
734jis0212 \\x1b$(D
735gb2312 \\x1b$A
736ksc5601 \\x1b$(C
737.CE
738.PP
739In the file, the first column represents an option and the second column
740is the associated value. \fBinit\fR is a string to emit or expect before
741the first character is converted, while \fBfinal\fR is a string to emit
742or expect after the last character. All other options are names of
743table-based encodings; the associated value is the escape-sequence that
744marks that encoding. Tcl syntax is used for the values; in the above
745example, for instance, ``\fB{}\fR'' represents the empty string and
746``\fB\\x1b\fR'' represents character 27.
747.PP
748When \fBTcl_GetEncoding\fR encounters an encoding \fIname\fR that has not
749been loaded, it attempts to load an encoding file called \fIname\fB.enc\fR
750from the \fBencoding\fR subdirectory of each directory specified in the
751library path \fB$tcl_libPath\fR. If the encoding file exists, but is
752malformed, an error message will be left in \fIinterp\fR.
753.SH KEYWORDS
754utf, encoding, convert
755
756
757