Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / man / man3 / Unicode::Normalize.3
CommitLineData
86530b38
AT
1.\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13
2.\"
3.\" Standard preamble:
4.\" ========================================================================
5.de Sh \" Subsection heading
6.br
7.if t .Sp
8.ne 5
9.PP
10\fB\\$1\fR
11.PP
12..
13.de Sp \" Vertical space (when we can't use .PP)
14.if t .sp .5v
15.if n .sp
16..
17.de Vb \" Begin verbatim text
18.ft CW
19.nf
20.ne \\$1
21..
22.de Ve \" End verbatim text
23.ft R
24.fi
25..
26.\" Set up some character translations and predefined strings. \*(-- will
27.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
28.\" double quote, and \*(R" will give a right double quote. | will give a
29.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
30.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
31.\" expand to `' in nroff, nothing in troff, for use with C<>.
32.tr \(*W-|\(bv\*(Tr
33.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
34.ie n \{\
35. ds -- \(*W-
36. ds PI pi
37. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
38. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
39. ds L" ""
40. ds R" ""
41. ds C` ""
42. ds C' ""
43'br\}
44.el\{\
45. ds -- \|\(em\|
46. ds PI \(*p
47. ds L" ``
48. ds R" ''
49'br\}
50.\"
51.\" If the F register is turned on, we'll generate index entries on stderr for
52.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
53.\" entries marked with X<> in POD. Of course, you'll have to process the
54.\" output yourself in some meaningful fashion.
55.if \nF \{\
56. de IX
57. tm Index:\\$1\t\\n%\t"\\$2"
58..
59. nr % 0
60. rr F
61.\}
62.\"
63.\" For nroff, turn off justification. Always turn off hyphenation; it makes
64.\" way too many mistakes in technical documents.
65.hy 0
66.if n .na
67.\"
68.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
69.\" Fear. Run. Save yourself. No user-serviceable parts.
70. \" fudge factors for nroff and troff
71.if n \{\
72. ds #H 0
73. ds #V .8m
74. ds #F .3m
75. ds #[ \f1
76. ds #] \fP
77.\}
78.if t \{\
79. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
80. ds #V .6m
81. ds #F 0
82. ds #[ \&
83. ds #] \&
84.\}
85. \" simple accents for nroff and troff
86.if n \{\
87. ds ' \&
88. ds ` \&
89. ds ^ \&
90. ds , \&
91. ds ~ ~
92. ds /
93.\}
94.if t \{\
95. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
96. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
97. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
98. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
99. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
100. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
101.\}
102. \" troff and (daisy-wheel) nroff accents
103.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
104.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
105.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
106.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
107.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
108.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
109.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
110.ds ae a\h'-(\w'a'u*4/10)'e
111.ds Ae A\h'-(\w'A'u*4/10)'E
112. \" corrections for vroff
113.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
114.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
115. \" for low resolution devices (crt and lpr)
116.if \n(.H>23 .if \n(.V>19 \
117\{\
118. ds : e
119. ds 8 ss
120. ds o a
121. ds d- d\h'-1'\(ga
122. ds D- D\h'-1'\(hy
123. ds th \o'bp'
124. ds Th \o'LP'
125. ds ae ae
126. ds Ae AE
127.\}
128.rm #[ #] #H #V #F C
129.\" ========================================================================
130.\"
131.IX Title "Unicode::Normalize 3"
132.TH Unicode::Normalize 3 "2002-06-01" "perl v5.8.0" "Perl Programmers Reference Guide"
133.SH "NAME"
134Unicode::Normalize \- Unicode Normalization Forms
135.SH "SYNOPSIS"
136.IX Header "SYNOPSIS"
137.Vb 1
138\& use Unicode::Normalize;
139.Ve
140.PP
141.Vb 4
142\& $NFD_string = NFD($string); # Normalization Form D
143\& $NFC_string = NFC($string); # Normalization Form C
144\& $NFKD_string = NFKD($string); # Normalization Form KD
145\& $NFKC_string = NFKC($string); # Normalization Form KC
146.Ve
147.PP
148.Vb 1
149\& or
150.Ve
151.PP
152.Vb 1
153\& use Unicode::Normalize 'normalize';
154.Ve
155.PP
156.Vb 4
157\& $NFD_string = normalize('D', $string); # Normalization Form D
158\& $NFC_string = normalize('C', $string); # Normalization Form C
159\& $NFKD_string = normalize('KD', $string); # Normalization Form KD
160\& $NFKC_string = normalize('KC', $string); # Normalization Form KC
161.Ve
162.SH "DESCRIPTION"
163.IX Header "DESCRIPTION"
164.Sh "Normalization Forms"
165.IX Subsection "Normalization Forms"
166.ie n .IP """$NFD_string = NFD($string)""" 4
167.el .IP "\f(CW$NFD_string = NFD($string)\fR" 4
168.IX Item "$NFD_string = NFD($string)"
169returns the Normalization Form D (formed by canonical decomposition).
170.ie n .IP """$NFC_string = NFC($string)""" 4
171.el .IP "\f(CW$NFC_string = NFC($string)\fR" 4
172.IX Item "$NFC_string = NFC($string)"
173returns the Normalization Form C (formed by canonical decomposition
174followed by canonical composition).
175.ie n .IP """$NFKD_string = NFKD($string)""" 4
176.el .IP "\f(CW$NFKD_string = NFKD($string)\fR" 4
177.IX Item "$NFKD_string = NFKD($string)"
178returns the Normalization Form \s-1KD\s0 (formed by compatibility decomposition).
179.ie n .IP """$NFKC_string = NFKC($string)""" 4
180.el .IP "\f(CW$NFKC_string = NFKC($string)\fR" 4
181.IX Item "$NFKC_string = NFKC($string)"
182returns the Normalization Form \s-1KC\s0 (formed by compatibility decomposition
183followed by \fBcanonical\fR composition).
184.ie n .IP """$normalized_string = normalize($form_name, $string)""" 4
185.el .IP "\f(CW$normalized_string = normalize($form_name, $string)\fR" 4
186.IX Item "$normalized_string = normalize($form_name, $string)"
187As \f(CW$form_name\fR, one of the following names must be given.
188.Sp
189.Vb 4
190\& 'C' or 'NFC' for Normalization Form C
191\& 'D' or 'NFD' for Normalization Form D
192\& 'KC' or 'NFKC' for Normalization Form KC
193\& 'KD' or 'NFKD' for Normalization Form KD
194.Ve
195.Sh "Decomposition and Composition"
196.IX Subsection "Decomposition and Composition"
197.ie n .IP """$decomposed_string = decompose($string)""" 4
198.el .IP "\f(CW$decomposed_string = decompose($string)\fR" 4
199.IX Item "$decomposed_string = decompose($string)"
200.PD 0
201.ie n .IP """$decomposed_string = decompose($string, $useCompatMapping)""" 4
202.el .IP "\f(CW$decomposed_string = decompose($string, $useCompatMapping)\fR" 4
203.IX Item "$decomposed_string = decompose($string, $useCompatMapping)"
204.PD
205Decomposes the specified string and returns the result.
206.Sp
207If the second parameter (a boolean) is omitted or false, decomposes it
208using the Canonical Decomposition Mapping.
209If true, decomposes it using the Compatibility Decomposition Mapping.
210.Sp
211The string returned is not always in \s-1NFD/NFKD\s0.
212Reordering may be required.
213.Sp
214.Vb 2
215\& $NFD_string = reorder(decompose($string)); # eq. to NFD()
216\& $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
217.Ve
218.ie n .IP """$reordered_string = reorder($string)""" 4
219.el .IP "\f(CW$reordered_string = reorder($string)\fR" 4
220.IX Item "$reordered_string = reorder($string)"
221Reorders the combining characters and the like into canonical order
222and returns the result.
223.Sp
224E.g., when you have a list of \s-1NFD/NFKD\s0 strings,
225you can get the concatenated \s-1NFD/NFKD\s0 string from them, saying
226.Sp
227.Vb 2
228\& $concat_NFD = reorder(join '', @NFD_strings);
229\& $concat_NFKD = reorder(join '', @NFKD_strings);
230.Ve
231.ie n .IP """$composed_string = compose($string)""" 4
232.el .IP "\f(CW$composed_string = compose($string)\fR" 4
233.IX Item "$composed_string = compose($string)"
234Returns the string where composable pairs are composed.
235.Sp
236E.g., when you have a \s-1NFD/NFKD\s0 string,
237you can get its \s-1NFC/NFKC\s0 string, saying
238.Sp
239.Vb 2
240\& $NFC_string = compose($NFD_string);
241\& $NFKC_string = compose($NFKD_string);
242.Ve
243.Sh "Quick Check"
244.IX Subsection "Quick Check"
245(see Annex 8, \s-1UAX\s0 #15; \fIDerivedNormalizationProps.txt\fR)
246.PP
247The following functions check whether the string is in that normalization form.
248.PP
249The result returned will be:
250.PP
251.Vb 3
252\& YES The string is in that normalization form.
253\& NO The string is not in that normalization form.
254\& MAYBE Dubious. Maybe yes, maybe no.
255.Ve
256.ie n .IP """$result = checkNFD($string)""" 4
257.el .IP "\f(CW$result = checkNFD($string)\fR" 4
258.IX Item "$result = checkNFD($string)"
259returns \f(CW\*(C`YES\*(C'\fR (\f(CW1\fR) or \f(CW\*(C`NO\*(C'\fR (\f(CW\*(C`empty string\*(C'\fR).
260.ie n .IP """$result = checkNFC($string)""" 4
261.el .IP "\f(CW$result = checkNFC($string)\fR" 4
262.IX Item "$result = checkNFC($string)"
263returns \f(CW\*(C`YES\*(C'\fR (\f(CW1\fR), \f(CW\*(C`NO\*(C'\fR (\f(CW\*(C`empty string\*(C'\fR), or \f(CW\*(C`MAYBE\*(C'\fR (\f(CW\*(C`undef\*(C'\fR).
264.ie n .IP """$result = checkNFKD($string)""" 4
265.el .IP "\f(CW$result = checkNFKD($string)\fR" 4
266.IX Item "$result = checkNFKD($string)"
267returns \f(CW\*(C`YES\*(C'\fR (\f(CW1\fR) or \f(CW\*(C`NO\*(C'\fR (\f(CW\*(C`empty string\*(C'\fR).
268.ie n .IP """$result = checkNFKC($string)""" 4
269.el .IP "\f(CW$result = checkNFKC($string)\fR" 4
270.IX Item "$result = checkNFKC($string)"
271returns \f(CW\*(C`YES\*(C'\fR (\f(CW1\fR), \f(CW\*(C`NO\*(C'\fR (\f(CW\*(C`empty string\*(C'\fR), or \f(CW\*(C`MAYBE\*(C'\fR (\f(CW\*(C`undef\*(C'\fR).
272.ie n .IP """$result = check($form_name, $string)""" 4
273.el .IP "\f(CW$result = check($form_name, $string)\fR" 4
274.IX Item "$result = check($form_name, $string)"
275returns \f(CW\*(C`YES\*(C'\fR (\f(CW1\fR), \f(CW\*(C`NO\*(C'\fR (\f(CW\*(C`empty string\*(C'\fR), or \f(CW\*(C`MAYBE\*(C'\fR (\f(CW\*(C`undef\*(C'\fR).
276.Sp
277\&\f(CW$form_name\fR takes the same values as for \f(CW\*(C`normalize()\*(C'\fR.
278.PP
279\&\fBNote\fR
280.PP
281In the cases of \s-1NFD\s0 and \s-1NFKD\s0, the answer must be either \f(CW\*(C`YES\*(C'\fR or \f(CW\*(C`NO\*(C'\fR.
282The answer \f(CW\*(C`MAYBE\*(C'\fR may be returned in the cases of \s-1NFC\s0 and \s-1NFKC\s0.
283.PP
284A \s-1MAYBE\-NFC/NFKC\s0 string should contain at least
285one combining character or the like.
286For example, \f(CW\*(C`COMBINING ACUTE ACCENT\*(C'\fR has
287the \s-1MAYBE_NFC/MAYBE_NFKC\s0 property.
288Both \f(CW\*(C`checkNFC("A\eN{COMBINING ACUTE ACCENT}")\*(C'\fR
289and \f(CW\*(C`checkNFC("B\eN{COMBINING ACUTE ACCENT}")\*(C'\fR will return \f(CW\*(C`MAYBE\*(C'\fR.
290\&\f(CW"A\eN{COMBINING ACUTE ACCENT}"\fR is not in \s-1NFC\s0
291(its \s-1NFC\s0 is \f(CW"\eN{LATIN CAPITAL LETTER A WITH ACUTE}"\fR),
292while \f(CW"B\eN{COMBINING ACUTE ACCENT}"\fR is in \s-1NFC\s0.
293.PP
294If you want to check exactly, compare the string with its \s-1NFC/NFKC\s0; i.e.,
295.PP
296.Vb 2
297\& $string eq NFC($string) # more thorough than checkNFC($string)
298\& $string eq NFKC($string) # more thorough than checkNFKC($string)
299.Ve
300.Sh "Character Data"
301.IX Subsection "Character Data"
302These functions are an interface to the character data used internally.
303If you only want to get Unicode normalization forms, you don't need to
304call them yourself.
305.ie n .IP """$canonical_decomposed = getCanon($codepoint)""" 4
306.el .IP "\f(CW$canonical_decomposed = getCanon($codepoint)\fR" 4
307.IX Item "$canonical_decomposed = getCanon($codepoint)"
308If the character of the specified codepoint is canonically
309decomposable (including Hangul Syllables),
310returns the \fBcompletely decomposed\fR string canonically equivalent to it.
311.Sp
312If it is not decomposable, returns \f(CW\*(C`undef\*(C'\fR.
313.ie n .IP """$compatibility_decomposed = getCompat($codepoint)""" 4
314.el .IP "\f(CW$compatibility_decomposed = getCompat($codepoint)\fR" 4
315.IX Item "$compatibility_decomposed = getCompat($codepoint)"
316If the character of the specified codepoint is compatibility
317decomposable (including Hangul Syllables),
318returns the \fBcompletely decomposed\fR string compatibility equivalent to it.
319.Sp
320If it is not decomposable, returns \f(CW\*(C`undef\*(C'\fR.
321.ie n .IP """$codepoint_composite = getComposite($codepoint_here, $codepoint_next)""" 4
322.el .IP "\f(CW$codepoint_composite = getComposite($codepoint_here, $codepoint_next)\fR" 4
323.IX Item "$codepoint_composite = getComposite($codepoint_here, $codepoint_next)"
324If two characters here and next (as codepoints) are composable
325(including Hangul Jamo/Syllables and Composition Exclusions),
326returns the codepoint of the composite.
327.Sp
328If they are not composable, returns \f(CW\*(C`undef\*(C'\fR.
329.ie n .IP """$combining_class = getCombinClass($codepoint)""" 4
330.el .IP "\f(CW$combining_class = getCombinClass($codepoint)\fR" 4
331.IX Item "$combining_class = getCombinClass($codepoint)"
332Returns the combining class of the character as an integer.
333.ie n .IP """$is_exclusion = isExclusion($codepoint)""" 4
334.el .IP "\f(CW$is_exclusion = isExclusion($codepoint)\fR" 4
335.IX Item "$is_exclusion = isExclusion($codepoint)"
336Returns a boolean indicating whether the character of the specified codepoint
337is a composition exclusion.
338.ie n .IP """$is_singleton = isSingleton($codepoint)""" 4
339.el .IP "\f(CW$is_singleton = isSingleton($codepoint)\fR" 4
340.IX Item "$is_singleton = isSingleton($codepoint)"
341Returns a boolean indicating whether the character of the specified codepoint is
342a singleton.
343.ie n .IP """$is_non_starter_decomposition = isNonStDecomp($codepoint)""" 4
344.el .IP "\f(CW$is_non_starter_decomposition = isNonStDecomp($codepoint)\fR" 4
345.IX Item "$is_non_starter_decomposition = isNonStDecomp($codepoint)"
346Returns a boolean indicating whether the canonical decomposition
347of the character of the specified codepoint
348is a Non-Starter Decomposition.
349.ie n .IP """$may_be_composed_with_prev_char = isComp2nd($codepoint)""" 4
350.el .IP "\f(CW$may_be_composed_with_prev_char = isComp2nd($codepoint)\fR" 4
351.IX Item "$may_be_composed_with_prev_char = isComp2nd($codepoint)"
352Returns a boolean indicating whether the character of the specified codepoint
353may be composed with the previous one in a certain composition
354(including Hangul Compositions, but excluding
355Composition Exclusions and Non-Starter Decompositions).
356.Sh "\s-1EXPORT\s0"
357.IX Subsection "EXPORT"
358\&\f(CW\*(C`NFC\*(C'\fR, \f(CW\*(C`NFD\*(C'\fR, \f(CW\*(C`NFKC\*(C'\fR, \f(CW\*(C`NFKD\*(C'\fR: by default.
359.PP
360\&\f(CW\*(C`normalize\*(C'\fR and some other functions: on request.
361.SH "AUTHOR"
362.IX Header "AUTHOR"
363\&\s-1SADAHIRO\s0 Tomoyuki, <SADAHIRO@cpan.org>
364.PP
365.Vb 1
366\& http://homepage1.nifty.com/nomenclator/perl/
367.Ve
368.PP
369.Vb 1
370\& Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
371.Ve
372.PP
373.Vb 2
374\& This program is free software; you can redistribute it and/or
375\& modify it under the same terms as Perl itself.
376.Ve
377.SH "SEE ALSO"
378.IX Header "SEE ALSO"
379.IP "http://www.unicode.org/unicode/reports/tr15/" 4
380.IX Item "http://www.unicode.org/unicode/reports/tr15/"
381Unicode Normalization Forms \- \s-1UAX\s0 #15
382.IP "http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt" 4
383.IX Item "http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt"
384Derived Normalization Properties