-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathunicode_composition_exclusions.pl
225 lines (210 loc) · 10.9 KB
/
unicode_composition_exclusions.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% This file is part of VivoMind Prolog Unicode Resources
%
% VivoMind Prolog Unicode Resources is free software distributed using the
% Creative Commons CC0 1.0 Universal (CC0 1.0) - Public Domain Dedication
% license
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Last modified: March 13, 2012
%
% Original Unicode file header comments follow
/*
# CompositionExclusions-6.1.0.txt
# Date: 2011-07-12, 00:13:00 GMT [KW, LI]
#
# This file lists the characters for the Composition Exclusion Table
# defined in UAX #15, Unicode Normalization Forms.
#
# This file is a normative contributory data file in the
# Unicode Character Database.
#
# Copyright (c) 1991-2011 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# For more information, see
# http://www.unicode.org/unicode/reports/tr15/#Primary_Exclusion_List_Table
#
# For a full derivation of composition exclusions, see the derived property
# Full_Composition_Exclusion in DerivedNormalizationProps.txt
#
# ================================================
# (1) Script Specifics
#
# This list of characters cannot be derived from the UnicodeData.txt file.
# ================================================
*/
% unicode_composition_exclusion(CodePoint)
%
% Fact table: each clause holds one code point, written as a hexadecimal
% integer literal, taken from the Composition Exclusion Table described in
% the Unicode header quoted above. Being plain ground facts, the predicate
% can be called with a bound integer to test membership or with an unbound
% variable to enumerate the listed code points in file order.
%
% The clauses that follow cover section (1) "Script Specifics". In this file
% only sections (1) and (2) are asserted as clauses; sections (3) "Singleton
% Decompositions" and (4) "Non-Starter Decompositions" appear only as quoted
% comments, so this predicate is not the full derived
% Full_Composition_Exclusion set.
%
% The trailing comment on each clause is the Unicode character name.

% Devanagari
unicode_composition_exclusion(0x0958). % DEVANAGARI LETTER QA
unicode_composition_exclusion(0x0959). % DEVANAGARI LETTER KHHA
unicode_composition_exclusion(0x095A). % DEVANAGARI LETTER GHHA
unicode_composition_exclusion(0x095B). % DEVANAGARI LETTER ZA
unicode_composition_exclusion(0x095C). % DEVANAGARI LETTER DDDHA
unicode_composition_exclusion(0x095D). % DEVANAGARI LETTER RHA
unicode_composition_exclusion(0x095E). % DEVANAGARI LETTER FA
unicode_composition_exclusion(0x095F). % DEVANAGARI LETTER YYA
% Bengali
unicode_composition_exclusion(0x09DC). % BENGALI LETTER RRA
unicode_composition_exclusion(0x09DD). % BENGALI LETTER RHA
unicode_composition_exclusion(0x09DF). % BENGALI LETTER YYA
% Gurmukhi
unicode_composition_exclusion(0x0A33). % GURMUKHI LETTER LLA
unicode_composition_exclusion(0x0A36). % GURMUKHI LETTER SHA
unicode_composition_exclusion(0x0A59). % GURMUKHI LETTER KHHA
unicode_composition_exclusion(0x0A5A). % GURMUKHI LETTER GHHA
unicode_composition_exclusion(0x0A5B). % GURMUKHI LETTER ZA
unicode_composition_exclusion(0x0A5E). % GURMUKHI LETTER FA
% Oriya
unicode_composition_exclusion(0x0B5C). % ORIYA LETTER RRA
unicode_composition_exclusion(0x0B5D). % ORIYA LETTER RHA
% Tibetan letters, vowel signs and subjoined letters
unicode_composition_exclusion(0x0F43). % TIBETAN LETTER GHA
unicode_composition_exclusion(0x0F4D). % TIBETAN LETTER DDHA
unicode_composition_exclusion(0x0F52). % TIBETAN LETTER DHA
unicode_composition_exclusion(0x0F57). % TIBETAN LETTER BHA
unicode_composition_exclusion(0x0F5C). % TIBETAN LETTER DZHA
unicode_composition_exclusion(0x0F69). % TIBETAN LETTER KSSA
unicode_composition_exclusion(0x0F76). % TIBETAN VOWEL SIGN VOCALIC R
unicode_composition_exclusion(0x0F78). % TIBETAN VOWEL SIGN VOCALIC L
unicode_composition_exclusion(0x0F93). % TIBETAN SUBJOINED LETTER GHA
unicode_composition_exclusion(0x0F9D). % TIBETAN SUBJOINED LETTER DDHA
unicode_composition_exclusion(0x0FA2). % TIBETAN SUBJOINED LETTER DHA
unicode_composition_exclusion(0x0FA7). % TIBETAN SUBJOINED LETTER BHA
unicode_composition_exclusion(0x0FAC). % TIBETAN SUBJOINED LETTER DZHA
unicode_composition_exclusion(0x0FB9). % TIBETAN SUBJOINED LETTER KSSA
% Hebrew presentation forms
unicode_composition_exclusion(0xFB1D). % HEBREW LETTER YOD WITH HIRIQ
unicode_composition_exclusion(0xFB1F). % HEBREW LIGATURE YIDDISH YOD YOD PATAH
unicode_composition_exclusion(0xFB2A). % HEBREW LETTER SHIN WITH SHIN DOT
unicode_composition_exclusion(0xFB2B). % HEBREW LETTER SHIN WITH SIN DOT
unicode_composition_exclusion(0xFB2C). % HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT
unicode_composition_exclusion(0xFB2D). % HEBREW LETTER SHIN WITH DAGESH AND SIN DOT
unicode_composition_exclusion(0xFB2E). % HEBREW LETTER ALEF WITH PATAH
unicode_composition_exclusion(0xFB2F). % HEBREW LETTER ALEF WITH QAMATS
unicode_composition_exclusion(0xFB30). % HEBREW LETTER ALEF WITH MAPIQ
unicode_composition_exclusion(0xFB31). % HEBREW LETTER BET WITH DAGESH
unicode_composition_exclusion(0xFB32). % HEBREW LETTER GIMEL WITH DAGESH
unicode_composition_exclusion(0xFB33). % HEBREW LETTER DALET WITH DAGESH
unicode_composition_exclusion(0xFB34). % HEBREW LETTER HE WITH MAPIQ
unicode_composition_exclusion(0xFB35). % HEBREW LETTER VAV WITH DAGESH
unicode_composition_exclusion(0xFB36). % HEBREW LETTER ZAYIN WITH DAGESH
unicode_composition_exclusion(0xFB38). % HEBREW LETTER TET WITH DAGESH
unicode_composition_exclusion(0xFB39). % HEBREW LETTER YOD WITH DAGESH
unicode_composition_exclusion(0xFB3A). % HEBREW LETTER FINAL KAF WITH DAGESH
unicode_composition_exclusion(0xFB3B). % HEBREW LETTER KAF WITH DAGESH
unicode_composition_exclusion(0xFB3C). % HEBREW LETTER LAMED WITH DAGESH
unicode_composition_exclusion(0xFB3E). % HEBREW LETTER MEM WITH DAGESH
unicode_composition_exclusion(0xFB40). % HEBREW LETTER NUN WITH DAGESH
unicode_composition_exclusion(0xFB41). % HEBREW LETTER SAMEKH WITH DAGESH
unicode_composition_exclusion(0xFB43). % HEBREW LETTER FINAL PE WITH DAGESH
unicode_composition_exclusion(0xFB44). % HEBREW LETTER PE WITH DAGESH
unicode_composition_exclusion(0xFB46). % HEBREW LETTER TSADI WITH DAGESH
unicode_composition_exclusion(0xFB47). % HEBREW LETTER QOF WITH DAGESH
unicode_composition_exclusion(0xFB48). % HEBREW LETTER RESH WITH DAGESH
unicode_composition_exclusion(0xFB49). % HEBREW LETTER SHIN WITH DAGESH
unicode_composition_exclusion(0xFB4A). % HEBREW LETTER TAV WITH DAGESH
unicode_composition_exclusion(0xFB4B). % HEBREW LETTER VAV WITH HOLAM
unicode_composition_exclusion(0xFB4C). % HEBREW LETTER BET WITH RAFE
unicode_composition_exclusion(0xFB4D). % HEBREW LETTER KAF WITH RAFE
unicode_composition_exclusion(0xFB4E). % HEBREW LETTER PE WITH RAFE
% Total code points: 67
/*
# ================================================
# (2) Post Composition Version precomposed characters
#
# These characters cannot be derived solely from the UnicodeData.txt file
# in this version of Unicode.
#
# Note that characters added to the standard after the
# Composition Version and which have canonical decomposition mappings
# are not automatically added to this list of Post Composition
# Version precomposed characters.
# ================================================
*/
% Further clauses of unicode_composition_exclusion/1, covering section (2)
% "Post Composition Version precomposed characters". Only comments separate
% these clauses from the section (1) clauses earlier in the file, so they
% belong to the same predicate. Code points above 0xFFFF (the musical
% symbols) are written with five hexadecimal digits.

% Mathematical operator
unicode_composition_exclusion(0x2ADC). % FORKING
% Musical symbols
unicode_composition_exclusion(0x1D15E). % MUSICAL SYMBOL HALF NOTE
unicode_composition_exclusion(0x1D15F). % MUSICAL SYMBOL QUARTER NOTE
unicode_composition_exclusion(0x1D160). % MUSICAL SYMBOL EIGHTH NOTE
unicode_composition_exclusion(0x1D161). % MUSICAL SYMBOL SIXTEENTH NOTE
unicode_composition_exclusion(0x1D162). % MUSICAL SYMBOL THIRTY-SECOND NOTE
unicode_composition_exclusion(0x1D163). % MUSICAL SYMBOL SIXTY-FOURTH NOTE
unicode_composition_exclusion(0x1D164). % MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
unicode_composition_exclusion(0x1D1BB). % MUSICAL SYMBOL MINIMA
unicode_composition_exclusion(0x1D1BC). % MUSICAL SYMBOL MINIMA BLACK
unicode_composition_exclusion(0x1D1BD). % MUSICAL SYMBOL SEMIMINIMA WHITE
unicode_composition_exclusion(0x1D1BE). % MUSICAL SYMBOL SEMIMINIMA BLACK
unicode_composition_exclusion(0x1D1BF). % MUSICAL SYMBOL FUSA WHITE
unicode_composition_exclusion(0x1D1C0). % MUSICAL SYMBOL FUSA BLACK
% Total code points: 14
/*
# ================================================
# (3) Singleton Decompositions
#
# These characters can be derived from the UnicodeData.txt file
# by including all canonically decomposable characters whose
# canonical decomposition consists of a single character.
#
# These characters are simply quoted here for reference.
# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
# ================================================
# 0340..0341 [2] COMBINING GRAVE TONE MARK..COMBINING ACUTE TONE MARK
# 0343 COMBINING GREEK KORONIS
# 0374 GREEK NUMERAL SIGN
# 037E GREEK QUESTION MARK
# 0387 GREEK ANO TELEIA
# 1F71 GREEK SMALL LETTER ALPHA WITH OXIA
# 1F73 GREEK SMALL LETTER EPSILON WITH OXIA
# 1F75 GREEK SMALL LETTER ETA WITH OXIA
# 1F77 GREEK SMALL LETTER IOTA WITH OXIA
# 1F79 GREEK SMALL LETTER OMICRON WITH OXIA
# 1F7B GREEK SMALL LETTER UPSILON WITH OXIA
# 1F7D GREEK SMALL LETTER OMEGA WITH OXIA
# 1FBB GREEK CAPITAL LETTER ALPHA WITH OXIA
# 1FBE GREEK PROSGEGRAMMENI
# 1FC9 GREEK CAPITAL LETTER EPSILON WITH OXIA
# 1FCB GREEK CAPITAL LETTER ETA WITH OXIA
# 1FD3 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
# 1FDB GREEK CAPITAL LETTER IOTA WITH OXIA
# 1FE3 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
# 1FEB GREEK CAPITAL LETTER UPSILON WITH OXIA
# 1FEE..1FEF [2] GREEK DIALYTIKA AND OXIA..GREEK VARIA
# 1FF9 GREEK CAPITAL LETTER OMICRON WITH OXIA
# 1FFB GREEK CAPITAL LETTER OMEGA WITH OXIA
# 1FFD GREEK OXIA
# 2000..2001 [2] EN QUAD..EM QUAD
# 2126 OHM SIGN
# 212A..212B [2] KELVIN SIGN..ANGSTROM SIGN
# 2329 LEFT-POINTING ANGLE BRACKET
# 232A RIGHT-POINTING ANGLE BRACKET
# F900..FA0D [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D
# FA10 CJK COMPATIBILITY IDEOGRAPH-FA10
# FA12 CJK COMPATIBILITY IDEOGRAPH-FA12
# FA15..FA1E [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E
# FA20 CJK COMPATIBILITY IDEOGRAPH-FA20
# FA22 CJK COMPATIBILITY IDEOGRAPH-FA22
# FA25..FA26 [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26
# FA2A..FA6D [68] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA6D
# FA70..FAD9 [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
# 2F800..2FA1D [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
# Total code points: 1035
# ================================================
# (4) Non-Starter Decompositions
#
# These characters can be derived from the UnicodeData.txt file
# by including each expanding canonical decomposition
# (i.e., those which canonically decompose to a sequence
# of characters instead of a single character), such that:
#
# A. The character is not a Starter.
#
# OR (inclusive)
#
# B. The character's canonical decomposition begins
# with a character that is not a Starter.
#
# Note that a "Starter" is any character with a zero combining class.
#
# These characters are simply quoted here for reference.
# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
# ================================================
# 0344 COMBINING GREEK DIALYTIKA TONOS
# 0F73 TIBETAN VOWEL SIGN II
# 0F75 TIBETAN VOWEL SIGN UU
# 0F81 TIBETAN VOWEL SIGN REVERSED II
# Total code points: 4
*/