42 #define U_UNICODE_VERSION "6.2"
124 #define UCHAR_MIN_VALUE 0
134 #define UCHAR_MAX_VALUE 0x10ffff
140 #define U_MASK(x) ((uint32_t)1<<(x))
664 #define U_GC_CN_MASK U_MASK(U_GENERAL_OTHER_TYPES)
667 #define U_GC_LU_MASK U_MASK(U_UPPERCASE_LETTER)
669 #define U_GC_LL_MASK U_MASK(U_LOWERCASE_LETTER)
671 #define U_GC_LT_MASK U_MASK(U_TITLECASE_LETTER)
673 #define U_GC_LM_MASK U_MASK(U_MODIFIER_LETTER)
675 #define U_GC_LO_MASK U_MASK(U_OTHER_LETTER)
678 #define U_GC_MN_MASK U_MASK(U_NON_SPACING_MARK)
680 #define U_GC_ME_MASK U_MASK(U_ENCLOSING_MARK)
682 #define U_GC_MC_MASK U_MASK(U_COMBINING_SPACING_MARK)
685 #define U_GC_ND_MASK U_MASK(U_DECIMAL_DIGIT_NUMBER)
687 #define U_GC_NL_MASK U_MASK(U_LETTER_NUMBER)
689 #define U_GC_NO_MASK U_MASK(U_OTHER_NUMBER)
692 #define U_GC_ZS_MASK U_MASK(U_SPACE_SEPARATOR)
694 #define U_GC_ZL_MASK U_MASK(U_LINE_SEPARATOR)
696 #define U_GC_ZP_MASK U_MASK(U_PARAGRAPH_SEPARATOR)
699 #define U_GC_CC_MASK U_MASK(U_CONTROL_CHAR)
701 #define U_GC_CF_MASK U_MASK(U_FORMAT_CHAR)
703 #define U_GC_CO_MASK U_MASK(U_PRIVATE_USE_CHAR)
705 #define U_GC_CS_MASK U_MASK(U_SURROGATE)
708 #define U_GC_PD_MASK U_MASK(U_DASH_PUNCTUATION)
710 #define U_GC_PS_MASK U_MASK(U_START_PUNCTUATION)
712 #define U_GC_PE_MASK U_MASK(U_END_PUNCTUATION)
714 #define U_GC_PC_MASK U_MASK(U_CONNECTOR_PUNCTUATION)
716 #define U_GC_PO_MASK U_MASK(U_OTHER_PUNCTUATION)
719 #define U_GC_SM_MASK U_MASK(U_MATH_SYMBOL)
721 #define U_GC_SC_MASK U_MASK(U_CURRENCY_SYMBOL)
723 #define U_GC_SK_MASK U_MASK(U_MODIFIER_SYMBOL)
725 #define U_GC_SO_MASK U_MASK(U_OTHER_SYMBOL)
728 #define U_GC_PI_MASK U_MASK(U_INITIAL_PUNCTUATION)
730 #define U_GC_PF_MASK U_MASK(U_FINAL_PUNCTUATION)
734 #define U_GC_L_MASK \
735 (U_GC_LU_MASK|U_GC_LL_MASK|U_GC_LT_MASK|U_GC_LM_MASK|U_GC_LO_MASK)
738 #define U_GC_LC_MASK \
739 (U_GC_LU_MASK|U_GC_LL_MASK|U_GC_LT_MASK)
742 #define U_GC_M_MASK (U_GC_MN_MASK|U_GC_ME_MASK|U_GC_MC_MASK)
745 #define U_GC_N_MASK (U_GC_ND_MASK|U_GC_NL_MASK|U_GC_NO_MASK)
748 #define U_GC_Z_MASK (U_GC_ZS_MASK|U_GC_ZL_MASK|U_GC_ZP_MASK)
751 #define U_GC_C_MASK \
752 (U_GC_CN_MASK|U_GC_CC_MASK|U_GC_CF_MASK|U_GC_CO_MASK|U_GC_CS_MASK)
755 #define U_GC_P_MASK \
756 (U_GC_PD_MASK|U_GC_PS_MASK|U_GC_PE_MASK|U_GC_PC_MASK|U_GC_PO_MASK| \
757 U_GC_PI_MASK|U_GC_PF_MASK)
760 #define U_GC_S_MASK (U_GC_SM_MASK|U_GC_SC_MASK|U_GC_SK_MASK|U_GC_SO_MASK)
1488 U_SHORT_PROPERTY_NAME,
1489 U_LONG_PROPERTY_NAME,
1490 U_PROPERTY_NAME_CHOICE_COUNT
1562 U_JG_NO_JOINING_GROUP,
1647 U_GCB_SPACING_MARK = 10,
1649 U_GCB_REGIONAL_INDICATOR = 12,
1674 U_WB_EXTENDNUMLET = 7,
1680 U_WB_REGIONAL_INDICATOR = 13,
1711 U_SB_SCONTINUE = 14,
1730 U_LB_ALPHABETIC = 2,
1731 U_LB_BREAK_BOTH = 3,
1732 U_LB_BREAK_AFTER = 4,
1733 U_LB_BREAK_BEFORE = 5,
1734 U_LB_MANDATORY_BREAK = 6,
1735 U_LB_CONTINGENT_BREAK = 7,
1736 U_LB_CLOSE_PUNCTUATION = 8,
1737 U_LB_COMBINING_MARK = 9,
1738 U_LB_CARRIAGE_RETURN = 10,
1739 U_LB_EXCLAMATION = 11,
1742 U_LB_IDEOGRAPHIC = 14,
1746 U_LB_INFIX_NUMERIC = 16,
1747 U_LB_LINE_FEED = 17,
1748 U_LB_NONSTARTER = 18,
1750 U_LB_OPEN_PUNCTUATION = 20,
1751 U_LB_POSTFIX_NUMERIC = 21,
1752 U_LB_PREFIX_NUMERIC = 22,
1753 U_LB_QUOTATION = 23,
1754 U_LB_COMPLEX_CONTEXT = 24,
1755 U_LB_SURROGATE = 25,
1757 U_LB_BREAK_SYMBOLS = 27,
1759 U_LB_NEXT_LINE = 29,
1760 U_LB_WORD_JOINER = 30,
1766 U_LB_CLOSE_PARENTHESIS = 36,
1767 U_LB_CONDITIONAL_JAPANESE_STARTER = 37,
1768 U_LB_HEBREW_LETTER = 38,
1769 U_LB_REGIONAL_INDICATOR = 39,
1806 U_HST_NOT_APPLICABLE,
1809 U_HST_TRAILING_JAMO,
2032 #define U_NO_NUMERIC_VALUE ((double)-123456789.)
2525 #define U_GET_GC_MASK(c) U_MASK(u_charType(c))
2569 #if !UCONFIG_NO_NORMALIZATION
2655 char *buffer, int32_t bufferLength,
2678 char *dest, int32_t destCapacity,
3084 #define U_FOLD_CASE_DEFAULT 0
3102 #define U_FOLD_CASE_EXCLUDE_SPECIAL_I 1
3232 #if !UCONFIG_NO_NORMALIZATION
uint8_t UVersionInfo[U_MAX_VERSION_LENGTH]
The binary form of a version on ICU APIs is an array of 4 uint8_t.
Binary property Ideographic.
Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
Binary property Changes_When_Lowercased.
Binary property IDS_Binary_Operator (new in Unicode 3.2).
Binary property Case_Ignorable.
UBool u_isUWhiteSpace(UChar32 c)
Check if a code point has the White_Space Unicode property.
UBool u_istitle(UChar32 c)
Determines whether the specified code point is a titlecase letter.
Enumerated property NFC_Quick_Check.
UChar32 u_totitle(UChar32 c)
The given character is mapped to its titlecase equivalent according to UnicodeData.txt; if none is defined, the character itself is returned.
Provisional property Script_Extensions (new in Unicode 6.0).
const char * u_getPropertyName(UProperty property, UPropertyNameChoice nameChoice)
Return the Unicode name for a given property, as given in the Unicode database file PropertyAliases...
Same as UBLOCK_PRIVATE_USE_AREA.
UChar32 u_foldCase(UChar32 c, uint32_t options)
The given character is mapped to its case folding equivalent according to UnicodeData.txt and CaseFolding.txt; if the character has no case folding equivalent, the character itself is returned.
First constant for enumerated/integer Unicode properties.
Binary property XID_Start.
Binary property Join_Control.
Binary property Logical_Order_Exception (new in Unicode 3.2).
Binary property White_Space.
String property Titlecase_Mapping.
One more than the last constant for enumerated/integer Unicode properties.
Enumerated property Numeric_Type.
Binary property xdigit (a C/POSIX character class).
UBlockCode ublock_getCode(UChar32 c)
Returns the Unicode allocation block that contains the character.
Binary property Alphabetic.
First constant for double Unicode properties.
UBool UEnumCharNamesFn(void *context, UChar32 code, UCharNameChoice nameChoice, const char *name, int32_t length)
Type of a callback function for u_enumCharNames() that gets called for each Unicode character with th...
UBool u_isgraph(UChar32 c)
Determines whether the specified code point is a "graphic" character (printable, excluding spaces)...
Cn "Other, Not Assigned (no characters in [UnicodeData.txt] have this property)" (same as U_UNASSIGNE...
String property Simple_Case_Folding.
Binary property NFC_Inert.
Binary property graph (a C/POSIX character class).
String property Bidi_Mirroring_Glyph.
One more than the last constant for bit-mask Unicode properties.
UBool u_isdefined(UChar32 c)
Determines whether the specified code point is "defined", which usually means that it is assigned a c...
Enumerated property Block.
Represents a nonexistent or invalid property or property value.
Renamed from the misspelled "inseperable" in Unicode 4.0.1/ICU 3.0.
Binary property Radical (new in Unicode 3.2).
UCharDirection
This specifies the language directional property of a character set.
Binary property IDS_Trinary_Operator (new in Unicode 3.2).
Binary property Grapheme_Link (new in Unicode 3.2).
Enumerated property Decomposition_Type.
String property Case_Folding.
String property Simple_Uppercase_Mapping.
UChar32 u_forDigit(int32_t digit, int8_t radix)
Determines the character representation for a specific digit in the specified radix.
Enumerated property Bidi_Class.
int32_t u_charDigitValue(UChar32 c)
Returns the decimal digit value of a decimal digit character.
Enumerated property General_Category.
String property Unicode_1_Name.
int32_t u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode)
Get the FC_NFKC_Closure property string for a character.
UNumericType
Numeric Type constants.
Binary property Pattern_White_Space (new in Unicode 4.1).
UBool u_iscntrl(UChar32 c)
Determines whether the specified code point is a control character (as defined by this function)...
UBool UCharEnumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type)
Callback from u_enumCharTypes(), is called for each contiguous range of code points c (where start<=c...
Binary property Changes_When_Casefolded.
Binary property NFD_Inert.
Binary property Diacritic.
Binary property Terminal_Punctuation.
UChar32 u_charFromName(UCharNameChoice nameChoice, const char *name, UErrorCode *pErrorCode)
Find a Unicode character by its name and return its code point value.
UBool u_isUAlphabetic(UChar32 c)
Check if a code point has the Alphabetic Unicode property.
Enumerated property NFD_Quick_Check.
void u_charAge(UChar32 c, UVersionInfo versionArray)
Get the "age" of the code point.
int32_t u_getPropertyValueEnum(UProperty property, const char *alias)
Return the property value integer for a given value name, as specified in the Unicode database file P...
#define U_CDECL_BEGIN
This is used to begin a declaration of a library-private ICU C API.
Binary property STerm (new in Unicode 4.0.1).
Enumerated property Joining_Group.
Binary property ID_Continue.
Binary property blank (a C/POSIX character class).
Binary property Quotation_Mark.
Binary property Changes_When_NFKC_Casefolded.
First constant for binary Unicode properties.
Binary property Noncharacter_Code_Point.
Enumerated property East_Asian_Width.
ULineBreak
Line Break constants.
Binary property Full_Composition_Exclusion.
Bitmask property General_Category_Mask.
String property Simple_Titlecase_Mapping.
Unicode 3.2 renames this block to "Combining Diacritical Marks for Symbols".
int32_t u_digit(UChar32 ch, int8_t radix)
Returns the decimal digit value of the code point in the specified radix.
UDecompositionType
Decomposition Type constants.
UBool u_isprint(UChar32 c)
Determines whether the specified code point is a printable character.
UBool u_isxdigit(UChar32 c)
Determines whether the specified code point is a hexadecimal digit.
UHangulSyllableType
Hangul Syllable Type constants.
String property Simple_Lowercase_Mapping.
Binary property print (a C/POSIX character class).
Binary property Case_Sensitive.
Standard or synthetic character name.
Binary property Bidi_Mirrored.
Binary property NFKC_Inert.
int32_t u_getIntPropertyValue(UChar32 c, UProperty which)
Get the property value for an enumerated or integer Unicode property for a code point.
Binary property Changes_When_Casemapped.
First constant for string Unicode properties.
Binary property Grapheme_Extend (new in Unicode 3.2).
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
UGraphemeClusterBreak
Grapheme Cluster Break constants.
New No_Block value in Unicode 4.
Binary property Extender.
Double property Numeric_Value.
Unicode character name (Name property).
Binary property Grapheme_Base (new in Unicode 3.2).
Binary property NFKD_Inert.
uint8_t u_getCombiningClass(UChar32 c)
Returns the combining class of the code point as specified in UnicodeData.txt.
UBool u_isbase(UChar32 c)
Determines whether the specified code point is a base character.
UCharCategory
Data for enumerated Unicode general category types.
const char * u_getPropertyValueName(UProperty property, int32_t value, UPropertyNameChoice nameChoice)
Return the Unicode name for a given property value, as given in the Unicode database file PropertyVal...
Enumerated property Sentence_Break (new in Unicode 4.1).
double u_getNumericValue(UChar32 c)
Get the numeric value for a Unicode code point as defined in the Unicode Character Database...
Binary property Lowercase.
UBool u_isJavaIDStart(UChar32 c)
Determines if the specified character is permissible as the first character in a Java identifier...
First constant for bit-mask Unicode properties.
UBool u_isspace(UChar32 c)
Determines if the specified character is a space character or not.
USentenceBreak
Sentence Break constants.
Binary property Unified_Ideograph (new in Unicode 3.2).
Enumerated property Canonical_Combining_Class.
UCharNameChoice
Selector constants for u_charName().
One more than the last constant for binary Unicode properties.
UBool u_isJavaIDPart(UChar32 c)
Determines if the specified character is permissible in a Java identifier.
Enumerated property Script.
Unicode 3.2 renames this block to "Greek and Coptic".
Binary property Hex_Digit.
String property Uppercase_Mapping.
UPropertyNameChoice
Selector constants for u_getPropertyName() and u_getPropertyValueName().
String property Lowercase_Mapping.
UCharDirection u_charDirection(UChar32 c)
Returns the bidirectional category value for the code point, which is used in the Unicode bidirection...
UBool u_islower(UChar32 c)
Determines whether the specified code point has the general category "Ll" (lowercase letter)...
uint16_t UChar
Define UChar to be UCHAR_TYPE, if that is #defined (for example, to char16_t), or wchar_t if that is ...
#define U_CDECL_END
This is used to end a declaration of a library-private ICU C API.
Enumerated property NFKC_Quick_Check.
UProperty
Selection constants for Unicode properties.
void u_enumCharTypes(UCharEnumTypeRange *enumRange, const void *context)
Enumerate efficiently all code points with their Unicode general categories.
Enumerated property Hangul_Syllable_Type, new in Unicode 4.
Binary property alnum (a C/POSIX character class).
Binary property Variation_Selector (new in Unicode 4.0.1).
UBool u_isUUppercase(UChar32 c)
Check if a code point has the Uppercase Unicode property.
UBlockCode
Constants for Unicode blocks, see the Unicode Data file Blocks.txt.
Enumerated property Word_Break (new in Unicode 4.1).
Binary property Deprecated (new in Unicode 3.2).
Binary property Bidi_Control.
Binary property XID_Continue.
Same as UBLOCK_PRIVATE_USE.
UBool u_hasBinaryProperty(UChar32 c, UProperty which)
Check a binary Unicode property for a code point.
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Binary property Uppercase.
void u_getUnicodeVersion(UVersionInfo versionArray)
Gets the Unicode version information.
Binary property Changes_When_Uppercased.
UJoiningGroup
Joining Group constants.
UEastAsianWidth
East Asian Width constants.
UBool u_isupper(UChar32 c)
Determines whether the specified code point has the general category "Lu" (uppercase letter)...
Enumerated property Trail_Canonical_Combining_Class.
UBool u_isULowercase(UChar32 c)
Check if a code point has the Lowercase Unicode property.
Non-category for unassigned and non-character code points.
UBool u_ispunct(UChar32 c)
Determines whether the specified code point is a punctuation character.
First constant for Unicode properties with unusual value types.
UWordBreakValues
Word Break constants.
void u_enumCharNames(UChar32 start, UChar32 limit, UEnumCharNamesFn *fn, void *context, UCharNameChoice nameChoice, UErrorCode *pErrorCode)
Enumerate all assigned Unicode characters between the start and limit code points (start inclusive...
int32_t u_getIntPropertyMaxValue(UProperty which)
Get the maximum value for an enumerated/integer/binary Unicode property.
Enumerated property Joining_Type.
One more than the last constant for double Unicode properties.
Basic definitions for ICU, for both C and C++ APIs.
UBool u_isIDPart(UChar32 c)
Determines if the specified character is permissible in an identifier according to Java...
Enumerated property Lead_Canonical_Combining_Class.
Binary property ASCII_Hex_Digit.
UJoiningType
Joining Type constants.
UBool u_isIDStart(UChar32 c)
Determines if the specified character is permissible as the first character in an identifier accordin...
Binary property Soft_Dotted (new in Unicode 3.2).
One more than the last constant for string Unicode properties.
Binary property Segment_Starter.
UChar32 u_toupper(UChar32 c)
The given character is mapped to its uppercase equivalent according to UnicodeData.txt; if the character has no uppercase equivalent, the character itself is returned.
Binary property ID_Start.
int32_t u_getIntPropertyMinValue(UProperty which)
Get the minimum value for an enumerated/integer/binary Unicode property.
UBool u_isJavaSpaceChar(UChar32 c)
Determine if the specified code point is a space character according to Java.
UBool u_isMirrored(UChar32 c)
Determines whether the code point has the Bidi_Mirrored property.
Corrected name from NameAliases.txt.
Binary property Changes_When_Titlecased.
UChar32 u_tolower(UChar32 c)
The given character is mapped to its lowercase equivalent according to UnicodeData.txt; if the character has no lowercase equivalent, the character itself is returned.
Enumerated property Line_Break.
UBool u_isIDIgnorable(UChar32 c)
Determines if the specified character should be regarded as an ignorable character in an identifier...
Enumerated property Grapheme_Cluster_Break (new in Unicode 4.1).
UBool u_isdigit(UChar32 c)
Determines whether the specified code point is a digit character according to Java.
UProperty u_getPropertyEnum(const char *alias)
Return the UProperty enum for a given property name, as specified in the Unicode database file Proper...
UBool u_isWhitespace(UChar32 c)
Determines if the specified code point is a whitespace character according to Java/ICU.
Deprecated string property ISO_Comment.
UBool u_isblank(UChar32 c)
Determines whether the specified code point is a "blank" or "horizontal space", a character that visi...
UBool u_isalpha(UChar32 c)
Determines whether the specified code point is a letter character.
Binary property Default_Ignorable_Code_Point (new in Unicode 3.2).
The Unicode_1_Name property value, which is of little practical value.
int32_t u_getISOComment(UChar32 c, char *dest, int32_t destCapacity, UErrorCode *pErrorCode)
Returns an empty string.
One higher than the last enum UCharCategory constant.
UBool u_isalnum(UChar32 c)
Determines whether the specified code point is an alphanumeric character (letter or digit) according ...
Binary property Pattern_Syntax (new in Unicode 4.1).
One more than the last constant for Unicode properties with unusual value types.
UChar32 u_charMirror(UChar32 c)
Maps the specified character to a "mirror-image" character.
UBool u_isISOControl(UChar32 c)
Determines whether the specified code point is an ISO control code.
int8_t u_charType(UChar32 c)
Returns the general category value for the code point.
int32_t u_charName(UChar32 code, UCharNameChoice nameChoice, char *buffer, int32_t bufferLength, UErrorCode *pErrorCode)
Retrieve the name of a Unicode character.
#define U_STABLE
This is used to declare a function as a stable public ICU C API.
int8_t UBool
The ICU boolean type.
Enumerated property NFKD_Quick_Check.