phonenumbers.unicode_util
index
phonenumbers/unicode_util.py

Unicode utility functions
 
>>> from . import unicode_util
>>> from .util import u
>>> u1 = '1'  # DIGIT ONE
>>> u2 = u('a')  # LATIN SMALL LETTER A
>>> u3 = u('\uFF12')  # FULLWIDTH DIGIT TWO
>>> u4 = u('\u0100')  # LATIN CAPITAL LETTER A WITH MACRON
>>> unicode_util.Category.get(u1) == u('Nd')
True
>>> unicode_util.Category.get(u2) == u('Ll')
True
>>> unicode_util.Category.get(u3) == u('Nd')
True
>>> unicode_util.Category.get(u4) == u('Lu')
True
>>> unicode_util.Category.get(u2) == unicode_util.Category.LOWERCASE_LETTER
True
>>> try:
...     beyond_bmp = u('\U00010100')  # AEGEAN WORD SEPARATOR LINE
... except Exception:
...     beyond_bmp = u('')
>>> if len(beyond_bmp) == 1:  # We have a UCS4 build of Python
...     cat_po = unicode_util.Category.get(beyond_bmp)
... else:  # UCS2 build of Python; no non-BMP chars available
...     cat_po = unicode_util.Category.OTHER_PUNCTUATION
>>> cat_po == u('Po')
True
>>> unicode_util.is_letter(u1)
False
>>> unicode_util.is_letter(u2)
True
>>> unicode_util.is_letter(u3)
False
>>> unicode_util.is_letter(u4)
True
>>> b1 = unicode_util.Block.get(u1)
>>> str(b1)
'Block[0000, 007f]'
>>> b1 == unicode_util.Block.BASIC_LATIN
True
>>> b1 == [0x0000, 0x0075]
False
>>> b2 = unicode_util.Block.get(u2)
>>> b2 == unicode_util.Block.BASIC_LATIN
True
>>> b3 = unicode_util.Block.get(u3)
>>> b3 != unicode_util.Block.BASIC_LATIN
True
>>> b3 == unicode_util.Block.HALFWIDTH_AND_FULLWIDTH_FORMS
True
>>> b4 = unicode_util.Block.get(u4)
>>> b4 == unicode_util.Block.LATIN_EXTENDED_A
True
>>> unicode_util.Block.get(u('\u0860')) == unicode_util.Block.UNKNOWN
True
>>> try:
...     unknown_block = u('\U000104F0')
... except Exception:
...     unknown_block = u('')
>>> if len(unknown_block) == 1:  # We have a UCS4 build of Python
...     unicode_util.Block.get(u('\U000104F0')) == unicode_util.Block.UNKNOWN
... else:  # UCS2 build of Python; no unknown characters available
...     True
True
>>> unicode_util.digit(u1)
1
>>> unicode_util.digit(u2, -1)
-1
>>> unicode_util.digit(u3, -1)
2
>>> str(hash(b3))  # doctest: +ELLIPSIS
'...'

 
Modules
       
bisect
unicodedata

 
Classes
       
builtins.object
Block
Category

 
class Block(builtins.object)
    Description of the possible Unicode blocks
 
  Class methods defined here:
get(uni_char) from builtins.type
Return the Unicode block of the given Unicode character

Data descriptors defined here:
__dict__
dictionary for instance variables (if defined)
__weakref__
list of weak references to the object (if defined)

Data and other attributes defined here:
AEGEAN_NUMBERS = <phonenumbers.unicode_util._BlockRange object>
ALCHEMICAL_SYMBOLS = <phonenumbers.unicode_util._BlockRange object>
ALPHABETIC_PRESENTATION_FORMS = <phonenumbers.unicode_util._BlockRange object>
ANCIENT_GREEK_MUSICAL_NOTATION = <phonenumbers.unicode_util._BlockRange object>
ANCIENT_GREEK_NUMBERS = <phonenumbers.unicode_util._BlockRange object>
ANCIENT_SYMBOLS = <phonenumbers.unicode_util._BlockRange object>
ARABIC = <phonenumbers.unicode_util._BlockRange object>
ARABIC_PRESENTATION_FORMS_A = <phonenumbers.unicode_util._BlockRange object>
ARABIC_PRESENTATION_FORMS_B = <phonenumbers.unicode_util._BlockRange object>
ARABIC_SUPPLEMENT = <phonenumbers.unicode_util._BlockRange object>
ARMENIAN = <phonenumbers.unicode_util._BlockRange object>
ARROWS = <phonenumbers.unicode_util._BlockRange object>
AVESTAN = <phonenumbers.unicode_util._BlockRange object>
BALINESE = <phonenumbers.unicode_util._BlockRange object>
BAMUM = <phonenumbers.unicode_util._BlockRange object>
BAMUM_SUPPLEMENT = <phonenumbers.unicode_util._BlockRange object>
BASIC_LATIN = <phonenumbers.unicode_util._BlockRange object>
BATAK = <phonenumbers.unicode_util._BlockRange object>
BENGALI = <phonenumbers.unicode_util._BlockRange object>
BLOCK_ELEMENTS = <phonenumbers.unicode_util._BlockRange object>
BOPOMOFO = <phonenumbers.unicode_util._BlockRange object>
BOPOMOFO_EXTENDED = <phonenumbers.unicode_util._BlockRange object>
BOX_DRAWING = <phonenumbers.unicode_util._BlockRange object>
BRAHMI = <phonenumbers.unicode_util._BlockRange object>
BRAILLE_PATTERNS = <phonenumbers.unicode_util._BlockRange object>
BUGINESE = <phonenumbers.unicode_util._BlockRange object>
BUHID = <phonenumbers.unicode_util._BlockRange object>
BYZANTINE_MUSICAL_SYMBOLS = <phonenumbers.unicode_util._BlockRange object>
CARIAN = <phonenumbers.unicode_util._BlockRange object>
CHAM = <phonenumbers.unicode_util._BlockRange object>
CHEROKEE = <phonenumbers.unicode_util._BlockRange object>
CJK_COMPATIBILITY = <phonenumbers.unicode_util._BlockRange object>
CJK_COMPATIBILITY_FORMS = <phonenumbers.unicode_util._BlockRange object>
CJK_COMPATIBILITY_IDEOGRAPHS = <phonenumbers.unicode_util._BlockRange object>
CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT = <phonenumbers.unicode_util._BlockRange object>
CJK_RADICALS_SUPPLEMENT = <phonenumbers.unicode_util._BlockRange object>
CJK_STROKES = <phonenumbers.unicode_util._BlockRange object>
CJK_SYMBOLS_AND_PUNCTUATION = <phonenumbers.unicode_util._BlockRange object>
CJK_UNIFIED_IDEOGRAPHS = <phonenumbers.unicode_util._BlockRange object>
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = <phonenumbers.unicode_util._BlockRange object>
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B = <phonenumbers.unicode_util._BlockRange object>
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C = <phonenumbers.unicode_util._BlockRange object>
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D = <phonenumbers.unicode_util._BlockRange object>
COMBINING_DIACRITICAL_MARKS = <phonenumbers.unicode_util._BlockRange object>
COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS = <phonenumbers.unicode_util._BlockRange object>
COMBINING_DIACRITICAL_MARKS_SUPPLEMENT = <phonenumbers.unicode_util._BlockRange object>
COMBINING_HALF_MARKS = <phonenumbers.unicode_util._BlockRange object>
COMMON_INDIC_NUMBER_FORMS = <phonenumbers.unicode_util._BlockRange object>
CONTROL_PICTURES = <phonenumbers.unicode_util._BlockRange object>
COPTIC = <phonenumbers.unicode_util._BlockRange object>
COUNTING_ROD_NUMERALS = <phonenumbers.unicode_util._BlockRange object>
CUNEIFORM = <phonenumbers.unicode_util._BlockRange object>
CUNEIFORM_NUMBERS_AND_PUNCTUATION = <phonenumbers.unicode_util._BlockRange object>
CURRENCY_SYMBOLS = <phonenumbers.unicode_util._BlockRange object>
CYPRIOT_SYLLABARY = <phonenumbers.unicode_util._BlockRange object>
CYRILLIC = <phonenumbers.unicode_util._BlockRange object>
CYRILLIC_EXTENDED_A = <phonenumbers.unicode_util._BlockRange object>
CYRILLIC_EXTENDED_B = <phonenumbers.unicode_util._BlockRange object>
CYRILLIC_SUPPLEMENT = <phonenumbers.unicode_util._BlockRange object>
DESERET = <phonenumbers.unicode_util._BlockRange object>
DEVANAGARI = <phonenumbers.unicode_util._BlockRange object>
DEVANAGARI_EXTENDED = <phonenumbers.unicode_util._BlockRange object>
DINGBATS = <phonenumbers.unicode_util._BlockRange object>
DOMINO_TILES = <phonenumbers.unicode_util._BlockRange object>
EGYPTIAN_HIEROGLYPHS = <phonenumbers.unicode_util._BlockRange object>
EMOTICONS = <phonenumbers.unicode_util._BlockRange object>
ENCLOSED_ALPHANUMERICS = <phonenumbers.unicode_util._BlockRange object>
ENCLOSED_ALPHANUMERIC_SUPPLEMENT = <phonenumbers.unicode_util._BlockRange object>
ENCLOSED_CJK_LETTERS_AND_MONTHS = <phonenumbers.unicode_util._BlockRange object>
ENCLOSED_IDEOGRAPHIC_SUPPLEMENT = <phonenumbers.unicode_util._BlockRange object>
ETHIOPIC = <phonenumbers.unicode_util._BlockRange object>
ETHIOPIC_EXTENDED = <phonenumbers.unicode_util._BlockRange object>
ETHIOPIC_EXTENDED_A = <phonenumbers.unicode_util._BlockRange object>
ETHIOPIC_SUPPLEMENT = <phonenumbers.unicode_util._BlockRange object>
GENERAL_PUNCTUATION = <phonenumbers.unicode_util._BlockRange object>
GEOMETRIC_SHAPES = <phonenumbers.unicode_util._BlockRange object>
GEORGIAN = <phonenumbers.unicode_util._BlockRange object>
GEORGIAN_SUPPLEMENT = <phonenumbers.unicode_util._BlockRange object>
GLAGOLITIC = <phonenumbers.unicode_util._BlockRange object>
GOTHIC = <phonenumbers.unicode_util._BlockRange object>
GREEK_AND_COPTIC = <phonenumbers.unicode_util._BlockRange object>
GREEK_EXTENDED = <phonenumbers.unicode_util._BlockRange object>
GUJARATI = <phonenumbers.unicode_util._BlockRange object>
GURMUKHI = <phonenumbers.unicode_util._BlockRange object>
HALFWIDTH_AND_FULLWIDTH_FORMS = <phonenumbers.unicode_util._BlockRange object>
HANGUL_COMPATIBILITY_JAMO = <phonenumbers.unicode_util._BlockRange object>
HANGUL_JAMO = <phonenumbers.unicode_util._BlockRange object>
HANGUL_JAMO_EXTENDED_A = <phonenumbers.unicode_util._BlockRange object>
HANGUL_JAMO_EXTENDED_B = <phonenumbers.unicode_util._BlockRange object>
HANGUL_SYLLABLES = <phonenumbers.unicode_util._BlockRange object>
HANUNOO = <phonenumbers.unicode_util._BlockRange object>
HEBREW = <phonenumbers.unicode_util._BlockRange object>
HIGH_PRIVATE_USE_SURROGATES = <phonenumbers.unicode_util._BlockRange object>
HIGH_SURROGATES = <phonenumbers.unicode_util._BlockRange object>
HIRAGANA = <phonenumbers.unicode_util._BlockRange object>
IDEOGRAPHIC_DESCRIPTION_CHARACTERS = <phonenumbers.unicode_util._BlockRange object>
IMPERIAL_ARAMAIC = <phonenumbers.unicode_util._BlockRange object>
INSCRIPTIONAL_PAHLAVI = <phonenumbers.unicode_util._BlockRange object>
INSCRIPTIONAL_PARTHIAN = <phonenumbers.unicode_util._BlockRange object>
IPA_EXTENSIONS = <phonenumbers.unicode_util._BlockRange object>
JAVANESE = <phonenumbers.unicode_util._BlockRange object>
KAITHI = <phonenumbers.unicode_util._BlockRange object>
KANA_SUPPLEMENT = <phonenumbers.unicode_util._BlockRange object>
KANBUN = <phonenumbers.unicode_util._BlockRange object>
KANGXI_RADICALS = <phonenumbers.unicode_util._BlockRange object>
KANNADA = <phonenumbers.unicode_util._BlockRange object>
KATAKANA = <phonenumbers.unicode_util._BlockRange object>
KATAKANA_PHONETIC_EXTENSIONS = <phonenumbers.unicode_util._BlockRange object>
KAYAH_LI = <phonenumbers.unicode_util._BlockRange object>
KHAROSHTHI = <phonenumbers.unicode_util._BlockRange object>
KHMER = <phonenumbers.unicode_util._BlockRange object>
KHMER_SYMBOLS = <phonenumbers.unicode_util._BlockRange object>
LAO = <phonenumbers.unicode_util._BlockRange object>
LATIN_1_SUPPLEMENT = <phonenumbers.unicode_util._BlockRange object>
LATIN_EXTENDED_A = <phonenumbers.unicode_util._BlockRange object>
LATIN_EXTENDED_ADDITIONAL = <phonenumbers.unicode_util._BlockRange object>
LATIN_EXTENDED_B = <phonenumbers.unicode_util._BlockRange object>
LATIN_EXTENDED_C = <phonenumbers.unicode_util._BlockRange object>
LATIN_EXTENDED_D = <phonenumbers.unicode_util._BlockRange object>
LEPCHA = <phonenumbers.unicode_util._BlockRange object>
LETTERLIKE_SYMBOLS = <phonenumbers.unicode_util._BlockRange object>
LIMBU = <phonenumbers.unicode_util._BlockRange object>
LINEAR_B_IDEOGRAMS = <phonenumbers.unicode_util._BlockRange object>
LINEAR_B_SYLLABARY = <phonenumbers.unicode_util._BlockRange object>
LISU = <phonenumbers.unicode_util._BlockRange object>
LOW_SURROGATES = <phonenumbers.unicode_util._BlockRange object>
LYCIAN = <phonenumbers.unicode_util._BlockRange object>
LYDIAN = <phonenumbers.unicode_util._BlockRange object>
MAHJONG_TILES = <phonenumbers.unicode_util._BlockRange object>
MALAYALAM = <phonenumbers.unicode_util._BlockRange object>
MANDAIC = <phonenumbers.unicode_util._BlockRange object>
MATHEMATICAL_ALPHANUMERIC_SYMBOLS = <phonenumbers.unicode_util._BlockRange object>
MATHEMATICAL_OPERATORS = <phonenumbers.unicode_util._BlockRange object>
MEETEI_MAYEK = <phonenumbers.unicode_util._BlockRange object>
MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A = <phonenumbers.unicode_util._BlockRange object>
MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B = <phonenumbers.unicode_util._BlockRange object>
MISCELLANEOUS_SYMBOLS = <phonenumbers.unicode_util._BlockRange object>
MISCELLANEOUS_SYMBOLS_AND_ARROWS = <phonenumbers.unicode_util._BlockRange object>
MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS = <phonenumbers.unicode_util._BlockRange object>
MISCELLANEOUS_TECHNICAL = <phonenumbers.unicode_util._BlockRange object>
MODIFIER_TONE_LETTERS = <phonenumbers.unicode_util._BlockRange object>
MONGOLIAN = <phonenumbers.unicode_util._BlockRange object>
MUSICAL_SYMBOLS = <phonenumbers.unicode_util._BlockRange object>
MYANMAR = <phonenumbers.unicode_util._BlockRange object>
MYANMAR_EXTENDED_A = <phonenumbers.unicode_util._BlockRange object>
NEW_TAI_LUE = <phonenumbers.unicode_util._BlockRange object>
NKO = <phonenumbers.unicode_util._BlockRange object>
NUMBER_FORMS = <phonenumbers.unicode_util._BlockRange object>
OGHAM = <phonenumbers.unicode_util._BlockRange object>
OLD_ITALIC = <phonenumbers.unicode_util._BlockRange object>
OLD_PERSIAN = <phonenumbers.unicode_util._BlockRange object>
OLD_SOUTH_ARABIAN = <phonenumbers.unicode_util._BlockRange object>
OLD_TURKIC = <phonenumbers.unicode_util._BlockRange object>
OL_CHIKI = <phonenumbers.unicode_util._BlockRange object>
OPTICAL_CHARACTER_RECOGNITION = <phonenumbers.unicode_util._BlockRange object>
ORIYA = <phonenumbers.unicode_util._BlockRange object>
OSMANYA = <phonenumbers.unicode_util._BlockRange object>
PHAGS_PA = <phonenumbers.unicode_util._BlockRange object>
PHAISTOS_DISC = <phonenumbers.unicode_util._BlockRange object>
PHOENICIAN = <phonenumbers.unicode_util._BlockRange object>
PHONETIC_EXTENSIONS = <phonenumbers.unicode_util._BlockRange object>
PHONETIC_EXTENSIONS_SUPPLEMENT = <phonenumbers.unicode_util._BlockRange object>
PLAYING_CARDS = <phonenumbers.unicode_util._BlockRange object>
PRIVATE_USE_AREA = <phonenumbers.unicode_util._BlockRange object>
REJANG = <phonenumbers.unicode_util._BlockRange object>
RUMI_NUMERAL_SYMBOLS = <phonenumbers.unicode_util._BlockRange object>
RUNIC = <phonenumbers.unicode_util._BlockRange object>
SAMARITAN = <phonenumbers.unicode_util._BlockRange object>
SAURASHTRA = <phonenumbers.unicode_util._BlockRange object>
SHAVIAN = <phonenumbers.unicode_util._BlockRange object>
SINHALA = <phonenumbers.unicode_util._BlockRange object>
SMALL_FORM_VARIANTS = <phonenumbers.unicode_util._BlockRange object>
SPACING_MODIFIER_LETTERS = <phonenumbers.unicode_util._BlockRange object>
SPECIALS = <phonenumbers.unicode_util._BlockRange object>
SUNDANESE = <phonenumbers.unicode_util._BlockRange object>
SUPERSCRIPTS_AND_SUBSCRIPTS = <phonenumbers.unicode_util._BlockRange object>
SUPPLEMENTAL_ARROWS_A = <phonenumbers.unicode_util._BlockRange object>
SUPPLEMENTAL_ARROWS_B = <phonenumbers.unicode_util._BlockRange object>
SUPPLEMENTAL_MATHEMATICAL_OPERATORS = <phonenumbers.unicode_util._BlockRange object>
SUPPLEMENTAL_PUNCTUATION = <phonenumbers.unicode_util._BlockRange object>
SUPPLEMENTARY_PRIVATE_USE_AREA_A = <phonenumbers.unicode_util._BlockRange object>
SUPPLEMENTARY_PRIVATE_USE_AREA_B = <phonenumbers.unicode_util._BlockRange object>
SYLOTI_NAGRI = <phonenumbers.unicode_util._BlockRange object>
SYRIAC = <phonenumbers.unicode_util._BlockRange object>
TAGALOG = <phonenumbers.unicode_util._BlockRange object>
TAGBANWA = <phonenumbers.unicode_util._BlockRange object>
TAGS = <phonenumbers.unicode_util._BlockRange object>
TAI_LE = <phonenumbers.unicode_util._BlockRange object>
TAI_THAM = <phonenumbers.unicode_util._BlockRange object>
TAI_VIET = <phonenumbers.unicode_util._BlockRange object>
TAI_XUAN_JING_SYMBOLS = <phonenumbers.unicode_util._BlockRange object>
TAMIL = <phonenumbers.unicode_util._BlockRange object>
TELUGU = <phonenumbers.unicode_util._BlockRange object>
THAANA = <phonenumbers.unicode_util._BlockRange object>
THAI = <phonenumbers.unicode_util._BlockRange object>
TIBETAN = <phonenumbers.unicode_util._BlockRange object>
TIFINAGH = <phonenumbers.unicode_util._BlockRange object>
TRANSPORT_AND_MAP_SYMBOLS = <phonenumbers.unicode_util._BlockRange object>
UGARITIC = <phonenumbers.unicode_util._BlockRange object>
UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = <phonenumbers.unicode_util._BlockRange object>
UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED = <phonenumbers.unicode_util._BlockRange object>
UNKNOWN = <phonenumbers.unicode_util._BlockRange object>
VAI = <phonenumbers.unicode_util._BlockRange object>
VARIATION_SELECTORS = <phonenumbers.unicode_util._BlockRange object>
VARIATION_SELECTORS_SUPPLEMENT = <phonenumbers.unicode_util._BlockRange object>
VEDIC_EXTENSIONS = <phonenumbers.unicode_util._BlockRange object>
VERTICAL_FORMS = <phonenumbers.unicode_util._BlockRange object>
YIJING_HEXAGRAM_SYMBOLS = <phonenumbers.unicode_util._BlockRange object>
YI_RADICALS = <phonenumbers.unicode_util._BlockRange object>
YI_SYLLABLES = <phonenumbers.unicode_util._BlockRange object>

 
class Category(builtins.object)
    General category of a Unicode character.
 
See http://www.unicode.org/reports/tr18/#Categories
 
  Class methods defined here:
get(uni_char) from builtins.type
Return the general category code (as Unicode string) for the given Unicode character

Data descriptors defined here:
__dict__
dictionary for instance variables (if defined)
__weakref__
list of weak references to the object (if defined)

Data and other attributes defined here:
CLOSE_PUNCTUATION = 'Pe'
CONNECTOR_PUNCTUATION = 'Pc'
CONTROL = 'Cc'
CURRENCY_SYMBOL = 'Sc'
DASH_PUNCTUATION = 'Pd'
DECIMAL_DIGIT_NUMBER = 'Nd'
ENCLOSING_MARK = 'Me'
FINAL_PUNCTUATION = 'Pf'
FORMAT = 'Cf'
INITIAL_PUNCTUATION = 'Pi'
LETTER = 'L'
LETTER_NUMBER = 'Nl'
LINE_SEPARATOR = 'Zl'
LOWERCASE_LETTER = 'Ll'
MARK = 'M'
MATH_SYMBOL = 'Sm'
MODIFIER_LETTER = 'Lm'
MODIFIER_SYMBOL = 'Sk'
NON_SPACING_MARK = 'Mn'
NOT_ASSIGNED = 'Cn'
NUMBER = 'N'
OPEN_PUNCTUATION = 'Ps'
OTHER = 'C'
OTHER_LETTER = 'Lo'
OTHER_NUMBER = 'No'
OTHER_PUNCTUATION = 'Po'
OTHER_SYMBOL = 'So'
PARAGRAPH_SEPARATOR = 'Zp'
PRIVATE_USE = 'Co'
PUNCTUATION = 'P'
SEPARATOR = 'Z'
SPACE_SEPARATOR = 'Zs'
SPACING_COMBINING_MARK = 'Mc'
SURROGATE = 'Cs'
SYMBOL = 'S'
TITLECASE_LETTER = 'Lt'
UPPERCASE_LETTER = 'Lu'

 
Functions
       
digit(uni_char, default_value=None)
Returns the digit value assigned to the Unicode character uni_char as an
integer. If no such value is defined, default_value is returned, or, if
default_value is not given, ValueError is raised.
is_letter(uni_char)
Determine whether the given Unicode character is a Unicode letter