Unicode

Characters are not bytes and bytes are not characters (not any more). The Python string type supports Unicode, a standard that defines a numeric value (a "code point") for each of a very large set of characters.

ASCII is a subset of Unicode but is limited to the first 128 characters (code points 0 to 127).

In [1]:
# Show each ASCII code point from 32 (space) to 127 in binary next to its
# character. 32-126 are the printable characters; 127 is the DEL control
# character, so nothing visible is printed for it.
for i in range(32, 128):
    print(format(i, '#b'), chr(i))
0b100000  
0b100001 !
0b100010 "
0b100011 #
0b100100 $
0b100101 %
0b100110 &
0b100111 '
0b101000 (
0b101001 )
0b101010 *
0b101011 +
0b101100 ,
0b101101 -
0b101110 .
0b101111 /
0b110000 0
0b110001 1
0b110010 2
0b110011 3
0b110100 4
0b110101 5
0b110110 6
0b110111 7
0b111000 8
0b111001 9
0b111010 :
0b111011 ;
0b111100 <
0b111101 =
0b111110 >
0b111111 ?
0b1000000 @
0b1000001 A
0b1000010 B
0b1000011 C
0b1000100 D
0b1000101 E
0b1000110 F
0b1000111 G
0b1001000 H
0b1001001 I
0b1001010 J
0b1001011 K
0b1001100 L
0b1001101 M
0b1001110 N
0b1001111 O
0b1010000 P
0b1010001 Q
0b1010010 R
0b1010011 S
0b1010100 T
0b1010101 U
0b1010110 V
0b1010111 W
0b1011000 X
0b1011001 Y
0b1011010 Z
0b1011011 [
0b1011100 \
0b1011101 ]
0b1011110 ^
0b1011111 _
0b1100000 `
0b1100001 a
0b1100010 b
0b1100011 c
0b1100100 d
0b1100101 e
0b1100110 f
0b1100111 g
0b1101000 h
0b1101001 i
0b1101010 j
0b1101011 k
0b1101100 l
0b1101101 m
0b1101110 n
0b1101111 o
0b1110000 p
0b1110001 q
0b1110010 r
0b1110011 s
0b1110100 t
0b1110101 u
0b1110110 v
0b1110111 w
0b1111000 x
0b1111001 y
0b1111010 z
0b1111011 {
0b1111100 |
0b1111101 }
0b1111110 ~
0b1111111 
In [2]:
# Code points of 128 and above lie outside ASCII.
# Print 64 consecutive code points (hex value and character) beginning at a
# randomly chosen start, so the output differs on every run.
from random import randint

start = randint(128, 20000)
for i in range(start, start + 64):
    print(format(i, '#x'), chr(i), end=' .. ')
0x3211 ㈑ .. 0x3212 ㈒ .. 0x3213 ㈓ .. 0x3214 ㈔ .. 0x3215 ㈕ .. 0x3216 ㈖ .. 0x3217 ㈗ .. 0x3218 ㈘ .. 0x3219 ㈙ .. 0x321a ㈚ .. 0x321b ㈛ .. 0x321c ㈜ .. 0x321d ㈝ .. 0x321e ㈞ .. 0x321f ㈟ .. 0x3220 ㈠ .. 0x3221 ㈡ .. 0x3222 ㈢ .. 0x3223 ㈣ .. 0x3224 ㈤ .. 0x3225 ㈥ .. 0x3226 ㈦ .. 0x3227 ㈧ .. 0x3228 ㈨ .. 0x3229 ㈩ .. 0x322a ㈪ .. 0x322b ㈫ .. 0x322c ㈬ .. 0x322d ㈭ .. 0x322e ㈮ .. 0x322f ㈯ .. 0x3230 ㈰ .. 0x3231 ㈱ .. 0x3232 ㈲ .. 0x3233 ㈳ .. 0x3234 ㈴ .. 0x3235 ㈵ .. 0x3236 ㈶ .. 0x3237 ㈷ .. 0x3238 ㈸ .. 0x3239 ㈹ .. 0x323a ㈺ .. 0x323b ㈻ .. 0x323c ㈼ .. 0x323d ㈽ .. 0x323e ㈾ .. 0x323f ㈿ .. 0x3240 ㉀ .. 0x3241 ㉁ .. 0x3242 ㉂ .. 0x3243 ㉃ .. 0x3244 ㉄ .. 0x3245 ㉅ .. 0x3246 ㉆ .. 0x3247 ㉇ .. 0x3248 ㉈ .. 0x3249 ㉉ .. 0x324a ㉊ .. 0x324b ㉋ .. 0x324c ㉌ .. 0x324d ㉍ .. 0x324e ㉎ .. 0x324f ㉏ .. 0x3250 ㉐ .. 
In [3]:
# ord() gives the integer code point (ordinal value) of a single character;
# it is shown here in hexadecimal
c = '਱'
print(c, format(ord(c), '#x'))
# chr() is the inverse: it gives the character for an integer code point
# (the literal 0x2551 is a plain int, so print() shows it in decimal)
print(0x2551, chr(0x2551))
਱ 0xa31
9553 ║
In [4]:
# The same one-character string occupies a different number of bytes
# depending on the encoding chosen.
c = chr(0x2551)
utf8_bytes = c.encode('utf-8')
# Note: the 'utf-16' codec prepends a 2-byte byte-order mark (BOM), which is
# counted in the length printed below.
utf16_bytes = c.encode('utf-16')
print(c, len(c), len(utf8_bytes), len(utf16_bytes))
# str holds characters; encode() always returns a bytes object
print(type(c), type(utf8_bytes), type(utf16_bytes))
║ 1 3 4
<class 'str'> <class 'bytes'> <class 'bytes'>
In [5]:
# Four strings of length one whose encoded size differs.
# For every character print: the character, its code point in hex, its
# length as a str, its UTF-8 byte count and its UTF-16 byte count.
# Note: the 'utf-16' codec prepends a 2-byte byte-order mark (BOM), which is
# included in the UTF-16 count.
s = 'RÖ猫𐒎'
for c in s:
    utf8_len = len(c.encode('utf-8'))
    utf16_len = len(c.encode('utf-16'))
    print(c, format(ord(c), '#x'), len(c), utf8_len, utf16_len)
R 0x52 1 1 4
Ö 0xd6 1 2 4
猫 0x732b 1 3 4
𐒎 0x1048e 1 4 6