Unicode

Characters are not bytes and bytes are not characters (not any more). The Python string type supports Unicode, a standard that assigns a numeric value (a "code point") to each of a very large set of characters.

ASCII is a subset of Unicode, limited to the first 128 code points (0–127).

In [6]:
# ASCII covers code points 0-127. Values below 32 are control characters, so
# the table starts at 32 (space). The final value, 127 (DEL), is also a
# non-printing control character.
for i in range(0x20, 0x80):
    print(f"{i:#x} {chr(i)}", end=" ")
0x20   0x21 ! 0x22 " 0x23 # 0x24 $ 0x25 % 0x26 & 0x27 ' 0x28 ( 0x29 ) 0x2a * 0x2b + 0x2c , 0x2d - 0x2e . 0x2f / 0x30 0 0x31 1 0x32 2 0x33 3 0x34 4 0x35 5 0x36 6 0x37 7 0x38 8 0x39 9 0x3a : 0x3b ; 0x3c < 0x3d = 0x3e > 0x3f ? 0x40 @ 0x41 A 0x42 B 0x43 C 0x44 D 0x45 E 0x46 F 0x47 G 0x48 H 0x49 I 0x4a J 0x4b K 0x4c L 0x4d M 0x4e N 0x4f O 0x50 P 0x51 Q 0x52 R 0x53 S 0x54 T 0x55 U 0x56 V 0x57 W 0x58 X 0x59 Y 0x5a Z 0x5b [ 0x5c \ 0x5d ] 0x5e ^ 0x5f _ 0x60 ` 0x61 a 0x62 b 0x63 c 0x64 d 0x65 e 0x66 f 0x67 g 0x68 h 0x69 i 0x6a j 0x6b k 0x6c l 0x6d m 0x6e n 0x6f o 0x70 p 0x71 q 0x72 r 0x73 s 0x74 t 0x75 u 0x76 v 0x77 w 0x78 x 0x79 y 0x7a z 0x7b { 0x7c | 0x7d } 0x7e ~ 0x7f  
In [71]:
# Code points from 128 upwards lie outside ASCII. Pick a random starting
# point and show the 64 consecutive characters that begin there.
from random import randint
start = randint(128, 20000)
for i in range(start, start + 64):
    print(format(i, "#x"), chr(i), end=" ")
0x4b98 䮘 0x4b99 䮙 0x4b9a 䮚 0x4b9b 䮛 0x4b9c 䮜 0x4b9d 䮝 0x4b9e 䮞 0x4b9f 䮟 0x4ba0 䮠 0x4ba1 䮡 0x4ba2 䮢 0x4ba3 䮣 0x4ba4 䮤 0x4ba5 䮥 0x4ba6 䮦 0x4ba7 䮧 0x4ba8 䮨 0x4ba9 䮩 0x4baa 䮪 0x4bab 䮫 0x4bac 䮬 0x4bad 䮭 0x4bae 䮮 0x4baf 䮯 0x4bb0 䮰 0x4bb1 䮱 0x4bb2 䮲 0x4bb3 䮳 0x4bb4 䮴 0x4bb5 䮵 0x4bb6 䮶 0x4bb7 䮷 0x4bb8 䮸 0x4bb9 䮹 0x4bba 䮺 0x4bbb 䮻 0x4bbc 䮼 0x4bbd 䮽 0x4bbe 䮾 0x4bbf 䮿 0x4bc0 䯀 0x4bc1 䯁 0x4bc2 䯂 0x4bc3 䯃 0x4bc4 䯄 0x4bc5 䯅 0x4bc6 䯆 0x4bc7 䯇 0x4bc8 䯈 0x4bc9 䯉 0x4bca 䯊 0x4bcb 䯋 0x4bcc 䯌 0x4bcd 䯍 0x4bce 䯎 0x4bcf 䯏 0x4bd0 䯐 0x4bd1 䯑 0x4bd2 䯒 0x4bd3 䯓 0x4bd4 䯔 0x4bd5 䯕 0x4bd6 䯖 0x4bd7 䯗 
In [34]:
# ord() gives the number of a character: its ordinal value, also called its
# Unicode "code point".
c = "猫"
print(f"{c} {ord(c):#x}")
# chr() is the inverse: it gives the character for a code point.
print(f"{0x732b} {chr(0x732b)}")
猫 0x732b
29483 猫
In [74]:
# The same single character needs a different number of bytes depending on
# the encoding. Python's 'utf-16' codec prepends a 2-byte byte-order mark
# (BOM), and that BOM is included in the UTF-16 length printed here.
c = chr(0x732b)
utf8_bytes = c.encode("utf-8")
utf16_bytes = c.encode("utf-16")
print(c, len(c), len(utf8_bytes), len(utf16_bytes))
# Encoding turns a str (characters) into bytes.
print(type(c), type(utf8_bytes))
猫 1 3 4
<class 'str'> <class 'bytes'>
In [52]:
# Every element of a str is exactly one character (len 1), but the number of
# bytes needed to encode it varies from character to character.
# The 'utf-16' figures include the 2-byte byte-order mark the codec prepends.
s = "RÖ猫𐒎"
for c in s:
    byte_counts = [len(c.encode(codec)) for codec in ("utf-8", "utf-16")]
    print(c, hex(ord(c)), len(c), *byte_counts)
R 0x52 1 1 4
Ö 0xd6 1 2 4
猫 0x732b 1 3 4
𐒎 0x1048e 1 4 6