The small tool below determines whether a Unicode character is a Chinese character, a digit, an English letter, or some other character. It also converts full-width characters to half-width ones and performs similar Unicode string normalization. A separate program that converts Chinese characters (including polyphonic ones) to Pinyin also exists, but it is still being tidied up.
#!/usr/bin/env python
# -*- coding: GBK -*-
"""Chinese character processing tools.

Classify a single unicode character as a Chinese character, a digit,
an English letter, or something else, and convert between full-width
and half-width characters.
"""
# The closing quote of __author__ was a typographic quote, which left the
# string literal unterminated; the padding spaces inside both literals were
# artifacts of the same text mangling and are removed.
__author__ = "internetsweeper <zhengbin0713@gmail.com>"
__date__ = "2007-08-04"
def is_chinese(uchar):
    """Return True if the unicode character is a Chinese character.

    A character counts as Chinese when it lies in the inclusive range
    U+4E00..U+9FA5.  ``uchar`` is expected to be a single unicode
    character; longer strings are compared lexicographically.
    """
    # The range bounds need real \u escapes; u'u4e00' is the five-character
    # text "u4e00", not the code point U+4E00.
    return u'\u4e00' <= uchar <= u'\u9fa5'
def is_number(uchar):
    """Return True if the unicode character is an ASCII digit (0-9).

    Only the half-width digits U+0030..U+0039 are accepted; full-width
    digits are not matched.  ``uchar`` is expected to be a single
    unicode character.
    """
    return u'\u0030' <= uchar <= u'\u0039'
def is_alphabet(uchar):
    """Return True if the unicode character is an English letter.

    Accepts the half-width ranges A-Z (U+0041..U+005A) and a-z
    (U+0061..U+007A).  ``uchar`` is expected to be a single unicode
    character.
    """
    is_upper = u'\u0041' <= uchar <= u'\u005a'
    is_lower = u'\u0061' <= uchar <= u'\u007a'
    return is_upper or is_lower
def is_other(uchar):
    """Return True if the character is not Chinese, a digit, or a letter.

    This is the complement of ``is_chinese``, ``is_number`` and
    ``is_alphabet`` taken together.
    """
    return not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar))
def B2Q(uchar):
    """Convert a half-width character to its full-width form.

    Characters outside the half-width range U+0020..U+007E are returned
    unchanged.
    """
    inside_code = ord(uchar)
    # Not a half-width character: return the original character.
    if inside_code < 0x0020 or inside_code > 0x7e:
        return uchar
    if inside_code == 0x0020:
        # Space is the one exception to the offset rule below.
        inside_code = 0x3000
    else:
        # For everything else: full-width = half-width + 0xfee0.
        inside_code += 0xfee0
    # unichr only exists on Python 2; chr is its Python 3 equivalent.
    try:
        return unichr(inside_code)  # noqa: F821
    except NameError:
        return chr(inside_code)
def Q2B(uchar):
    """Convert a full-width character to its half-width form.

    If the converted code does not land in the half-width range
    U+0020..U+007E, the original character is returned unchanged.
    """
    inside_code = ord(uchar)
    if inside_code == 0x3000:
        # The ideographic space maps to the plain ASCII space.
        inside_code = 0x0020
    else:
        # For everything else: half-width = full-width - 0xfee0.
        inside_code -= 0xfee0
    # After conversion, anything that is not a half-width character means
    # the input was not full-width: return the original character.
    if inside_code < 0x0020 or inside_code > 0x7e:
        return uchar
    # unichr only exists on Python 2; chr is its Python 3 equivalent.
    try:
        return unichr(inside_code)  # noqa: F821
    except NameError:
        return chr(inside_code)
def stringQ2B(ustring):
    """Convert every full-width character in a string to half-width.

    Each character is passed through ``Q2B``; characters that have no
    half-width form are kept as they are.
    """
    return "".join(Q2B(uchar) for uchar in ustring)
def uniform(ustring):
    """Normalize a string: full-width to half-width, then lower-case.

    Returns the result of ``stringQ2B`` on ``ustring`` with upper-case
    letters converted to lower-case.
    """
    return stringQ2B(ustring).lower()
def string2List(ustring):
    """Split a string into runs of Chinese characters, letters and digits.

    Every character for which ``is_other`` is true acts as a separator
    and is dropped.  Consecutive non-separator characters are joined
    into one item, so no empty strings appear in the result.

    Returns a list of the collected runs, in order of appearance.
    """
    retList = []
    utmp = []
    for uchar in ustring:
        if not is_other(uchar):
            utmp.append(uchar)
        elif utmp:
            # A separator closes the run collected so far.
            retList.append("".join(utmp))
            utmp = []
    # Flush the final run when the string does not end with a separator.
    if utmp:
        retList.append("".join(utmp))
    return retList
# The guard string had padding spaces and a typographic closing quote, so it
# could never equal "__main__" (and did not even parse).
if __name__ == "__main__":
    # unichr only exists on Python 2; chr is its Python 3 equivalent.
    try:
        to_char = unichr  # noqa: F821
    except NameError:
        to_char = chr

    # Test Q2B and B2Q: print each half-width character recovered from the
    # round trip next to its full-width form.
    for i in range(0x0020, 0x007F):
        full_width = B2Q(to_char(i))
        # A single formatted string prints the same on Python 2 and 3.
        print(u"%s %s" % (Q2B(full_width), full_width))

    # Test uniform
    ustring = u'zhongguo personal name a high frequency A'
    ustring = uniform(ustring)
    ret = string2List(ustring)
    print(ret)