w2n
w2n copied to clipboard
My wrapper code
I figured I'd contribute back; this has worked on 5,000 legal documents from many sources. Here is a wrapper I wrote to help support my use of W2N. Some of it handles typos in the original documents or words garbled by bad scans.
In my case I don't care about pennies, so I removed the xx/100-type text (e.g. "and 00/100"). Hope this helps someone.
# Tens words that scanned text often fuses with the following unit word
# (e.g. "twentyone"); a hyphen is inserted after them before parsing.
_TENS = ('twenty', 'thirty', 'forty', 'fifty',
         'sixty', 'seventy', 'eighty', 'ninety')

# Matches a tens word that is immediately followed by another letter.
_FUSED_TENS = re.compile(r"(%s)(?=[a-z])" % "|".join(_TENS))

# Cents fractions that are discarded: "and 00/100", "& 25/100ths", "and no/100".
_CENTS_PATTERNS = (
    r' AND \d+\/100(ths|th)?',
    r' & \d+\/100(ths|th)?',
    r' AND NO\/100?',
)

# Ordered (pattern, replacement) repairs for number words truncated or garbled
# by typos / bad scans. Order matters: a longer prefix has to be tried before
# a shorter one that it contains (e.g. "^ven" before "^ve").
_SCAN_REPAIRS = (
    ("^venty", "seventy"),
    ("^irty", "thirty"),
    ("^fty", "fifty"),
    ("^neteen", "nineteen"),
    ("^fteen", "fifteen"),
    ("eightfen", "eighteen"),
    ("^ghteen", "eighteen"),
    # Keep the leading space so the repaired word is not glued to the
    # preceding one ("one hundred ghteen" -> "one hundred eighteen").
    (" ghteen", " eighteen"),
    ("sixten", "sixteen"),
    ("^iwo ", "two "),
    ("^o ", "two "),
    ("^ven", "seven"),
    ("^even", "seven"),
    ("^ve", "five"),
    ("^x", "six"),
    ("elght", "eight"),
    ("light", "eight"),
    ("^ght", "eight"),
    ("^n ", "ten "),
    ("^nety", "ninety"),
    ("^elve", "twelve"),
)


def word2value(val):
    """Convert a spelled-out (possibly badly scanned) amount to a digit string.

    The text is lower-cased, ``$`` and ``,`` characters are removed, and a
    trailing cents fraction such as ``and 00/100`` is dropped. If what remains
    is already numeric it is returned as is. Otherwise known scan/typo
    damage is repaired, fused tens words are hyphenated ("twentyone" ->
    "twenty-one") and the result is handed to ``w2n.word_to_num``.

    Args:
        val: The text to convert, or ``None``.

    Returns:
        The number as a string when conversion succeeds; otherwise the
        cleaned-up (lower-cased, repaired) text. ``None`` and ``""`` are
        returned unchanged.

    Note:
        ``w2n`` is expected to be imported at file level (presumably the
        ``word2number`` package -- confirm against the file's imports).
    """
    if not val:
        # None or empty: nothing to convert, hand it back untouched.
        return val

    # Strip every currency sign and thousands separator, wherever it appears.
    val = re.sub(r"[$,]", "", val).lower()

    # Pennies are not of interest; remove the xx/100 fraction text.
    for pattern in _CENTS_PATTERNS:
        val = re.sub(pattern, '', val, flags=re.IGNORECASE)

    # Already digits (e.g. "$1,000" -> "1000"): no word parsing needed.
    if val.isnumeric():
        return val

    for pattern, replacement in _SCAN_REPAIRS:
        val = re.sub(pattern, replacement, val, flags=re.IGNORECASE)

    # Hyphenate every fused tens word, not just the first occurrence.
    val = _FUSED_TENS.sub(r"\1-", val)

    try:
        return str(w2n.word_to_num(val))
    except ValueError:
        # Best effort: text that cannot be parsed is returned in its
        # cleaned-up form rather than raising.
        return val