Detecting English (or other languages) text
Determining whether a message is English (or any other language, for that matter) can be accomplished by loading a dictionary, calculating the percentage of the message's words that appear in that dictionary and the percentage of the message's characters that are letters, and checking that both percentages exceed a threshold.
First you need to load your dictionary file. I'm using /usr/share/dict/american-english as my dictionary, but you can pass in your own in the file argument.
import string
ENGLISH_WORDS = {}


def load_language_file(file):
    """Add every word listed in a dictionary file to ENGLISH_WORDS.

    Args:
        file: Path to a plain-text word list containing one word per line.

    Words are stored upper-cased so that later lookups can be
    case-insensitive. Blank lines are skipped; otherwise the empty string
    would be stored as a "word" and punctuation-only tokens such as "--"
    would match it after their punctuation is stripped.
    """
    with open(file, 'r') as dictionary:
        # Iterate line by line rather than reading the whole file at once.
        for line in dictionary:
            word = line.strip()
            if word:
                ENGLISH_WORDS[word.upper()] = True
# Populate ENGLISH_WORDS from the system word list before checking any message.
load_language_file("/usr/share/dict/american-english")
Take your message and view it as a list of words
# msg is the text under test (assumed to be a str). split() with no separator
# breaks it on runs of whitespace, so punctuation stays attached to each word.
words_in_msg = msg.split()
Count letters/words
counts = {'words': 0, 'letters': 0}
for word in words_in_msg:
    # Dictionary keys are upper-case, so upper-case the token and trim any
    # surrounding punctuation before the lookup.
    token = word.upper().strip(string.punctuation)
    # A token made only of punctuation strips down to "", which is not a word.
    if token and token in ENGLISH_WORDS:
        counts['words'] += 1
# Only ASCII letters count; digits, whitespace and punctuation do not.
counts['letters'] = sum(1 for char in msg if char in string.ascii_letters)
Calculate word and letter percentage
# Multiply by a float before dividing so the result is a true percentage
# even where "/" is integer division, and report 0 for an empty message
# instead of raising ZeroDivisionError.
calc_word_percentage = 100.0 * counts['words'] / len(words_in_msg) if words_in_msg else 0.0
calc_letter_percentage = 100.0 * counts['letters'] / len(msg) if msg else 0.0
Is the message valid? Compare the percentages to thresholds. In this case more than 40% of the words must be dictionary words and more than 60% of the characters must be letters.
# Minimum share of dictionary words and of letters; both must be exceeded.
word_percentage = 40
letter_percentage = 60
isValidMsg = (
    calc_word_percentage > word_percentage
    and calc_letter_percentage > letter_percentage
)
Full Source with text_matches_language function
import string
ENGLISH_WORDS = {}


def load_language_file(file):
    """Add every word listed in a dictionary file to ENGLISH_WORDS.

    Args:
        file: Path to a plain-text word list containing one word per line.

    Words are stored upper-cased so that later lookups can be
    case-insensitive. Blank lines are skipped; otherwise the empty string
    would be stored as a "word" and punctuation-only tokens such as "--"
    would match it after their punctuation is stripped.
    """
    with open(file, 'r') as dictionary:
        # Iterate line by line rather than reading the whole file at once.
        for line in dictionary:
            word = line.strip()
            if word:
                ENGLISH_WORDS[word.upper()] = True
def text_matches_language(msg, word_percentage=40, letter_percentage=60, file="/usr/share/dict/american-english"):
    """Return True when msg looks like text in the dictionary's language.

    Args:
        msg: The text to test.
        word_percentage: Threshold, in percent, that the share of
            whitespace-separated words found in the dictionary must exceed.
        letter_percentage: Threshold, in percent, that the share of
            characters in msg that are ASCII letters must exceed.
        file: Word list to load when ENGLISH_WORDS is still empty. It is
            ignored once ENGLISH_WORDS already holds any entries.

    Returns:
        True if both shares are strictly greater than their thresholds,
        otherwise False. A message containing no words returns False.
    """
    if not ENGLISH_WORDS:
        load_language_file(file)

    words_in_msg = msg.split()
    if not words_in_msg:
        # An empty or whitespace-only message has nothing to measure and
        # would otherwise divide by zero below.
        return False

    word_count = 0
    for word in words_in_msg:
        # Dictionary keys are upper-case; trim surrounding punctuation too.
        token = word.upper().strip(string.punctuation)
        # A token made only of punctuation strips down to "", not a word.
        if token and token in ENGLISH_WORDS:
            word_count += 1
    letter_count = sum(1 for char in msg if char in string.ascii_letters)

    # Multiply by a float first so the percentages are exact even where "/"
    # is integer division.
    return (100.0 * word_count / len(words_in_msg) > word_percentage
            and 100.0 * letter_count / len(msg) > letter_percentage)