# Source code for src.algorithms.TextComparator
# importing standard libraries
import string
# importing external libraries
import nltk
# Runs at import time: asks NLTK to fetch the 'stopwords' corpus, which the
# stopwords.words() call below reads from.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Russian stop words; TextComparator skips tokens that match one of these.
stop_words = stopwords.words('russian')
# Punctuation characters (string.punctuation) that TextComparator strips
# from every token it keeps.
punctuation_marks = string.punctuation
# A class for comparing two versions of a text
class TextComparator:
    """Measure how much a text changed between two of its versions."""

    @staticmethod
    def _extract_words(text):
        """Return the lowercased words of *text*, minus stop words and punctuation.

        The text is split on whitespace. Tokens found in the module-level
        ``stop_words`` are skipped; every character found in the module-level
        ``punctuation_marks`` is removed from the tokens that remain.
        """
        words = []
        for part in text.lower().split():
            # The stop-word test is applied to the raw token, i.e. before
            # punctuation is stripped, so a stop word with punctuation
            # attached to it is not filtered out.
            if part in stop_words:
                continue
            # NOTE(review): a token made only of punctuation is kept as an
            # empty string and still counts as a word; left as-is so existing
            # percentages do not shift.
            words.append(''.join(ch for ch in part if ch not in punctuation_marks))
        return words

    @staticmethod
    def count_percent_of_transformed_words(input_text_before_changing, input_text_after_changing):
        """Return the percentage of words that differ between two texts.

        Both texts are reduced to word lists (lowercased, stop words skipped,
        punctuation removed). A word of the new text that is absent from the
        old one counts as added; a word of the old text that is absent from
        the new one counts as deleted. Every occurrence is counted.

        Args:
            input_text_before_changing: The original text.
            input_text_after_changing: The modified text.

        Returns:
            ``(added + deleted) / (len(old words) + len(new words)) * 100``
            rounded to two decimals, or ``0.0`` when neither text contains
            any word to compare.

        Raises:
            TypeError: If either argument is not a string.
        """
        if not isinstance(input_text_before_changing, str) \
                or not isinstance(input_text_after_changing, str):
            raise TypeError("Input must be a strings")

        words_before = TextComparator._extract_words(input_text_before_changing)
        words_after = TextComparator._extract_words(input_text_after_changing)

        total_words = len(words_before) + len(words_after)
        if total_words == 0:
            # Nothing to compare (both texts empty or stop words only):
            # report "no change" instead of dividing by zero.
            return 0.0

        # Sets are used for membership tests only; the counts below still
        # iterate over the full word lists, so duplicates are counted.
        unique_before = set(words_before)
        unique_after = set(words_after)

        # criterion 1: number of added words
        count_of_added_words = sum(1 for word in words_after if word not in unique_before)
        # criterion 2: number of deleted words
        count_of_deleted_words = sum(1 for word in words_before if word not in unique_after)

        return round((count_of_added_words + count_of_deleted_words) / total_words * 100, 2)