import os
# File extension of the submissions to compare. Change this single value to
# analyse another kind of file (e.g. 'txt' or 'ml').
extension = 'java'


def _read_submission(file_name):
    """Return the full text of ``file_name``, closing the file afterwards."""
    # The context manager releases the file handle as soon as the content has
    # been read, instead of leaving it open until garbage collection.
    with open(file_name) as submission:
        return submission.read()


# Every entry of the current working directory that carries the chosen
# extension is treated as one student submission.
student_files = [doc for doc in os.listdir() if doc.endswith(f'.{extension}')]
print("students_files:", student_files)

# Texts of the submissions, in the same order as ``student_files``.
student_notes = [_read_submission(file_name) for file_name in student_files]
students_files: ['Ivann_VYSLANKO.java', 'Ethan_GAUTHIER.java', 'Florian_EPAIN__Salma_BEN_AYAD.java', 'Astrid_PELISSOU.java', 'Clement_DESBROUSSES__Dorian_LAHOCHE_MA1-2.java', 'Adélaïde_MONTEMBAULT__Mealig_LE_GUEVEL_MA1-2.java', 'Mohamed__MOUHIMINE_MA1-2.java', 'Enzo_LEGUY__Albane_CHALLAMEL.java', 'Axel_ALLAIN.java', 'Enzo_SOLDI.java', 'Remi_CAZOULAT.java', 'Andrea-Karol-JAKUBOWSKI.java', 'Louis_LIEUTAUD.java', 'Lena_ARHUIS.java', 'EOuann_AUBRY__Mathias_SALDANHA.java', 'Nouhou_OUSSENI__Emmanuel_LE_PANNERER.java', 'Yoan_PETTORELLI.java', 'Thomas_DERRIEN.java', 'Ariane_NICOLAS.java', 'Yasmine_TELLACHE.java', 'Florian_EPAIN.java', 'Lea_AUBRY__Iska_LE_MENN.java', 'Naoufel_GIRARD_MA1-2.java', 'Ryan_BORCHANI__Ael_COIC.java', 'Tom_CHAUVEAU.java', 'Yann_BALLANGER__Abel_LOCOCGUEN_MA1-2.java', 'Theo_LE_GOC.java', 'Gabriel_STIERER__Romain_SINIC_MA1-2.java', 'Bouchra_BOUSSIF__Youssouf_DIAKITE.java', 'Divi_SINQUIN__Alexane_FAISANT.java', 'Amelie_BREJOT__Clemence_BOUVIER_MA1-2.java', 'Mathurin_GESNY__Jean_MARIN_MA1-2.java', 'Vincent_FARAD.java', 'Camille_GOURVELLEC_Flavie_BANNIER.java', 'Yvan_LEFEVRE.java', 'Antonin_FAGAT.java', 'Clemence_CAVEY__Waly_MOYSE.java', 'Ewen_HEINRY.java', 'Mathilde_LECUYER_Axelle_LE_GUENNEC.java', 'Julien_BOYER.java', 'Nicolas_LE_GUERN__Flavien_LEBRET.java', 'Jean_MARIN__Mathurin_GESNY_MA1-2.java', 'Clement_GESTIN__Enzo_SOLDI.java', 'Dimitri_BERGEAULT.java', 'Asmaa_NAZIH__Manon_CADET.java']
# Number of submissions that were loaded.
len(student_notes)
# Length, in characters, of the first submission's text.
len(student_notes[0])
14785
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def vectorize(Text):
    """Return the dense TF-IDF matrix of a collection of documents.

    Args:
        Text: iterable of strings, one string per document.

    Returns:
        The result of fitting a fresh, default-configured ``TfidfVectorizer``
        on ``Text``, converted to a dense array (one row per document).
    """
    return TfidfVectorizer().fit_transform(Text).toarray()


def similarity(doc1, doc2):
    """Return the cosine-similarity matrix of two document vectors.

    Args:
        doc1: vector of the first document.
        doc2: vector of the second document.

    Returns:
        The matrix computed by ``cosine_similarity([doc1, doc2])``. Entry
        ``[0][1]`` is the similarity between ``doc1`` and ``doc2``.
    """
    return cosine_similarity([doc1, doc2])


# One TF-IDF row per submission, in the same order as ``student_files``.
vectors = vectorize(student_notes)
# Pair every file name with its vector: [(file_name, vector), ...].
s_vectors = list(zip(student_files, vectors))
def check_plagiarism(s_vectors):
    """Compute the similarity score of every pair of submissions.

    Args:
        s_vectors: iterable of ``(file_name, text_vector)`` pairs.

    Returns:
        A set of ``(name_a, name_b, score)`` tuples, one per unordered pair
        of entries. The two names are in sorted order and ``score`` is
        ``similarity(vector_a, vector_b)[0][1]``.
    """
    # Function-scope import: keeps the block self-contained.
    from itertools import combinations

    plagiarism_results = set()
    # combinations() yields each unordered pair exactly once, so every score
    # is computed a single time. This avoids copying the list for each
    # student and looking entries up by tuple equality (fragile when the
    # tuples hold arrays), and no longer relies on the set to merge the
    # (a, b) and (b, a) results through exact float equality.
    for entry_a, entry_b in combinations(s_vectors, 2):
        student_a, text_vector_a = entry_a
        student_b, text_vector_b = entry_b
        sim_score = similarity(text_vector_a, text_vector_b)[0][1]
        # Store the names in sorted order so a pair has one canonical form.
        first_name, second_name = sorted((student_a, student_b))
        plagiarism_results.add((first_name, second_name, sim_score))
    return plagiarism_results
Now let's sort the pairs by increasing similarity score:
def sort_plagiarism(s_vectors):
    """Return all scored pairs ordered by increasing similarity.

    Args:
        s_vectors: ``(file_name, text_vector)`` pairs, forwarded unchanged
            to ``check_plagiarism``.

    Returns:
        A list of ``(name_a, name_b, score)`` tuples sorted by score, then by
        the second name, then by the first name.
    """
    def reversed_fields(triple):
        # Same ordering as comparing the triple read back to front.
        return (triple[2], triple[1], triple[0])

    return sorted(check_plagiarism(s_vectors), key=reversed_fields)
Then keep only the pairs whose similarity is at or above a threshold:
def filter_plagiarism(sorted_data, threshold=0.70):
    """Keep the entries whose score reaches ``threshold``.

    Args:
        sorted_data: iterable of sequences whose last item is the score.
        threshold: minimum score (inclusive) for an entry to be kept.

    Returns:
        A new list with the retained entries, in their original order.
    """
    kept = []
    for entry in sorted_data:
        if entry[-1] >= threshold:
            kept.append(entry)
    return kept
# Print one line per pair at or above the default threshold, lowest score
# first. Each file name is shown without its extension and cut to its first
# five characters.
suffix = f'.{extension}'
suspicious_pairs = filter_plagiarism(sort_plagiarism(s_vectors))
for n1, n2, score in suspicious_pairs:
    name1, name2 = (name.replace(suffix, '')[:5] for name in (n1, n2))
    print(f"Files {name1} and {name2} have similarity = {score:.2%}")
Files Adéla and Cleme have similarity = 70.02% Files Adéla and Enzo_ have similarity = 70.02% Files Camil and Tom_C have similarity = 70.04% Files Bouch and Nicol have similarity = 70.13% Files Bouch and Lea_A have similarity = 70.22% Files Louis and Thoma have similarity = 70.33% Files Thoma and Yvan_ have similarity = 70.33% Files Adéla and Bouch have similarity = 70.34% Files Ameli and Ewen_ have similarity = 70.46% Files Dimit and EOuan have similarity = 70.55% Files Julie and Lena_ have similarity = 70.65% Files Gabri and Ryan_ have similarity = 70.67% Files Camil and Gabri have similarity = 70.74% Files Thoma and Tom_C have similarity = 70.76% Files Nicol and Ryan_ have similarity = 71.04% Files Astri and Nicol have similarity = 71.05% Files Adéla and Dimit have similarity = 71.13% Files Astri and Divi_ have similarity = 71.19% Files Naouf and Yann_ have similarity = 71.28% Files Flori and Flori have similarity = 71.34% Files Andre and Astri have similarity = 71.39% Files Andre and Louis have similarity = 71.44% Files Andre and Yvan_ have similarity = 71.44% Files Camil and Dimit have similarity = 71.48% Files Dimit and Ryan_ have similarity = 71.69% Files Thoma and Yasmi have similarity = 71.72% Files Bouch and Camil have similarity = 71.74% Files Bouch and Julie have similarity = 71.75% Files Divi_ and Yasmi have similarity = 71.79% Files Astri and Louis have similarity = 71.82% Files Astri and Yvan_ have similarity = 71.82% Files Andre and Tom_C have similarity = 71.90% Files Astri and Tom_C have similarity = 71.91% Files Julie and Yann_ have similarity = 71.91% Files Adéla and Divi_ have similarity = 72.04% Files Dimit and Nicol have similarity = 72.05% Files Dimit and Lea_A have similarity = 72.05% Files Nicol and Yasmi have similarity = 72.11% Files Divi_ and Julie have similarity = 72.14% Files EOuan and Nicol have similarity = 72.23% Files Nicol and Yann_ have similarity = 72.28% Files Ameli and Naouf have similarity = 72.40% Files Camil and Julie 
have similarity = 72.49% Files Adéla and Naouf have similarity = 72.49% Files Ameli and Nouho have similarity = 72.75% Files Gabri and Yann_ have similarity = 72.78% Files Camil and Nicol have similarity = 72.89% Files Divi_ and Nicol have similarity = 72.89% Files Ameli and Gabri have similarity = 73.23% Files Camil and Divi_ have similarity = 73.30% Files Ameli and Bouch have similarity = 73.31% Files Lea_A and Ryan_ have similarity = 73.37% Files EOuan and Lea_A have similarity = 73.78% Files Adéla and Julie have similarity = 73.79% Files Lea_A and Yann_ have similarity = 73.83% Files Divi_ and Ryan_ have similarity = 74.01% Files Astri and Yasmi have similarity = 74.16% Files Adéla and Yann_ have similarity = 74.18% Files Ameli and Divi_ have similarity = 74.19% Files Bouch and Yann_ have similarity = 74.24% Files Divi_ and Lea_A have similarity = 74.27% Files Julie and Lea_A have similarity = 74.37% Files EOuan and Naouf have similarity = 74.37% Files Bouch and Ryan_ have similarity = 74.50% Files Camil and Yann_ have similarity = 74.60% Files Naouf and Ryan_ have similarity = 74.70% Files Adéla and EOuan have similarity = 74.82% Files Camil and Naouf have similarity = 74.83% Files Divi_ and Tom_C have similarity = 75.00% Files Lea_A and Tom_C have similarity = 75.08% Files Camil and EOuan have similarity = 75.23% Files Divi_ and Louis have similarity = 75.24% Files Divi_ and Yvan_ have similarity = 75.24% Files Lea_A and Louis have similarity = 75.38% Files Lea_A and Yvan_ have similarity = 75.38% Files Ryan_ and Yann_ have similarity = 75.43% Files Ameli and Louis have similarity = 75.45% Files Ameli and Yvan_ have similarity = 75.45% Files Adéla and Camil have similarity = 75.45% Files Ameli and Julie have similarity = 75.51% Files Ameli and EOuan have similarity = 75.71% Files Adéla and Lea_A have similarity = 75.76% Files Andre and Yasmi have similarity = 75.84% Files Nouho and Ryan_ have similarity = 76.18% Files Louis and Yasmi have similarity = 76.23% 
Files Yasmi and Yvan_ have similarity = 76.23% Files Lea_A and Nicol have similarity = 76.26% Files Dimit and Yann_ have similarity = 76.30% Files Tom_C and Yasmi have similarity = 76.31% Files Camil and Lea_A have similarity = 76.52% Files Louis and Nicol have similarity = 76.64% Files Nicol and Yvan_ have similarity = 76.64% Files Ameli and Tom_C have similarity = 76.69% Files Ameli and Nicol have similarity = 76.84% Files Ameli and Camil have similarity = 76.99% Files EOuan and Ryan_ have similarity = 77.02% Files Nicol and Tom_C have similarity = 77.25% Files Julie and Ryan_ have similarity = 77.31% Files Adéla and Ryan_ have similarity = 77.62% Files EOuan and Yann_ have similarity = 77.84% Files Camil and Ryan_ have similarity = 78.16% Files Ameli and Yann_ have similarity = 78.26% Files Ameli and Dimit have similarity = 78.45% Files Adéla and Ameli have similarity = 78.72% Files Bouch and EOuan have similarity = 78.80% Files EOuan and Julie have similarity = 79.47% Files Ameli and Lea_A have similarity = 80.79% Files Ameli and Ryan_ have similarity = 80.87% Files Jean_ and Mathu have similarity = 97.16% Files Ewen_ and Ivann have similarity = 98.14% Files Louis and Tom_C have similarity = 98.99% Files Tom_C and Yvan_ have similarity = 98.99% Files Axel_ and Vince have similarity = 99.08% Files Cleme and Enzo_ have similarity = 100.00% Files Arian and Yoan_ have similarity = 100.00% Files Louis and Yvan_ have similarity = 100.00% Files Remi_ and Theo_ have similarity = 100.00%
It's already pretty good!