## List of Functions and Libraries Written and Used During Project
import numpy as np
from ripser import ripser
from persim import plot_diagrams
from matplotlib import pyplot as plt
import persim
import sys
import visuals as BetterV

# Reduced IPA character set; every cosine-similarity matrix is indexed by it.
FinalLetters = ["a", "ɑ", "ɛ", "e", "ǝ", "ō", "i", "I", "ū", "ʊ", "b", "d",
                "k", "ʃ", "ð", "f", "g", "h", "j", "l", "m", "n", "ɲ", "ŋ",
                "p", "r", "t", "s", "z", "ʧ", "v", "w", "x", "ʒ"]


def Etruscanvocab():
    """Read the Etruscan word list, drop known-bad entries, and transliterate
    the remaining text into an approximate IPA letter sequence.

    Returns:
        list[str]: flat list of IPA letter strings (spaces preserved).
    """
    language_list = ""
    with open("Shortened_etruscan_words", 'r') as file:
        for word in file:
            language_list += word
    words = language_list.splitlines()
    # Indices of corrupt/unusable entries, identified by hand.
    badwords = [1328, 838, 400, 419, 984, 1098, 1089, 293, 223, 323, 206,
                1617, 189, 240, 519]
    goodwordslist = [w for i, w in enumerate(words) if i not in badwords]
    rawletters = list(str(goodwordslist))
    IPAletters = []
    for i in range(len(rawletters)):
        # Guard the lookahead/lookbehind; the original could IndexError on the
        # final character and wrapped around to the last char at i == 0.
        nxt = rawletters[i + 1] if i + 1 < len(rawletters) else ""
        prv = rawletters[i - 1] if i > 0 else ""
        if rawletters[i] == "t":
            if nxt == "h":
                IPAletters.append("ð")  # 'th' digraph
            else:
                IPAletters.append("t")
        if rawletters[i] == "p":
            if nxt == "h":
                IPAletters.append("f")  # 'ph' digraph
            else:
                IPAletters.append("p")
        if rawletters[i] == "z":
            IPAletters.append("ts")
        if rawletters[i] == "g":
            IPAletters.append("k")
        if rawletters[i] == "c":
            IPAletters.append("k")
        if rawletters[i] == "q":
            IPAletters.append("k")
        if rawletters[i] == "y":
            IPAletters.append("u")
        if rawletters[i] == "x":
            IPAletters.append("z")
        if rawletters[i] == "S":
            IPAletters.append("ʃ")
        if rawletters[i] == "a":
            IPAletters.append("ɑ")
        if rawletters[i] == "h":
            # 'h' is only a letter of its own when not part of th/ph.
            if prv != "t" and prv != "p":
                IPAletters.append("h")
        normalletters = ["s", "e", "r", "i", "m", "u", "v", "n", "k", "f",
                         "o", "b", "d", " ", "l"]
        if rawletters[i] in normalletters:
            IPAletters.append(rawletters[i])
    return IPAletters


def Etruscanletters(IPAletters):
    """Return the unique IPA letters, skipping list-formatting characters
    ([, ], comma, quote, space) introduced by str(list)."""
    LetterList = []
    for letter in IPAletters:
        if letter in (',', ' ', ']', '[', "'"):
            continue
        if letter not in LetterList:
            LetterList.append(letter)
    return LetterList


def GetWordsfinalest(date, language, model):
    """Parse a raw LLM translation dump and collect the IPA characters used.

    NOTE(review): relies on module-level ``badcharacters``,
    ``morebadcharacters`` and ``GrandLetterList`` defined elsewhere in the
    project -- confirm they are in scope before calling.

    Returns:
        tuple: (number of distinct letters, flat letter list, GrandLetterList)
    """
    words = []
    with open(date + "_01_2025_raw_" + language + "_" + model + ".dat", "r") as file:
        Lines = file.readlines()
    guideline = "English : " + language + " : /IPA/"
    for l in Lines:
        if guideline in l:
            l = l.replace(guideline, "")
        if l != '':
            try:
                junk, morejunk, target1 = l.split(': ')
            except ValueError:
                # Line is not in the "english : language : ipa" shape; skip.
                continue
            for badc in badcharacters:
                if badc in target1:
                    target1 = target1.replace(badc, "")
            words.append(target1)
    wordsstring = str(words) + " "
    for morebadc in morebadcharacters:
        wordsstring = wordsstring.replace(morebadc, " ")
    good_list = list(wordsstring)
    LetterList = []
    for letter in good_list:
        if letter != " ":
            if letter not in LetterList:
                LetterList.append(letter)
            if letter not in GrandLetterList:
                GrandLetterList.append(letter)
    return len(LetterList), good_list, GrandLetterList


def MatrixMaker(good_list):
    """Build the |FinalLetters| x |FinalLetters| cosine-similarity matrix.

    The similarity of two letters is the number of shared
    (left-neighbour, right-neighbour) contexts, each counted at most
    min(multiplicity) times, normalised by sqrt(n1) * sqrt(n2).

    Returns:
        tuple: (good_list unchanged, numpy object array of similarities)
    """
    counter0 = 0
    arr0 = np.zeros((len(FinalLetters), len(FinalLetters)), dtype=object)
    for letter1 in FinalLetters:
        CosineSimilarities = []
        counter1 = 0
        for letter2 in FinalLetters:
            Contexts1 = []
            Contexts2 = []
            # Collect (previous, next) neighbour pairs for each occurrence.
            # Stop before the last element: the original indexed i+1 and
            # could IndexError there.
            for i in range(len(good_list) - 1):
                if good_list[i] == letter1:
                    Contexts1.append([good_list[i - 1], good_list[i + 1]])
                if good_list[i] == letter2:
                    Contexts2.append([good_list[i - 1], good_list[i + 1]])
            counter = 0
            j_contexts = []
            i_contexts = []
            for i in range(len(Contexts1)):
                for j in range(len(Contexts2)):
                    if Contexts1[i] == Contexts2[j]:
                        # Only count a shared context up to its multiplicity
                        # on BOTH sides, so duplicates are not over-counted.
                        if (j_contexts.count(Contexts2[j]) < Contexts2.count(Contexts2[j])
                                and i_contexts.count(Contexts1[i]) < Contexts1.count(Contexts1[i])):
                            j_contexts.append(Contexts2[j])
                            i_contexts.append(Contexts1[i])
                            counter += 1
            if counter > len(Contexts2) or counter > len(Contexts1):
                # Sanity check: the numerator must never exceed either count.
                print(letter1, letter2, counter, len(Contexts1), len(Contexts2))
            if len(Contexts1) != 0 and len(Contexts2) != 0:
                CosineSimilarities.append(
                    counter / ((len(Contexts1) ** 0.5) * (len(Contexts2) ** 0.5)))
            else:
                CosineSimilarities.append(0)
        for element in CosineSimilarities:
            arr0[counter0][counter1] = element
            counter1 += 1
        counter0 += 1
    return good_list, arr0


def LetterSorter(good_list2, listdict):
    """Replace each letter with its reduced-class key found in ``listdict``,
    separating entries with spaces."""
    betterlist = []
    for letter in good_list2:
        for key in listdict:
            if letter in listdict[key]:
                betterlist.append(key)
                betterlist.append(" ")
    return betterlist


def ColorPlotter(matrix, FinalLetters, language, model):
    """Save a heatmap of the cosine-similarity matrix as a PNG."""
    matrix = matrix.astype(np.float64)
    plt.figure()
    plt.imshow(matrix, cmap='viridis', interpolation='nearest')
    plt.xticks(range(len(FinalLetters)), FinalLetters, size='small')
    plt.yticks(range(len(FinalLetters)), FinalLetters, size='small')
    plt.title(language + " Cosine Similarities")
    plt.colorbar()
    plt.savefig(language + '_Colorplot_' + model + '.png')


def Frobenius(matrix, specialmatrix):
    """Return the squared Frobenius distance between two matrices.

    Fix: the original returned ``(language, frobenius)`` where ``language``
    was an undefined name (guaranteed NameError); it now returns only the
    accumulated squared distance. Note no square root is taken.
    """
    frobenius = 0
    for row_index, row in enumerate(matrix):
        for i in range(len(row)):
            frobenius += (specialmatrix[row_index][i] - row[i]) ** 2
    return frobenius


def Euclid(listdict, letter0):
    """Compute pairwise Euclidean distances between the cosine rows in
    ``listdict`` and return the letters within distance 1 of ``letter0``.

    NOTE(review): relies on a module-level ``LetterList`` defined elsewhere
    in the project -- confirm it is in scope before calling.
    """
    euclids = {}
    for letter1 in listdict:
        for letter2 in listdict:
            euclid = 0
            cosines1 = list(listdict[letter1])
            cosines2 = list(listdict[letter2])
            for i in range(len(cosines1)):
                euclid += (float(cosines1[i]) - float(cosines2[i])) ** 2
            euclids[letter1, letter2] = euclid ** 0.5
    fneighbors = []
    for letter2 in LetterList:
        if euclids[letter0, letter2] < 1:
            fneighbors.append(letter2)
    return fneighbors


def diagram_sizes(dgms):
    """Format the point count of each persistence diagram as Betti-number
    labels for plot titles."""
    return ", ".join(f"$\u03B2_{i}$={len(d)}" for i, d in enumerate(dgms))


def TDAonly(language, arr0, model):
    """Run ripser on the similarity matrix and save the H0/H1 diagrams."""
    dgm_clean = ripser(arr0)['dgms']
    np.save('tda_' + language + '_' + model + '_cluster_final.npy', dgm_clean[0])
    np.save('tda_' + language + '_' + model + '_hole_final.npy', dgm_clean[1])
    return dgm_clean


def Plotterfinalest(dgm_clean, language, model):
    """Plot and save the persistence diagram for the reduced character set."""
    plt.style.use('./Posters.mplstyle')
    plt.figure()
    persim.plot_diagrams(
        dgm_clean,
        size=70
    )
    plt.legend(fontsize=20)
    plt.title(f"{language}\n{diagram_sizes(dgm_clean)}", fontsize=20)
    plt.savefig(language + "_" + model + '_Persistence_final.png')


def Plotterunreduced(dgm_clean, language, model):
    """Plot the previously saved unreduced H0/H1 diagrams for a language.

    Note: ``dgm_clean`` is unused; the diagrams are loaded from disk.
    """
    plt.style.use('./Posters.mplstyle')
    plt.figure()
    clusters1 = np.load('03_17_unreduced/tda_' + language + '_' + model + '_cluster_unreduced.npy')
    holes1 = np.load('03_17_unreduced/tda_' + language + '_' + model + '_hole_unreduced.npy')
    dgm = [clusters1, holes1]
    persim.plot_diagrams(
        dgm,
        size=70
    )
    plt.legend(fontsize=20)
    plt.title(f"{language}\n{diagram_sizes(dgm)}", fontsize=20)


def Bottleneckunreduced(language1, language2, model1, model2):
    """Plot H1 and H0 bottleneck matchings between two languages' unreduced
    diagrams and return (H1 distance, H0 distance)."""
    plt.style.use('./Posters.mplstyle')
    clusters1 = np.load('03_17_unreduced/tda_' + language1 + '_' + model1 + '_cluster_unreduced.npy')
    holes1 = np.load('03_17_unreduced/tda_' + language1 + '_' + model1 + '_hole_unreduced.npy')
    clusters2 = np.load('Fixed/tda_' + language2 + '_' + model2 + '_cluster_unreduced.npy')
    holes2 = np.load('Fixed/tda_' + language2 + '_' + model2 + '_hole_unreduced.npy')
    distance_bottleneck, matching = persim.bottleneck(holes1, holes2, matching=True)
    plt.figure()
    BetterV.bottleneck_matching(
        holes1, holes2, matching, labels=[language1, language2]
    )
    plt.legend(fontsize=20)
    plt.title(f"$H_1$ Bottleneck distance = {distance_bottleneck:0.4f}", fontsize=20)
    plt.show()
    plt.style.use('./Posters.mplstyle')
    distance_bottleneck2, matching = persim.bottleneck(clusters1, clusters2, matching=True)
    plt.figure()
    BetterV.bottleneck_matching(
        clusters1, clusters2, matching, labels=[language1, language2]
    )
    plt.legend(fontsize=20)
    plt.title(f"$H_0$ Bottleneck distance = {distance_bottleneck2:0.4f}", fontsize=20)
    plt.show()
    return distance_bottleneck, distance_bottleneck2


def Bottleneckreduced(language1, language2, model1, model2):
    """Plot H1 and H0 bottleneck matchings between two languages' reduced
    diagrams and return (H1 distance, H0 distance)."""
    plt.style.use('./Posters.mplstyle')
    clusters1 = np.load('tda_' + language1 + '_' + model1 + '_cluster_final.npy')
    holes1 = np.load('tda_' + language1 + '_' + model1 + '_hole_final.npy')
    clusters2 = np.load('Fixed/tda_' + language2 + '_' + model2 + '_cluster_final.npy')
    holes2 = np.load('Fixed/tda_' + language2 + '_' + model2 + '_hole_final.npy')
    distance_bottleneck, matching = persim.bottleneck(holes1, holes2, matching=True)
    plt.figure()
    BetterV.bottleneck_matching(
        holes1, holes2, matching, labels=[language1, language2]
    )
    plt.legend(fontsize=20)
    plt.title(f"$H_1$ Bottleneck distance = {distance_bottleneck:0.4f}", fontsize=20)
    plt.show()
    plt.style.use('./Posters.mplstyle')
    distance_bottleneck2, matching = persim.bottleneck(clusters1, clusters2, matching=True)
    plt.figure()
    BetterV.bottleneck_matching(
        clusters1, clusters2, matching, labels=[language1, language2]
    )
    plt.legend(fontsize=20)
    plt.title(f"$H_0$ Bottleneck distance = {distance_bottleneck2:0.4f}", fontsize=20)
    plt.show()
    return distance_bottleneck, distance_bottleneck2


def SlicedWasserstein(language, model):
    """Return (cluster, hole) sliced-Wasserstein distances between a
    language's diagrams and the Etruscan reference diagrams."""
    clusters = np.load('GPTAnalysis/tda_' + language + '_' + model + '_cluster_final.npy')
    holes = np.load('GPTAnalysis/tda_' + language + '_' + model + '_hole_final.npy')
    Eclusters = np.load('Fixed/tda_Etruscan___cluster_final.npy')
    Eholes = np.load('Fixed/tda_Etruscan___hole_final.npy')
    # Drop the last H0 point of each diagram (presumably the infinite-death
    # point, which sliced_wasserstein cannot handle -- TODO confirm).
    clusters1 = np.delete(clusters, [-1], axis=0)
    Eclusters1 = np.delete(Eclusters, [-1], axis=0)
    clustercomp = persim.sliced_wasserstein(np.array(clusters1), np.array(Eclusters1))
    holecomp = persim.sliced_wasserstein(holes, Eholes)
    return clustercomp, holecomp


## Example Code Used to Run LLM from Ollama Server
modelname = 'command-r:latest'
modeltag = 'command'
language = "Dutch"
assistant_message = "You are an assistant, do what the user tells you to do properly."
chat_message = "I will give you a number of words to translate into " + str(language)
chat_message = chat_message + ". Provide the International Phonetic Alphabet. Do not provide any notes or commentary. "
chat_message = chat_message + "Use the format: English : " + str(language) + " : /IPA/"

from llama_index.llms.ollama import Ollama
from llama_index.core.llms import ChatMessage

llm = Ollama(model=modelname)
languagetag = language.replace(" ", "")
messages = [
    ChatMessage(role="assistant", content=assistant_message),
    ChatMessage(role="user", content=chat_message),
]
resp1 = llm.chat(messages)
print(resp1)


def BetterWordfinder(text):
    """Send one batch of words to the Ollama model and return its reply text."""
    resp = llm.chat([ChatMessage(role="user", content=f"{chat_message} {text}")])
    content = None
    # NOTE(review): iterates the response as (key, value) pairs looking for
    # the 'message' field -- confirm against the llama_index ChatResponse API.
    for item in resp:
        if isinstance(item, tuple) and item[0] == 'message':
            content = item[1].content
            break
    summary = content
    print(summary)
    return summary


text = ''
i = 0
BatchSize = 5
with open('Shortened_word_list', 'r') as file:
    defs = file.readlines()
summaries = ""
counter = 0
for definition in defs:
    if i < BatchSize:
        text = text + definition
        i = i + 1
    if i == BatchSize:
        # Translate each full batch, then reset the accumulator.
        print(text)
        summary = BetterWordfinder(text)
        summaries += summary
        counter += 1
        text = ''
        i = 0
with open('14_01_2025_raw_' + str(languagetag) + '_' + str(modeltag) + '.dat', 'w') as file:
    file.write(summaries)


## Example Code used to Run ChatGPT4o from OpenAI Server
from openai import OpenAI

modelname = 'gpt-4o'
modeltag = 'gpt4o'
language = "Sanskrit"
chat_message = "I will give you a number of words to translate into " + str(language)
chat_message = chat_message + ". Provide the International Phonetic Alphabet. Do not provide any notes or commentary. "
chat_message = chat_message + "Use the format: English : " + str(language) + " : /IPA/"
languagetag = language.replace(" ", "")
# Fix: the original never constructed the OpenAI client it passed around
# (NameError on ``client``); reads OPENAI_API_KEY from the environment.
client = OpenAI()


def BetterWordfinder(modelname, chat_message, text, client):
    """Send one batch of words to the OpenAI chat API and return the reply."""
    resp = client.chat.completions.create(
        model=modelname,
        messages=[{"role": "user", "content": chat_message + text}],
        temperature=0,
    )
    print(resp.choices[0].message.content)
    return resp.choices[0].message.content


text = ''
i = 0
BatchSize = 5
with open('Shortened_word_list', 'r') as file:
    defs = file.readlines()
summaries = ""
counter = 0
for definition in defs:
    if i < BatchSize:
        text = text + definition
        i = i + 1
    if i == BatchSize:
        print(text)
        summary = BetterWordfinder(modelname, chat_message, text, client)
        summaries += "\n" + summary
        counter += 1
        text = ''
        i = 0
with open('18_03_2025_raw_' + str(languagetag) + '_' + str(modeltag) + '.dat', 'w') as file:
    file.write(summaries)


## Example Script Creating Cosine Matrices for Mistral-Large Translations,
## Plotting them as Colorplots, and Performing TDA on Data using Reduced and
## Unreduced IPA Character Set
Largelanguages = ["Breton", "Latin", "HomericGreek", "KoineGreek", "Hittite",
                  "ModernGreek", "Sanskrit"]
# NOTE(review): this example expects (language, good_list) pairs, but
# Largelanguages above holds bare names; ``listdict`` and the
# Plotterunreducedlarge/Plotterfinalestlarge helpers are defined elsewhere in
# the project -- TODO supply them before running.
for language, good_list in Largelanguages:
    unwords, undata = MatrixMaker(good_list)
    untda = TDAonly(language, undata, "mistrallarge")
    unpersist = Plotterunreducedlarge(undata, language, "mistrallarge")
    betterlist = LetterSorter(good_list, listdict)
    words, data = MatrixMaker(betterlist)
    coloplot2 = ColorPlotter(data, FinalLetters, language, "mistrallarge")
    tda = TDAonly(language, data, "mistrallarge")
    persist = Plotterfinalestlarge(data, language, "mistrallarge")


## Example Script Finding the Bottleneck Distance Between Etruscan and Every
## Language Translation from Each LLM using Reduced and Unreduced IPA
## Character Set
# NOTE(review): ``Small``, ``Command``, ``Nemo``, ``Hermes``, ``Large``,
# ``GPT``, ``languages`` and the *GPT bottleneck helpers are built elsewhere
# in the project -- confirm they are in scope before running this section.
languagess = ["Breton", "Latin", "HomericGreek", "KoineGreek", "Hittite",
              "ModernGreek", "Sanskrit"]
languagesc = ["Breton", "Latin", "HomericGreek", "KoineGreek", "Hittite",
              "Icelandic", "ModernGreek", "Sanskrit"]
languagesn = ["Breton", "Latin", "HomericGreek", "KoineGreek", "Hittite",
              "Icelandic", "ModernGreek", "Sanskrit"]
languagesh = ["Breton", "Latin"]
languagesl = ["Breton", "Latin"]
languagesl2 = ["HomericGreek", "KoineGreek"]
languages3 = ["Hittite", "Icelandic", "ModernGreek", "Sanskrit"]
gpt = ["Breton", "Latin", "HomericGreek", "KoineGreek", "Hittite",
       "Icelandic", "ModernGreek", "Sanskrit"]
# One list of distances per model; entries are 0 where a model has no data.
H0unreduced = {"gpt4o": [], "mistral-large": [], "mistral-small": [],
               "nous-hermes": [], "command": [], "mistral-nemo": []}
H1unreduced = {"gpt4o": [], "mistral-large": [], "mistral-small": [],
               "nous-hermes": [], "command": [], "mistral-nemo": []}
H0 = {"gpt4o": [], "mistral-large": [], "mistral-small": [],
      "nous-hermes": [], "command": [], "mistral-nemo": []}
H1 = {"gpt4o": [], "mistral-large": [], "mistral-small": [],
      "nous-hermes": [], "command": [], "mistral-nemo": []}

unsmallH0 = []
unsmallH1 = []
smallH0 = []
smallH1 = []
small = []
Smallfrobs = []
for language, betterlist in Small:
    undistance1, undistance0 = Bottleneckunreduced(language, "Etruscan", "mistralsmall", "_")
    unsmallH1.append(undistance1)
    unsmallH0.append(undistance0)
    distance1, distance0 = Bottleneckreduced(language, "Etruscan", "mistralsmall", "_")
    smallH1.append(distance1)
    smallH0.append(distance0)
for language in languages:
    if language in languagess:
        H0unreduced["mistral-small"].append(unsmallH0[languagess.index(language)])
        H1unreduced["mistral-small"].append(unsmallH1[languagess.index(language)])
        H0["mistral-small"].append(smallH0[languagess.index(language)])
        H1["mistral-small"].append(smallH1[languagess.index(language)])
    else:
        H0unreduced["mistral-small"].append(0)
        H1unreduced["mistral-small"].append(0)
        H0["mistral-small"].append(0)
        H1["mistral-small"].append(0)

uncommandH0 = []
uncommandH1 = []
commandH0 = []
commandH1 = []
command = []
Commandfrobs = []
for language, betterlist in Command:
    undistance1, undistance0 = Bottleneckunreduced(language, "Etruscan", "command", "_")
    uncommandH1.append(undistance1)
    uncommandH0.append(undistance0)
    distance1, distance0 = Bottleneckreduced(language, "Etruscan", "command", "_")
    commandH1.append(distance1)
    commandH0.append(distance0)
for language in languages:
    if language in languagesc:
        H0unreduced["command"].append(uncommandH0[languagesc.index(language)])
        H1unreduced["command"].append(uncommandH1[languagesc.index(language)])
        H0["command"].append(commandH0[languagesc.index(language)])
        H1["command"].append(commandH1[languagesc.index(language)])
    else:
        H0unreduced["command"].append(0)
        H1unreduced["command"].append(0)
        H0["command"].append(0)
        H1["command"].append(0)

unnemoH0 = []
unnemoH1 = []
nemoH0 = []
nemoH1 = []
nemo = []
Nemofrobs = []
for language, betterlist in Nemo:
    # Fix: the original appended the UNreduced distances to nemoH0/nemoH1 and
    # the reduced ones to unnemoH0/unnemoH1 (swapped relative to every other
    # model section), so the H0unreduced/H1unreduced tables received reduced
    # distances for mistral-nemo.
    undistance1, undistance0 = Bottleneckunreduced(language, "Etruscan", "mistralnemo", "_")
    unnemoH0.append(undistance0)
    unnemoH1.append(undistance1)
    distance1, distance0 = Bottleneckreduced(language, "Etruscan", "mistralnemo", "_")
    nemoH1.append(distance1)
    nemoH0.append(distance0)
for language in languages:
    if language in languagesn:
        H0unreduced["mistral-nemo"].append(unnemoH0[languagesn.index(language)])
        H1unreduced["mistral-nemo"].append(unnemoH1[languagesn.index(language)])
        H0["mistral-nemo"].append(nemoH0[languagesn.index(language)])
        H1["mistral-nemo"].append(nemoH1[languagesn.index(language)])
    else:
        H0unreduced["mistral-nemo"].append(0)
        H1unreduced["mistral-nemo"].append(0)
        H0["mistral-nemo"].append(0)
        H1["mistral-nemo"].append(0)

unhermesH0 = []
unhermesH1 = []
hermesH0 = []
hermesH1 = []
hermes = []
Hermesfrobs = []
for language, betterlist in Hermes:
    undistance1, undistance0 = Bottleneckunreduced(language, "Etruscan", "noushermes", "_")
    unhermesH0.append(undistance0)
    unhermesH1.append(undistance1)
    distance1, distance0 = Bottleneckreduced(language, "Etruscan", "noushermes", "_")
    hermesH1.append(distance1)
    hermesH0.append(distance0)
for language in languages:
    if language in languagesh:
        H0unreduced["nous-hermes"].append(unhermesH0[languagesh.index(language)])
        H1unreduced["nous-hermes"].append(unhermesH1[languagesh.index(language)])
        H0["nous-hermes"].append(hermesH0[languagesh.index(language)])
        H1["nous-hermes"].append(hermesH1[languagesh.index(language)])
    else:
        H0unreduced["nous-hermes"].append(0)
        H1unreduced["nous-hermes"].append(0)
        H0["nous-hermes"].append(0)
        H1["nous-hermes"].append(0)

largeall = ["Breton", "Latin", "HomericGreek", "KoineGreek", "Hittite",
            "Icelandic", "ModernGreek", "Sanskrit"]
unlargeH0 = []
unlargeH1 = []
largeH0 = []
largeH1 = []
large = []
Largefrobs = []
for language, betterlist in Large:
    undistance1, undistance0 = Bottleneckunreduced(language, "Etruscan", "mistrallarge", "_")
    unlargeH0.append(undistance0)
    unlargeH1.append(undistance1)
    distance1, distance0 = Bottleneckreduced(language, "Etruscan", "mistrallarge", "_")
    largeH1.append(distance1)
    largeH0.append(distance0)
for language in languages:
    if language in largeall:
        H0unreduced["mistral-large"].append(unlargeH0[largeall.index(language)])
        H1unreduced["mistral-large"].append(unlargeH1[largeall.index(language)])
        H0["mistral-large"].append(largeH0[largeall.index(language)])
        H1["mistral-large"].append(largeH1[largeall.index(language)])
    else:
        H0unreduced["mistral-large"].append(0)
        H1unreduced["mistral-large"].append(0)
        H0["mistral-large"].append(0)
        H1["mistral-large"].append(0)

gpt = ["Breton", "Latin", "HomericGreek", "KoineGreek", "Hittite",
       "Icelandic", "ModernGreek", "Sanskrit"]
ungptH0 = []
ungptH1 = []
gptH0 = []
gptH1 = []
Gptfrobs = []
for language, betterlist in GPT:
    undistance1, undistance0 = BottleneckunreducedGPT(language, "Etruscan", "gpt4o", "_")
    ungptH0.append(undistance0)
    ungptH1.append(undistance1)
    distance1, distance0 = BottleneckGPT(language, "Etruscan", "gpt4o", "_")
    gptH1.append(distance1)
    gptH0.append(distance0)
for language in languages:
    if language in gpt:
        H0unreduced["gpt4o"].append(ungptH0[gpt.index(language)])
        H1unreduced["gpt4o"].append(ungptH1[gpt.index(language)])
        H0["gpt4o"].append(gptH0[gpt.index(language)])
        H1["gpt4o"].append(gptH1[gpt.index(language)])
    else:
        H0unreduced["gpt4o"].append(0)
        H1unreduced["gpt4o"].append(0)
        H0["gpt4o"].append(0)
        H1["gpt4o"].append(0)

# Grouped bar charts: one group per language, one bar per model.
Plotlanguages = ["Breton", "Latin", "Homeric\nGreek", "Koine\nGreek",
                 "Hittite", "Icelandic", "Modern\nGreek", "Sanskrit"]

x = np.arange(len(Plotlanguages))  # the label locations
width = 0.1                        # the width of the bars
multiplier = 0
fig, ax = plt.subplots(layout='constrained')
for attribute, measurement in H0unreduced.items():
    offset = width * multiplier
    rects = ax.bar(x + offset, measurement, width, label=attribute)
    multiplier += 1
ax.set_ylabel('H$_0$')
ax.set_title('Bottleneck Distance between Clusters Unreduced')
ax.set_xticks(x + width, Plotlanguages)
ax.legend(loc='upper left', ncols=2)
plt.tight_layout()
plt.savefig('03_17_unreduced/H0_unreduced.pdf')

x = np.arange(len(Plotlanguages))  # the label locations
width = 0.1                        # the width of the bars
multiplier = 0
fig, ax = plt.subplots(layout='constrained')
for attribute, measurement in H1unreduced.items():
    offset = width * multiplier
    rects = ax.bar(x + offset, measurement, width, label=attribute)
    multiplier += 1
ax.set_ylabel('H$_1$')
ax.set_title('Bottleneck Distance between Holes Unreduced')
ax.set_xticks(x + width, Plotlanguages)
ax.legend(loc='upper left', ncols=2)
plt.tight_layout()
plt.savefig('03_17_unreduced/H1_unreduced.pdf')

x = np.arange(len(Plotlanguages))  # the label locations
width = 0.1                        # the width of the bars
multiplier = 0
fig, ax = plt.subplots(layout='constrained')
for attribute, measurement in H0.items():
    offset = width * multiplier
    rects = ax.bar(x + offset, measurement, width, label=attribute)
    multiplier += 1
ax.set_ylabel('H$_0$')
ax.set_title('Bottleneck Distance between Clusters')
ax.set_xticks(x + width, Plotlanguages)
ax.legend(loc='upper left', ncols=2)
ax.set_ylim(0, 0.7)
plt.tight_layout()
plt.savefig('03_16_runs/H0.pdf')

x = np.arange(len(Plotlanguages))  # the label locations
width = 0.1                        # the width of the bars
multiplier = 0
fig, ax = plt.subplots(layout='constrained')
for attribute, measurement in H1.items():
    offset = width * multiplier
    rects = ax.bar(x + offset, measurement, width, label=attribute)
    multiplier += 1
ax.set_ylabel('H$_1$')
ax.set_title('Bottleneck Distance between Holes')
ax.set_xticks(x + width, Plotlanguages)
ax.legend(loc='upper left', ncols=2)
ax.set_ylim(0, 0.2)
plt.tight_layout()
plt.savefig('03_16_runs/H1.pdf')