#%%
# Gensim playground (topic_modeling/gensim_playground.py).
#
# Walks a tiny corpus through gensim's core concepts -- Document, Corpus,
# Vector, Model -- in this order:
#   preprocess -> dictionary -> bag-of-words -> TF-IDF -> LSI
#
# Setup: pip install -U gensim

import logging
from pprint import pprint

from gensim import corpora, models
from gensim.parsing.preprocessing import preprocess_documents, preprocess_string

# Some gensim calls, such as LsiModel.print_topics(), write to the logging
# stream, so INFO-level logging is switched on before anything is run.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Three toy documents that make up the whole corpus.
TEXTS = [
    'this is a document about measuring executive functions using cognitive tests.',
    'we measured executive functions skills using stop-signal task.',
    'digit span can be used to measure intelligence and executive functions.',
]

# Steps to be taken:
#   - load corpus
#   - preprocessing (tokenize, lower-case documents, remove infrequent words,
#     remove stop words, ...)

# One preprocessed token list per raw document.
CORPUS = preprocess_documents(TEXTS)
# pprint(CORPUS)

# Dictionary built from the tokenized documents (see token2id below).
dictionary = corpora.Dictionary(CORPUS)
# pprint(dictionary.token2id)

# Bag-of-words for a new document (kept for interactive experiments):
# new_doc = "executive function task/test"
# new_vec = dictionary.doc2bow(preprocess_string(new_doc))
# pprint(new_vec)  # (id, count)

# Bag-of-words representation of every document in the corpus.
BOW_CORPUS = list(map(dictionary.doc2bow, CORPUS))
# pprint(BOW_CORPUS)

# Train the TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(BOW_CORPUS)

# Show which container type preprocess_documents() handed back.
pprint(type(CORPUS))

# Transform a query string to bag-of-words and weight it with TF-IDF:
# query = 'digit_span'
# query_bow = dictionary.doc2bow(preprocess_string(query))
# pprint(tfidf[query_bow])

# Similarity matrix and querying it.
# NOTE(review): num_features=13 is hard-coded; presumably it is the dictionary
# size for TEXTS -- confirm, or use len(dictionary), before re-enabling.
# from gensim.similarities import SparseMatrixSimilarity
# index = SparseMatrixSimilarity(tfidf[BOW_CORPUS], num_features=13)
# sims = index[tfidf[query_bow]]
# sims
# print(f'Similarities (query: "{query}"):\n-------------')
# print('doc_index','score', sep='\t')
# for doc_index, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
#     print(doc_index, score, sep='\t')

## LSI
# TF-IDF-weighted view of the corpus; it feeds both the LSI training and the
# LSI projection below.
tfidf_corpus = tfidf[BOW_CORPUS]

# Initialize an LSI transformation on top of the TF-IDF vectors.
lsi_model = models.LsiModel(tfidf_corpus, id2word=dictionary)

# Double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi.
corpus_lsi = lsi_model[tfidf_corpus]

# Report the top two topics (output goes to the logging stream).
lsi_model.print_topics(2)

# Print each document's topic assignment next to its original text.
for lsi_vector, raw_text in zip(corpus_lsi, TEXTS):
    print(lsi_vector, raw_text)


# TODO: apply RP, LDA, and HDP and compare results