# notebooks/topic_modeling/gensim_playground.py
#%%

# pip install -U gensim

from pprint import pprint
from gensim import models
from gensim import corpora
from gensim.parsing.preprocessing import preprocess_documents, preprocess_string


# Some gensim functions, such as LsiModel.print_topics(), output to logging
# streams, so configure logging at INFO level to make that output visible.
import logging

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)



# Concepts exercised in this playground: Document, Corpus, Vector, Model.

# Toy collection of raw documents; each string is treated as one document.
TEXTS = [
    'this is a document about measuring executive functions using cognitive tests.',
    'we measured executive functions skills using stop-signal task.',
    'digit span can be used to measure intelligence and executive functions.',
]


# Steps to be taken:
#   1. load the corpus
#   2. preprocess it (tokenize, lower-case documents, remove infrequent
#      words, remove stop words, ...)

# Preprocess every raw document in TEXTS.
CORPUS = preprocess_documents(TEXTS)
# pprint(CORPUS)

# Build the dictionary from the preprocessed documents.
dictionary = corpora.Dictionary(documents=CORPUS)
# pprint(dictionary.token2id)

# Example: bag-of-words for a new document
# new_doc = "executive function task/test"
# new_vec = dictionary.doc2bow(preprocess_string(new_doc))
# pprint(new_vec)    # (id, count)


# Convert the entire corpus to bag-of-words, one vector per document.
BOW_CORPUS = list(map(dictionary.doc2bow, CORPUS))
# pprint(BOW_CORPUS)

# TF-IDF: train the model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus=BOW_CORPUS)


# Debug aid: print the type of the object holding the preprocessed corpus.
pprint(type(CORPUS))

# Example: transform a query string to bag-of-words, then to TF-IDF weights
# query = 'digit_span'
# query_bow = dictionary.doc2bow(preprocess_string(query))
# pprint(tfidf[query_bow])


# Example: build a similarity index over the TF-IDF corpus and query it
# (num_features is taken from the dictionary instead of being hard-coded)
# from gensim.similarities import SparseMatrixSimilarity
# index = SparseMatrixSimilarity(tfidf[BOW_CORPUS], num_features=len(dictionary))
# sims = index[tfidf[query_bow]]
# sims
# print(f'Similarities (query: "{query}"):\n-------------')
# print('doc_index','score', sep='\t')
# for doc_index, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
#     print(doc_index, score, sep='\t')


## LSI
# TF-IDF view of the bag-of-words corpus, used both to train the LSI model
# and as the input that is projected into LSI space afterwards.
tfidf_corpus = tfidf[BOW_CORPUS]

# Initialize an LSI transformation from the TF-IDF corpus.
lsi_model = models.LsiModel(corpus=tfidf_corpus, id2word=dictionary)

# Double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi.
corpus_lsi = lsi_model[tfidf_corpus]

# Top two topics; the output goes to the logging stream configured earlier.
lsi_model.print_topics(2)

# Print each document's LSI representation next to its original text.
for doc_topics, text in zip(corpus_lsi, TEXTS):
    print(doc_topics, text)


# TODO: apply RP, LDA, and HDP and compare results