# notebooks/efo/topic_modeling/gensim_playground.py
# Author: Morteza Ansarinia — 6 Mar 2021
#%%

# pip install -U gensim

from pprint import pprint
from gensim import models
from gensim import corpora
from gensim.parsing.preprocessing import preprocess_documents
from gensim.models import Phrases


# Some gensim calls (e.g. LsiModel.print_topics) emit their output through the
# logging system rather than stdout, so configure a visible INFO-level logger.
import logging
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)



# concepts: Document, Corpus, Vector, Model

# TODO analysis: limit the corpus to the task corpus
# Toy corpus: seven short documents about executive functions and the
# cognitive tasks used to measure them (stop-signal, digit span, ...).
# NOTE(review): fixed the typo "reglulation" -> "regulation" and a doubled
# space in document 5 — both would otherwise produce noise tokens after
# preprocessing and corrupt the topic vocabulary.
TEXTS = [
  'this is a document about measuring executive functions using cognitive tests.',
  'we measured executive functions skills using stop-signal task.',
  'digit span can be used to measure intelligence and executive functions.',
  'digit span also comes with different flavours that each taps executive functions differently.',
  'executive functions are not always what we know by digit span, it can be measured differently.',
  'main executive function skills include regulation and goal-driven behaviors.',
  'digit span is not the only task that matters in executive functions literature'
]

# Pipeline steps:
# - load the corpus
# - preprocess it (tokenize, lowercase, drop infrequent words, drop stop words, ...)

# TODO analysis 2: treat multi-word task names (e.g. "digit span") as single phrase tokens
CORPUS = preprocess_documents(TEXTS)
# pprint(CORPUS)

# Map every unique token in the preprocessed corpus to an integer id.
dictionary = corpora.Dictionary(CORPUS)
# Exploration: bigram/phrase detection, e.g.
#   bigram = Phrases(CORPUS, min_count=3, threshold=1)
#   list(bigram[CORPUS])
# pprint(dictionary.token2id)

# Example: bag-of-words encoding of an unseen document
# new_doc = "executive function task/test"
# new_vec = dictionary.doc2bow(preprocess_string(new_doc))
# pprint(new_vec)    # (id, count)


# Encode every preprocessed document as a sparse list of (token_id, count) pairs.
BOW_CORPUS = list(map(dictionary.doc2bow, CORPUS))
# pprint(BOW_CORPUS)

# Fit a TF-IDF weighting model on the bag-of-words corpus.
tfidf = models.TfidfModel(BOW_CORPUS)


# Sanity check: preprocess_documents should have returned a plain list.
pprint(type(CORPUS))
# Example: transform a query string to bow, then into tf-idf space
# query = 'digit_span'
# query_bow = dictionary.doc2bow(preprocess_string(query))
# pprint(tfidf[query_bow])


# similarity matrix and querying it
# from gensim.similarities import SparseMatrixSimilarity
# index = SparseMatrixSimilarity(tfidf[BOW_CORPUS], num_features=13)
# sims = index[tfidf[query_bow]]
# sims
# print(f'Similarities (query: "{query}"):\n-------------')
# print('doc_index','score', sep='\t')
# for doc_index, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
#     print(doc_index, score, sep='\t')


## LSI
# Fit an LSI (latent semantic indexing) transformation on top of tf-idf.
lsi_model = models.LsiModel(tfidf[BOW_CORPUS], id2word=dictionary)

# Chain the transformations over the original corpus: bow -> tfidf -> lsi
corpus_lsi = lsi_model[tfidf[BOW_CORPUS]]

# Show the top two topics (output goes to the logging stream configured above).
lsi_model.print_topics(2)

# Print each document's topic weights next to its raw text.
for topic_vector, raw_text in zip(corpus_lsi, TEXTS):
    print(topic_vector, raw_text)


# TODO: evaluate the number of topics per task
# TODO topic similarities across tasks (1 task to many topics and then compare topic vectors)



# TODO: apply RP, LDA, and HDP and compare results