#%%
# pip install -U gensim
from pprint import pprint

from gensim import corpora
from gensim import models
from gensim.models import Phrases
from gensim.parsing.preprocessing import preprocess_documents, preprocess_string

# some functions like LsiModel.print_topics() output to logging streams
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# concepts: Document, Corpus, Vector, Model

# TODO analysis 1: limit the corpus to the task corpus

TEXTS = [
    'this is a document about measuring executive functions using cognitive tests.',
    'we measured executive functions skills using the stop-signal task.',
    'digit span can be used to measure intelligence and executive functions.',
    'digit span also comes in different flavours, each of which taps executive functions differently.',
    'executive functions are not always what we know by digit span; they can be measured differently.',
    'main executive function skills include regulation and goal-driven behaviors.',
    'digit span is not the only task that matters in the executive functions literature.'
]

# Steps to be taken:
# - load the corpus
# - preprocess (tokenize, lowercase, remove infrequent words, remove stop words, ...)

# TODO analysis 2: treat the task names as phrases (see the Phrases sketch at the end)

CORPUS = preprocess_documents(TEXTS)
# pprint(CORPUS)
# pprint(type(CORPUS))  # sanity check: a list of token lists

# create the dictionary (token -> id mapping)
dictionary = corpora.Dictionary(CORPUS)
# pprint(dictionary.token2id)

# optional: detect frequent bigrams as candidate phrases
# bigram = Phrases(CORPUS, min_count=3, threshold=1)
# pprint(list(bigram[CORPUS]))

# bag-of-words for a new document
# new_doc = "executive function task/test"
# new_vec = dictionary.doc2bow(preprocess_string(new_doc))
# pprint(new_vec)  # list of (token_id, count) pairs

# convert the entire corpus to bag-of-words vectors
BOW_CORPUS = [dictionary.doc2bow(doc) for doc in CORPUS]
# pprint(BOW_CORPUS)

# train the TF-IDF model
tfidf = models.TfidfModel(BOW_CORPUS)

# transform a query string to bag-of-words, then to TF-IDF
# query = 'digit_span'
# query_bow = dictionary.doc2bow(preprocess_string(query))
# pprint(tfidf[query_bow])

# similarity index over the TF-IDF corpus, and querying it
# from gensim.similarities import SparseMatrixSimilarity
# index = SparseMatrixSimilarity(tfidf[BOW_CORPUS], num_features=len(dictionary))
# sims = index[tfidf[query_bow]]
# print(f'Similarities (query: "{query}"):\n-------------')
# print('doc_index', 'score', sep='\t')
# for doc_index, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
#     print(doc_index, score, sep='\t')

## LSI

# initialize an LSI transformation on top of the TF-IDF corpus
lsi_model = models.LsiModel(tfidf[BOW_CORPUS], id2word=dictionary)

# create a double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi
corpus_lsi = lsi_model[tfidf[BOW_CORPUS]]

# print the top two topics (output goes to the logging stream)
lsi_model.print_topics(2)

# print doc-topic assignments alongside the original texts
for doc, as_text in zip(corpus_lsi, TEXTS):
    print(doc, as_text)

# TODO: evaluate the number of topics per task
# TODO: topic similarities across tasks (map one task to many topics, then compare topic vectors)
# TODO: apply RP, LDA, and HDP and compare results (see the sketch at the end)
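
## Phrases (sketch for TODO analysis 2)
# A minimal sketch of folding task names into single tokens with gensim's Phrases,
# assuming the tiny toy corpus above. min_count=1 and threshold=1 are illustrative
# values chosen only so bigrams like "digit span" survive in so few documents;
# they are not recommendations for a real corpus.
bigram = Phrases(CORPUS, min_count=1, threshold=1)
PHRASED_CORPUS = [bigram[doc] for doc in CORPUS]

# rebuild the dictionary and bag-of-words corpus on the phrased tokens
phrased_dictionary = corpora.Dictionary(PHRASED_CORPUS)
PHRASED_BOW = [phrased_dictionary.doc2bow(doc) for doc in PHRASED_CORPUS]
pprint(phrased_dictionary.token2id)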
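
## RP / LDA / HDP (sketch for the last TODO)
# A rough sketch toward the "apply RP, LDA, and HDP and compare results" TODO.
# num_topics=2, passes=20, and random_state=0 are assumed settings for this toy
# corpus, not tuned values; on a corpus this small the topics will be noisy.
rp_model = models.RpModel(tfidf[BOW_CORPUS], num_topics=2)   # random projections over TF-IDF
corpus_rp = rp_model[tfidf[BOW_CORPUS]]

lda_model = models.LdaModel(BOW_CORPUS, id2word=dictionary,
                            num_topics=2, passes=20, random_state=0)  # LDA uses raw counts
hdp_model = models.HdpModel(BOW_CORPUS, id2word=dictionary)  # HDP infers the number of topics

lda_model.print_topics(2)
hdp_model.print_topics(num_topics=5, num_words=5)

# compare per-document topic vectors from LSI and LDA for the same documents
for lsi_vec, lda_vec, as_text in zip(corpus_lsi, lda_model[BOW_CORPUS], TEXTS):
    print(as_text)
    print('  LSI:', lsi_vec)
    print('  LDA:', lda_vec)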