#%%
# pip install -U gensim

# Walk-through of the basic gensim concepts: Document, Corpus, Vector, Model.
# A tiny three-document corpus is preprocessed, converted to bag-of-words,
# re-weighted with TF-IDF and finally projected with an LSI model.

import logging
from pprint import pprint

from gensim import corpora, models
from gensim.parsing.preprocessing import preprocess_documents, preprocess_string

# Some functions, like LsiModel.print_topics(), output to logging streams,
# so logging has to be configured for their results to be visible.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Raw documents; each string is one document of the corpus.
TEXTS = [
    'this is a document about measuring executive functions using cognitive tests.',
    'we measured executive functions skills using stop-signal task.',
    'digit span can be used to measure intelligence and executive functions.',
]

# Steps to be taken:
# - load corpus
# - preprocessing (tokenize, lower case documents, remove infrequent words,
#   remove stop words, ...)
CORPUS = preprocess_documents(TEXTS)
# pprint(CORPUS)

# Create the dictionary (token <-> integer id mapping) from the corpus.
dictionary = corpora.Dictionary(CORPUS)
# pprint(dictionary.token2id)

# Bag-of-words for a new doc:
# new_doc = "executive function task/test"
# new_vec = dictionary.doc2bow(preprocess_string(new_doc))
# pprint(new_vec)  # (id, count)

# Convert the entire corpus to bag-of-words vectors.
BOW_CORPUS = list(map(dictionary.doc2bow, CORPUS))
# pprint(BOW_CORPUS)

# TF-IDF model: train it on the bag-of-words corpus.
tfidf = models.TfidfModel(BOW_CORPUS)

pprint(type(CORPUS))

# Transform a query string to bow:
# query = 'digit_span'
# query_bow = dictionary.doc2bow(preprocess_string(query))
# pprint(tfidf[query_bow])

# Similarity matrix and querying it:
# from gensim.similarities import SparseMatrixSimilarity
# index = SparseMatrixSimilarity(tfidf[BOW_CORPUS], num_features=13)
# sims = index[tfidf[query_bow]]
# sims
# print(f'Similarities (query: "{query}"):\n-------------')
# print('doc_index','score', sep='\t')
# for doc_index, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
#     print(doc_index, score, sep='\t')

## LSI

# Initialize an LSI transformation on top of the TF-IDF weighted corpus.
lsi_model = models.LsiModel(tfidf[BOW_CORPUS], id2word=dictionary)

# Create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
corpus_lsi = lsi_model[tfidf[BOW_CORPUS]]

# Print top two topics (emitted through the logging stream configured above).
lsi_model.print_topics(2)

# Print doc-topic assignments next to the raw text of each document.
for doc, as_text in zip(corpus_lsi, TEXTS):
    print(doc, as_text)

# TODO: apply RP, LDA, and HDP and compare results