diff --git a/efo/topic_modeling/gensim_playground.py b/efo/topic_modeling/gensim_playground.py index 8d386f0..b109670 100644 --- a/efo/topic_modeling/gensim_playground.py +++ b/efo/topic_modeling/gensim_playground.py @@ -5,7 +5,8 @@ from pprint import pprint from gensim import models from gensim import corpora -from gensim.parsing.preprocessing import preprocess_documents, preprocess_string +from gensim.parsing.preprocessing import preprocess_documents +from gensim.models import Phrases # some functions like LsiModel.print_topics() outputs to logging streams @@ -16,23 +17,29 @@ # concepts: Document, Corpus, Vector, Model +# TODO analysis: limit the corpus to the task corpus TEXTS = [ 'this is a document about measuring executive functions using cognitive tests.', 'we measured executive functions skills using stop-signal task.', - 'digit span can be used to measure intelligence and executive functions.' + 'digit span can be used to measure intelligence and executive functions.', + 'digit span also comes with different flavours that each taps executive functions differently.', + 'executive functions are not always what we know by digit span, it can be measured differently.', + 'main executive function skills include reglulation and goal-driven behaviors.', + 'digit span is not the only task that matters in executive functions literature' ] - # Steps to be taken: # - load corpus # - preprocessing (tokenize, lower case documents, remove infrequent words, remove stop words, ...) 
- +# TODO analysis 2: treat the task names as phrases CORPUS = preprocess_documents(TEXTS) # pprint(CORPUS) # create dictionary dictionary = corpora.Dictionary(CORPUS) +# bigram = Phrases(CORPUS, min_count=3, threshold=1) +# list(bigram[CORPUS]) # pprint(dictionary.token2id) # bag-of-words for a new doc @@ -83,4 +90,9 @@ print(doc, as_text) +# TODO evaluate the number of topics per task +# TODO topic similarities across tasks (1 task to many topics and then compare topic vectors) + + + # TODO: apply RP, LDA, and HDP and compare results