In [1]:
!pip install gensim -Uq
!pip install tomotopy -Uq
import gensim
import tomotopy as tp
from tqdm import tqdm

gensim.__version__, tp.__version__

('4.1.2', '0.12.2')

In [2]:
# data
#
# Load the preprocessed PubMed abstracts, draw a label-stratified sample
# (DATA_FRACTION of every subcategory) and drop rows without an abstract.

import pandas as pd

DATA_FRACTION = .01
# Fixed seed: without it every run draws a different sample, so the corpus
# and every downstream topic model change between runs.
RANDOM_STATE = 0
PUBMED = pd.read_csv('../cogtext/data/pubmed_abstracts_preprocessed.csv.gz')
PUBMED['label'] = PUBMED['subcategory']
PUBMED = PUBMED.groupby('label').sample(frac=DATA_FRACTION, random_state=RANDOM_STATE)
PUBMED = PUBMED.dropna(subset=['abstract'])

# Preview a few rows (seeded so the displayed rows are stable across runs).
PUBMED.sample(3, random_state=RANDOM_STATE)

Unnamed: 0,pmid,doi,year,journal_title,journal_iso_abbreviation,title,abstract,category,subcategory,label
1515,26690807,10.1016/j.neuroimage.2015.12.014,2016,NeuroImage,Neuroimage,When opportunity meets motivation: Neural enga...,social reward process dopaminergic mediate bra...,CognitiveConstruct,RewardProcessing,RewardProcessing
316738,31610410,10.1016/j.psyneuen.2019.104472,2020,Psychoneuroendocrinology,Psychoneuroendocrinology,Association between sleep duration and executi...,executive function define set cognitive skill ...,CognitiveTask,TMT_-_Trail_Making_Task,TMT_-_Trail_Making_Task
185860,21720531,10.3389/fnevo.2011.00001,2011,Frontiers in evolutionary neuroscience,Front Evol Neurosci,Can we measure memes?,meme fundamental unit cultural evolution leave...,CognitiveConstruct,Initiation,Initiation


In [3]:
# Build a tomotopy corpus from the sampled abstracts, then join the extracted
# n-gram candidates into single tokens using '_' as the delimiter.
NGRAM_OPTIONS = dict(min_cf=20, min_df=5, max_len=10, max_cand=1000, normalized=True)

docs = PUBMED['abstract'].tolist()

corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer())
corpus.process(docs)

phrases = corpus.extract_ngrams(**NGRAM_OPTIONS)
corpus.concat_ngrams(phrases, delimiter='_')

# Inspect one processed document as a sanity check.
corpus[2]

<tomotopy.Document with words="startle inhibition weak lead stimulus prepulse inhibition ppi study understand neurobiology information_processing patient community comparison_subject ccs ppi strong genetic basis infrahuman evidence heritability stability reliability human ppi gain increase use endophenotype identify vulnerability gene brain disorder include schizophrenia genetic study employ multiple geographically dispersed test site accommodate need large complex sample assess feasibility ppi multi site study site investigation multiple measure consortium genetic schizophrenia conduct methodological acoustic startle ppi ccs method manualize videotape standardize site intensive person training session equipment acquire program ppi site ucsd stringent quality assurance qa procedure testing complete ccs year primary startle dependent measure eyeblink startle magnitude habituation peak latency latency facilitation ppi analysis_identify significant variability site primary measure determi

In [5]:


# Fit a 100-topic LDA model on the n-gram-merged corpus.
TOTAL_ITERATIONS = 1000
ITERATIONS_PER_STEP = 20

mdl = tp.LDAModel(corpus=corpus, k=100, min_cf=10, min_df=5, rm_top=20)
# Alternative models that were tried (kept for reference):
# mdl = tp.CTModel(tw=tp.TermWeight.IDF, min_df=5, rm_top=20, k=30, corpus=corpus)
# mdl.num_beta_sample = 5
# mdl = tp.HDPModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5)
# mdl = tp.HLDAModel(tw=tp.TermWeight.ONE, min_df=10, depth=4, rm_top=10)

# Zero-iteration call before reading `removed_top_words`; presumably this
# prepares the model so the removed-words list is available -- confirm
# against the tomotopy docs.
mdl.train(0)

print('Removed top words:', mdl.removed_top_words)

# Train in fixed-size steps so tqdm can report progress.
for _step in tqdm(range(TOTAL_ITERATIONS // ITERATIONS_PER_STEP), desc='training'):
    mdl.train(ITERATIONS_PER_STEP)

mdl.summary()

Removed top words: ['task', 'patient', 'effect', 'test', 'attention', 'memory', 'group', 'cognitive', 'control', 'performance', 'measure', 'response', 'associate', 'child', 'increase', 'result', 'change', 'relate', 'participant', 'stimulus']


training: 100%|██████████| 50/50 [00:14<00:00, 3.53it/s]

<Basic Info>
| LDAModel (current version: 0.12.2)
| 3223 docs, 306091 words
| Total Vocabs: 16648, Used Vocabs: 4511
| Entropy of words: 7.72616
| Entropy of term-weighted words: 7.72616
| Removed Vocabs: task patient effect test attention memory group cognitive control performance measure response associate child increase result change relate participant stimulus
|
<Training Info>
| Iterations: 1000, Burn-in steps: 0
| Optimization Interval: 10
| Log-likelihood per word: -7.66864
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 10 (minimum collection frequency of words)
| min_df: 5 (minimum document frequency of words)
| rm_top: 20 (the number of top words to be removed)
| k: 100 (the number of topics between 1 ~ 32767)
| alpha: [0.1] (hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetric prior and as a list with length `k` of `float` in case of asymmetric prior.)
| eta: 0.01 (hyperparameter of Dirichlet distribution for topic




In [20]:
!pip install bertopic -Uq

import pandas as pd
from bertopic import BERTopic

DATA_FRACTION = .2
PUBMED = pd.read_csv('../cogtext/data/pubmed_abstracts.csv.gz')
PUBMED['label'] = PUBMED['subcategory']
PUBMED = PUBMED.groupby('label').sample(frac=DATA_FRACTION)
PUBMED = PUBMED.dropna(subset=['abstract'])

model = BERTopic(verbose=True, n_gram_range=(1,3), calculate_probabilities=True)
topics, scores = model.fit_transform(PUBMED['abstracts'])
model.get_topic_info()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Batches: 100%|██████████| 101/101 [04:01<00:00, 2.40s/it]
2021-10-14 11:12:34,988 - BERTopic - Transformed documents to Embeddings
2021-10-14 11:12:45,497 - BERTopic - Reduced dimensionality with UMAP
2021-10-14 11:12:46,054 - BERTopic - Clustered UMAP embeddings with HDBSCAN


Unnamed: 0,Topic,Count,Name
0,-1,1075,-1_task_memory_attention_cognitive
1,0,140,0_child_age_work memory_cognitive
2,1,138,1_adhd_attention deficit hyperactivity_deficit...
3,2,119,2_schizophrenia_cognitive_symptom_patient schi...
4,3,87,3_neuron_rat_dopamine_inhibition
...,...,...,...
59,58,12,58_schizophrenia_receptor_nmdar_nmda
60,59,11,59_inhibition_stop signal_motor inhibition_res...
61,60,11,60_categorical perception_categorical_stimulus...
62,61,11,61_bilingual_bilingualism_monolingual_bilingua...
