diff --git a/tfp_coursera/topic_modeling_playground.ipynb b/tfp_coursera/topic_modeling_playground.ipynb
new file mode 100644
index 0000000..c4b88fb
--- /dev/null
+++ b/tfp_coursera/topic_modeling_playground.ipynb
@@ -0,0 +1,572 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "('4.1.2', '0.12.2')"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Install into the environment of the running kernel: `%pip` targets the kernel's\n",
+ "# interpreter, whereas `!pip` runs whichever pip happens to be first on the shell PATH.\n",
+ "%pip install -Uq gensim tomotopy\n",
+ "\n",
+ "import gensim\n",
+ "import tomotopy as tp\n",
+ "from tqdm import tqdm\n",
+ "\n",
+ "# Display the installed versions so the recorded outputs can be tied to them.\n",
+ "gensim.__version__, tp.__version__"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pmid | \n",
+ " doi | \n",
+ " year | \n",
+ " journal_title | \n",
+ " journal_iso_abbreviation | \n",
+ " title | \n",
+ " abstract | \n",
+ " category | \n",
+ " subcategory | \n",
+ " label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1515 | \n",
+ " 26690807 | \n",
+ " 10.1016/j.neuroimage.2015.12.014 | \n",
+ " 2016 | \n",
+ " NeuroImage | \n",
+ " Neuroimage | \n",
+ " When opportunity meets motivation: Neural enga... | \n",
+ " social reward process dopaminergic mediate bra... | \n",
+ " CognitiveConstruct | \n",
+ " RewardProcessing | \n",
+ " RewardProcessing | \n",
+ "
\n",
+ " \n",
+ " 316738 | \n",
+ " 31610410 | \n",
+ " 10.1016/j.psyneuen.2019.104472 | \n",
+ " 2020 | \n",
+ " Psychoneuroendocrinology | \n",
+ " Psychoneuroendocrinology | \n",
+ " Association between sleep duration and executi... | \n",
+ " executive function define set cognitive skill ... | \n",
+ " CognitiveTask | \n",
+ " TMT_-_Trail_Making_Task | \n",
+ " TMT_-_Trail_Making_Task | \n",
+ "
\n",
+ " \n",
+ " 185860 | \n",
+ " 21720531 | \n",
+ " 10.3389/fnevo.2011.00001 | \n",
+ " 2011 | \n",
+ " Frontiers in evolutionary neuroscience | \n",
+ " Front Evol Neurosci | \n",
+ " Can we measure memes? | \n",
+ " meme fundamental unit cultural evolution leave... | \n",
+ " CognitiveConstruct | \n",
+ " Initiation | \n",
+ " Initiation | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pmid doi year \\\n",
+ "1515 26690807 10.1016/j.neuroimage.2015.12.014 2016 \n",
+ "316738 31610410 10.1016/j.psyneuen.2019.104472 2020 \n",
+ "185860 21720531 10.3389/fnevo.2011.00001 2011 \n",
+ "\n",
+ " journal_title journal_iso_abbreviation \\\n",
+ "1515 NeuroImage Neuroimage \n",
+ "316738 Psychoneuroendocrinology Psychoneuroendocrinology \n",
+ "185860 Frontiers in evolutionary neuroscience Front Evol Neurosci \n",
+ "\n",
+ " title \\\n",
+ "1515 When opportunity meets motivation: Neural enga... \n",
+ "316738 Association between sleep duration and executi... \n",
+ "185860 Can we measure memes? \n",
+ "\n",
+ " abstract category \\\n",
+ "1515 social reward process dopaminergic mediate bra... CognitiveConstruct \n",
+ "316738 executive function define set cognitive skill ... CognitiveTask \n",
+ "185860 meme fundamental unit cultural evolution leave... CognitiveConstruct \n",
+ "\n",
+ " subcategory label \n",
+ "1515 RewardProcessing RewardProcessing \n",
+ "316738 TMT_-_Trail_Making_Task TMT_-_Trail_Making_Task \n",
+ "185860 Initiation Initiation "
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load the preprocessed PubMed abstracts and keep a small per-label sample.\n",
+ "\n",
+ "import pandas as pd\n",
+ "\n",
+ "DATA_FRACTION = .01  # fraction of rows sampled from each label\n",
+ "SEED = 42  # fixed so that Restart & Run All draws the same rows every time\n",
+ "\n",
+ "PUBMED = pd.read_csv('../cogtext/data/pubmed_abstracts_preprocessed.csv.gz')\n",
+ "PUBMED['label'] = PUBMED['subcategory']\n",
+ "# Sample inside each label group so every label contributes the same fraction of its rows.\n",
+ "PUBMED = PUBMED.groupby('label').sample(frac=DATA_FRACTION, random_state=SEED)\n",
+ "# Drop rows with a missing abstract; the abstract text is what gets tokenized afterwards.\n",
+ "PUBMED = PUBMED.dropna(subset=['abstract'])\n",
+ "\n",
+ "PUBMED.sample(3, random_state=SEED)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Build a tomotopy corpus from the sampled abstracts and merge frequent n-grams.\n",
+ "docs = PUBMED['abstract'].tolist()\n",
+ "\n",
+ "corpus = tp.utils.Corpus(\n",
+ "    tokenizer=tp.utils.SimpleTokenizer(),\n",
+ ")\n",
+ "corpus.process(docs)\n",
+ "\n",
+ "# Extract candidate phrases, then join each one into a single '_'-delimited token.\n",
+ "phrases = corpus.extract_ngrams(\n",
+ "    min_cf=20,\n",
+ "    min_df=5,\n",
+ "    max_len=10,\n",
+ "    max_cand=1000,\n",
+ "    normalized=True,\n",
+ ")\n",
+ "corpus.concat_ngrams(phrases, delimiter='_')\n",
+ "\n",
+ "# Peek at one processed document.\n",
+ "corpus[2]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Removed top words: ['task', 'patient', 'effect', 'test', 'attention', 'memory', 'group', 'cognitive', 'control', 'performance', 'measure', 'response', 'associate', 'child', 'increase', 'result', 'change', 'relate', 'participant', 'stimulus']\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "training: 100%|██████████| 50/50 [00:14<00:00, 3.53it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "| LDAModel (current version: 0.12.2)\n",
+ "| 3223 docs, 306091 words\n",
+ "| Total Vocabs: 16648, Used Vocabs: 4511\n",
+ "| Entropy of words: 7.72616\n",
+ "| Entropy of term-weighted words: 7.72616\n",
+ "| Removed Vocabs: task patient effect test attention memory group cognitive control performance measure response associate child increase result change relate participant stimulus\n",
+ "|\n",
+ "\n",
+ "| Iterations: 1000, Burn-in steps: 0\n",
+ "| Optimization Interval: 10\n",
+ "| Log-likelihood per word: -7.66864\n",
+ "|\n",
+ "\n",
+ "| tw: TermWeight.ONE\n",
+ "| min_cf: 10 (minimum collection frequency of words)\n",
+ "| min_df: 5 (minimum document frequency of words)\n",
+ "| rm_top: 20 (the number of top words to be removed)\n",
+ "| k: 100 (the number of topics between 1 ~ 32767)\n",
+ "| alpha: [0.1] (hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetric prior and as a list with length `k` of `float` in case of asymmetric prior.)\n",
+ "| eta: 0.01 (hyperparameter of Dirichlet distribution for topic-word)\n",
+ "| seed: 3272832591 (random seed)\n",
+ "| trained in version 0.12.2\n",
+ "|\n",
+ "\n",
+ "| alpha (Dirichlet prior on the per-document topic distributions)\n",
+ "| [0.04364107 0.01068732 0.02508776 0.03441986 0.01844739 0.07034776\n",
+ "| 0.03123164 0.01434407 0.04665712 0.16142593 0.02718705 0.06977499\n",
+ "| 0.03755865 0.07443139 0.09644692 0.02221145 0.03520738 0.02862341\n",
+ "| 0.03875916 0.00740024 0.09041092 0.01950175 0.05477431 0.19435379\n",
+ "| 0.0582406 0.03943058 0.01880877 0.04164614 0.01150252 0.01499599\n",
+ "| 0.01659403 0.01239366 0.0205057 0.10825807 0.0132186 0.02321242\n",
+ "| 0.03782552 0.06962249 0.01563781 0.04341785 0.01747116 0.02702065\n",
+ "| 0.03356255 0.01622541 0.01230451 0.803505 0.03779493 0.03383209\n",
+ "| 0.09095947 0.19676776 0.01925627 0.01540721 0.04677161 0.05433749\n",
+ "| 0.01273535 0.01604453 0.02718877 0.04785496 0.04703167 0.02565646\n",
+ "| 0.02726313 0.01356411 0.17673622 0.01084241 0.01024968 0.02239775\n",
+ "| 0.03813412 0.02604754 0.023192 0.09352055 0.02686057 0.00997292\n",
+ "| 0.03537464 0.08052384 0.01368217 0.014366 0.03178743 0.01446226\n",
+ "| 0.02323432 0.07920136 0.02016016 0.01608375 0.0120501 0.02995271\n",
+ "| 0.05394103 0.03003465 0.01996591 0.03699729 0.0255818 0.04365746\n",
+ "| 0.01982757 0.04235343 0.04885878 0.03152624 0.0164482 0.0290758\n",
+ "| 0.04606192 0.02331102 0.0384533 0.04088598]\n",
+ "| eta (Dirichlet prior on the per-topic word distribution)\n",
+ "| 0.01\n",
+ "|\n",
+ "\n",
+ "| #0 (2986) : inhibition response_inhibition inhibitory_control stop impulsivity\n",
+ "| #1 (1184) : pd parkinson_disease pd_patient motor dbs\n",
+ "| #2 (1677) : motivation action effort reward cpt\n",
+ "| #3 (2314) : auditory visual perception sound sensory\n",
+ "| #4 (999) : r number inhibition s lie\n",
+ "| #5 (3681) : model base analysis different datum\n",
+ "| #6 (2372) : reward decision_making decision choice behavior\n",
+ "| #7 (1073) : injury tbi head traumatic_brain moral\n",
+ "| #8 (2837) : age old old_adult adult young\n",
+ "| #9 (7538) : use provide method tool datum\n",
+ "| #10 (3016) : neuron cell spike channel activity\n",
+ "| #11 (3482) : reaction_time trial accuracy slow fast\n",
+ "| #12 (2484) : lesion system nucleus dorsal pathway\n",
+ "| #13 (4177) : disease disorder onset clinical syndrome\n",
+ "| #14 (5125) : experiment model process information account\n",
+ "| #15 (1643) : surgery case cerebral surgical postoperative\n",
+ "| #16 (2470) : emotional emotion affective negative face\n",
+ "| #17 (2939) : adhd hyperactivity attention_deficit disorder symptom\n",
+ "| #18 (3154) : brain volume mri structural lesion\n",
+ "| #19 (856) : smoker nicotine smoking smoke craving\n",
+ "| #20 (5716) : p score subject assess scale\n",
+ "| #21 (1829) : mrna brain expression gut acid\n",
+ "| #22 (4467) : rat administration dose drug day\n",
+ "| #23 (12672) : function deficit executive executive_function impairment\n",
+ "| #24 (3082) : adolescent adult early development age\n",
+ "| #25 (2214) : attentional information threat bias processing\n",
+ "| #26 (1315) : sleep pain ds chronic fatigue\n",
+ "| #27 (3506) : schizophrenia symptom deficit psychosis patient_schizophrenia\n",
+ "| #28 (915) : epilepsy seizure hd sign tle\n",
+ "| #29 (1615) : mutation gene variant genetic protein\n",
+ "| #30 (1509) : use user cocaine drug cannabis\n",
+ "| #31 (1295) : white_matter fa integrity tract cp\n",
+ "| #32 (1200) : n hc compare non es\n",
+ "| #33 (4691) : high low individual level state\n",
+ "| #34 (1280) : food weight obesity bmi eat\n",
+ "| #35 (1365) : d c b intensity acute\n",
+ "| #36 (2645) : depression symptom mood depressive_symptom depressive\n",
+ "| #37 (3946) : social self people experience support\n",
+ "| #38 (1357) : tumor glioma treatment brain_tumor brain\n",
+ "| #39 (5041) : mouse receptor induce protein dopamine\n",
+ "| #40 (1395) : mindfulness meditation thought experience self\n",
+ "| #41 (1655) : stress psychological cope caregiver health\n",
+ "| #42 (1855) : condition dual_task walk gait experiment\n",
+ "| #43 (1269) : asd disorder autism td autistic\n",
+ "| #44 (1122) : speech suicide vwm noise suicidal\n",
+ "| #45 (37001) : finding suggest present find different\n",
+ "| #46 (2553) : motor movement hand action execution\n",
+ "| #47 (3329) : network connectivity functional brain functional_connectivity\n",
+ "| #48 (4707) : signal activity process event temporal\n",
+ "| #49 (11143) : research review approach provide process\n",
+ "| #50 (1386) : reading read processing sentence word\n",
+ "| #51 (881) : ms sequence category implicit explicit\n",
+ "| #52 (4167) : ef parent behavior problem family\n",
+ "| #53 (4081) : target cue location spatial attentional\n",
+ "| #54 (697) : figure bias ability matrix complex\n",
+ "| #55 (1464) : infant maternal mother birth prenatal\n",
+ "| #56 (2808) : ad dementia mci alzheimer_disease impairment\n",
+ "| #57 (2911) : object visual face novel processing\n",
+ "| #58 (2942) : recall retrieval episodic_memory item word\n",
+ "| #59 (1782) : conflict error trial monitoring congruent\n",
+ "| #60 (1890) : word interference stroop color present\n",
+ "| #61 (1189) : capacity functional cbt al domain\n",
+ "| #62 (10154) : year association score predict low\n",
+ "| #63 (1339) : genotype association polymorphism genetic gene\n",
+ "| #64 (766) : mdd apathy fc cortisol axis\n",
+ "| #65 (1730) : level plasma csf blood β\n",
+ "| #66 (2108) : learning learn feedback skill new\n",
+ "| #67 (2546) : eeg activity power theta alpha\n",
+ "| #68 (1367) : context cost switch cue repeat\n",
+ "| #69 (7060) : treatment intervention week improve month\n",
+ "| #70 (1886) : pfc activation dlpfc activity prefrontal_cortex\n",
+ "| #71 (750) : creativity creative pa thinking clozapine\n",
+ "| #72 (2488) : risk ci hiv prevalence use\n",
+ "| #73 (7708) : activation region cortex right area\n",
+ "| #74 (1017) : aggression social aggressive offender li\n",
+ "| #75 (1312) : alcohol drinking alcohol_use drink substance_use\n",
+ "| #76 (2292) : right left orientation body space\n",
+ "| #77 (1210) : rat da se l animal\n",
+ "| #78 (1888) : language word semantic bilingual fluency\n",
+ "| #79 (3829) : time delay duration interval day\n",
+ "| #80 (1177) : male female sex sexual woman\n",
+ "| #81 (1750) : induce aβ amyloid expression hippocampal\n",
+ "| #82 (906) : level t woman hormone testosterone\n",
+ "| #83 (1742) : shift flexibility ability cognitive_flexibility inhibition\n",
+ "| #84 (4638) : care need service health mental_health\n",
+ "| #85 (2135) : student school grade academic knowledge\n",
+ "| #86 (1847) : saccade target eye eye_movement gaze\n",
+ "| #87 (3539) : cell brain gene neuronal function\n",
+ "| #88 (1774) : stroke disorder ocd symptom cluster\n",
+ "| #89 (2929) : factor scale assess questionnaire validity\n",
+ "| #90 (1279) : target search item distractor visual_search\n",
+ "| #91 (3367) : amplitude erp potential event_relate component\n",
+ "| #92 (2958) : work_memory wm load information capacity\n",
+ "| #93 (2388) : training exercise session improvement train\n",
+ "| #94 (1379) : stimulation tdcs site nerve tms\n",
+ "| #95 (1722) : ii score iii sensitivity sample\n",
+ "| #96 (2905) : study identify include psychosocial review\n",
+ "| #97 (1866) : anxiety ptsd mood trauma negative\n",
+ "| #98 (2002) : strategy problem problem_solve skill reasoning\n",
+ "| #99 (2444) : area cortical pattern cortex spatial\n",
+ "|\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Fit a 100-topic LDA model on the n-gram-merged corpus.\n",
+ "N_ITERATIONS = 1000\n",
+ "ITERATIONS_PER_STEP = 20\n",
+ "\n",
+ "mdl = tp.LDAModel(\n",
+ "    k=100,\n",
+ "    min_cf=10,\n",
+ "    min_df=5,\n",
+ "    rm_top=20,\n",
+ "    corpus=corpus,\n",
+ ")\n",
+ "# Alternative tomotopy models kept for experimentation:\n",
+ "# mdl = tp.CTModel(tw=tp.TermWeight.IDF, min_df=5, rm_top=20, k=30, corpus=corpus)\n",
+ "# mdl.num_beta_sample = 5\n",
+ "# mdl = tp.HDPModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5)\n",
+ "# mdl = tp.HLDAModel(tw=tp.TermWeight.ONE, min_df=10, depth=4, rm_top=10)\n",
+ "\n",
+ "# Zero-iteration call made before reading `removed_top_words`;\n",
+ "# presumably this initializes the model without sampling -- TODO confirm in the tomotopy docs.\n",
+ "mdl.train(0)\n",
+ "print('Removed top words:', mdl.removed_top_words)\n",
+ "\n",
+ "# Train in fixed-size chunks so that tqdm can report progress between chunks.\n",
+ "for _chunk in tqdm(range(N_ITERATIONS // ITERATIONS_PER_STEP), 'training'):\n",
+ "    mdl.train(ITERATIONS_PER_STEP)\n",
+ "\n",
+ "mdl.summary()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+ "To disable this warning, you can either:\n",
+ "\t- Avoid using `tokenizers` before the fork if possible\n",
+ "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Batches: 100%|██████████| 101/101 [04:01<00:00, 2.40s/it]\n",
+ "2021-10-14 11:12:34,988 - BERTopic - Transformed documents to Embeddings\n",
+ "2021-10-14 11:12:45,497 - BERTopic - Reduced dimensionality with UMAP\n",
+ "2021-10-14 11:12:46,054 - BERTopic - Clustered UMAP embeddings with HDBSCAN\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Topic | \n",
+ " Count | \n",
+ " Name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " -1 | \n",
+ " 1075 | \n",
+ " -1_task_memory_attention_cognitive | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 140 | \n",
+ " 0_child_age_work memory_cognitive | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 138 | \n",
+ " 1_adhd_attention deficit hyperactivity_deficit... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 119 | \n",
+ " 2_schizophrenia_cognitive_symptom_patient schi... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 3 | \n",
+ " 87 | \n",
+ " 3_neuron_rat_dopamine_inhibition | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 59 | \n",
+ " 58 | \n",
+ " 12 | \n",
+ " 58_schizophrenia_receptor_nmdar_nmda | \n",
+ "
\n",
+ " \n",
+ " 60 | \n",
+ " 59 | \n",
+ " 11 | \n",
+ " 59_inhibition_stop signal_motor inhibition_res... | \n",
+ "
\n",
+ " \n",
+ " 61 | \n",
+ " 60 | \n",
+ " 11 | \n",
+ " 60_categorical perception_categorical_stimulus... | \n",
+ "
\n",
+ " \n",
+ " 62 | \n",
+ " 61 | \n",
+ " 11 | \n",
+ " 61_bilingual_bilingualism_monolingual_bilingua... | \n",
+ "
\n",
+ " \n",
+ " 63 | \n",
+ " 62 | \n",
+ " 10 | \n",
+ " 62_reward_stimulus_prosocial_attention | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
64 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Topic Count Name\n",
+ "0 -1 1075 -1_task_memory_attention_cognitive\n",
+ "1 0 140 0_child_age_work memory_cognitive\n",
+ "2 1 138 1_adhd_attention deficit hyperactivity_deficit...\n",
+ "3 2 119 2_schizophrenia_cognitive_symptom_patient schi...\n",
+ "4 3 87 3_neuron_rat_dopamine_inhibition\n",
+ ".. ... ... ...\n",
+ "59 58 12 58_schizophrenia_receptor_nmdar_nmda\n",
+ "60 59 11 59_inhibition_stop signal_motor inhibition_res...\n",
+ "61 60 11 60_categorical perception_categorical_stimulus...\n",
+ "62 61 11 61_bilingual_bilingualism_monolingual_bilingua...\n",
+ "63 62 10 62_reward_stimulus_prosocial_attention\n",
+ "\n",
+ "[64 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# BERTopic on the raw abstracts (pubmed_abstracts.csv.gz, not the preprocessed file).\n",
+ "# `%pip` installs into the running kernel's environment, unlike `!pip`.\n",
+ "%pip install -Uq bertopic\n",
+ "\n",
+ "import pandas as pd\n",
+ "from bertopic import BERTopic\n",
+ "\n",
+ "DATA_FRACTION = .2  # fraction of rows sampled from each label\n",
+ "SEED = 42  # fixed so that the same rows are drawn on every run\n",
+ "\n",
+ "PUBMED = pd.read_csv('../cogtext/data/pubmed_abstracts.csv.gz')\n",
+ "PUBMED['label'] = PUBMED['subcategory']\n",
+ "PUBMED = PUBMED.groupby('label').sample(frac=DATA_FRACTION, random_state=SEED)\n",
+ "PUBMED = PUBMED.dropna(subset=['abstract'])\n",
+ "\n",
+ "model = BERTopic(verbose=True, n_gram_range=(1,3), calculate_probabilities=True)\n",
+ "# The text column is 'abstract' -- the one dropna() filters on just above. The\n",
+ "# previous key 'abstracts' is not among the columns displayed for the preprocessed\n",
+ "# table and would raise KeyError (assuming the raw CSV shares that schema).\n",
+ "# A plain list of strings is passed so the sampled frame's non-contiguous index is not carried along.\n",
+ "topics, scores = model.fit_transform(PUBMED['abstract'].to_list())\n",
+ "model.get_topic_info()"
+ ]
+ }
+ ],
+ "metadata": {
+ "interpreter": {
+ "hash": "266722041ed6426a0a88c0d75e9dd39659f44e3a6fea07300cd13bea36eb387d"
+ },
+ "kernelspec": {
+ "display_name": "Python 3.9.7 64-bit ('py3': conda)",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.7"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}