diff --git a/tfp_coursera/topic_modeling_playground.ipynb b/tfp_coursera/topic_modeling_playground.ipynb deleted file mode 100644 index c4b88fb..0000000 --- a/tfp_coursera/topic_modeling_playground.ipynb +++ /dev/null @@ -1,572 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('4.1.2', '0.12.2')" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "!pip install gensim -Uq\n", - "!pip install tomotopy -Uq\n", - "import gensim\n", - "import tomotopy as tp\n", - "from tqdm import tqdm\n", - "\n", - "gensim.__version__, tp.__version__" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pmiddoiyearjournal_titlejournal_iso_abbreviationtitleabstractcategorysubcategorylabel
15152669080710.1016/j.neuroimage.2015.12.0142016NeuroImageNeuroimageWhen opportunity meets motivation: Neural enga...social reward process dopaminergic mediate bra...CognitiveConstructRewardProcessingRewardProcessing
3167383161041010.1016/j.psyneuen.2019.1044722020PsychoneuroendocrinologyPsychoneuroendocrinologyAssociation between sleep duration and executi...executive function define set cognitive skill ...CognitiveTaskTMT_-_Trail_Making_TaskTMT_-_Trail_Making_Task
1858602172053110.3389/fnevo.2011.000012011Frontiers in evolutionary neuroscienceFront Evol NeurosciCan we measure memes?meme fundamental unit cultural evolution leave...CognitiveConstructInitiationInitiation
\n", - "
" - ], - "text/plain": [ - " pmid doi year \\\n", - "1515 26690807 10.1016/j.neuroimage.2015.12.014 2016 \n", - "316738 31610410 10.1016/j.psyneuen.2019.104472 2020 \n", - "185860 21720531 10.3389/fnevo.2011.00001 2011 \n", - "\n", - " journal_title journal_iso_abbreviation \\\n", - "1515 NeuroImage Neuroimage \n", - "316738 Psychoneuroendocrinology Psychoneuroendocrinology \n", - "185860 Frontiers in evolutionary neuroscience Front Evol Neurosci \n", - "\n", - " title \\\n", - "1515 When opportunity meets motivation: Neural enga... \n", - "316738 Association between sleep duration and executi... \n", - "185860 Can we measure memes? \n", - "\n", - " abstract category \\\n", - "1515 social reward process dopaminergic mediate bra... CognitiveConstruct \n", - "316738 executive function define set cognitive skill ... CognitiveTask \n", - "185860 meme fundamental unit cultural evolution leave... CognitiveConstruct \n", - "\n", - " subcategory label \n", - "1515 RewardProcessing RewardProcessing \n", - "316738 TMT_-_Trail_Making_Task TMT_-_Trail_Making_Task \n", - "185860 Initiation Initiation " - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# data\n", - "\n", - "import pandas as pd\n", - "\n", - "DATA_FRACTION = .01\n", - "PUBMED = pd.read_csv('../cogtext/data/pubmed_abstracts_preprocessed.csv.gz')\n", - "PUBMED['label'] = PUBMED['subcategory']\n", - "PUBMED = PUBMED.groupby('label').sample(frac=DATA_FRACTION)\n", - "PUBMED = PUBMED.dropna(subset=['abstract'])\n", - "\n", - "PUBMED.sample(3)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs = PUBMED.abstract.to_list()\n", - "\n", - "corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer())\n", - "corpus.process(docs)\n", - "phrases = 
corpus.extract_ngrams(min_cf=20, min_df=5, max_len=10, max_cand=1000, normalized=True)\n", - "corpus.concat_ngrams(phrases, delimiter='_')\n", - "corpus[2]" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Removed top words: ['task', 'patient', 'effect', 'test', 'attention', 'memory', 'group', 'cognitive', 'control', 'performance', 'measure', 'response', 'associate', 'child', 'increase', 'result', 'change', 'relate', 'participant', 'stimulus']\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "training: 100%|██████████| 50/50 [00:14<00:00, 3.53it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "| LDAModel (current version: 0.12.2)\n", - "| 3223 docs, 306091 words\n", - "| Total Vocabs: 16648, Used Vocabs: 4511\n", - "| Entropy of words: 7.72616\n", - "| Entropy of term-weighted words: 7.72616\n", - "| Removed Vocabs: task patient effect test attention memory group cognitive control performance measure response associate child increase result change relate participant stimulus\n", - "|\n", - "\n", - "| Iterations: 1000, Burn-in steps: 0\n", - "| Optimization Interval: 10\n", - "| Log-likelihood per word: -7.66864\n", - "|\n", - "\n", - "| tw: TermWeight.ONE\n", - "| min_cf: 10 (minimum collection frequency of words)\n", - "| min_df: 5 (minimum document frequency of words)\n", - "| rm_top: 20 (the number of top words to be removed)\n", - "| k: 100 (the number of topics between 1 ~ 32767)\n", - "| alpha: [0.1] (hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetric prior and as a list with length `k` of `float` in case of asymmetric prior.)\n", - "| eta: 0.01 (hyperparameter of Dirichlet distribution for topic-word)\n", - "| seed: 3272832591 (random seed)\n", - "| trained in version 0.12.2\n", - "|\n", - "\n", - "| alpha (Dirichlet prior on the 
per-document topic distributions)\n", - "| [0.04364107 0.01068732 0.02508776 0.03441986 0.01844739 0.07034776\n", - "| 0.03123164 0.01434407 0.04665712 0.16142593 0.02718705 0.06977499\n", - "| 0.03755865 0.07443139 0.09644692 0.02221145 0.03520738 0.02862341\n", - "| 0.03875916 0.00740024 0.09041092 0.01950175 0.05477431 0.19435379\n", - "| 0.0582406 0.03943058 0.01880877 0.04164614 0.01150252 0.01499599\n", - "| 0.01659403 0.01239366 0.0205057 0.10825807 0.0132186 0.02321242\n", - "| 0.03782552 0.06962249 0.01563781 0.04341785 0.01747116 0.02702065\n", - "| 0.03356255 0.01622541 0.01230451 0.803505 0.03779493 0.03383209\n", - "| 0.09095947 0.19676776 0.01925627 0.01540721 0.04677161 0.05433749\n", - "| 0.01273535 0.01604453 0.02718877 0.04785496 0.04703167 0.02565646\n", - "| 0.02726313 0.01356411 0.17673622 0.01084241 0.01024968 0.02239775\n", - "| 0.03813412 0.02604754 0.023192 0.09352055 0.02686057 0.00997292\n", - "| 0.03537464 0.08052384 0.01368217 0.014366 0.03178743 0.01446226\n", - "| 0.02323432 0.07920136 0.02016016 0.01608375 0.0120501 0.02995271\n", - "| 0.05394103 0.03003465 0.01996591 0.03699729 0.0255818 0.04365746\n", - "| 0.01982757 0.04235343 0.04885878 0.03152624 0.0164482 0.0290758\n", - "| 0.04606192 0.02331102 0.0384533 0.04088598]\n", - "| eta (Dirichlet prior on the per-topic word distribution)\n", - "| 0.01\n", - "|\n", - "\n", - "| #0 (2986) : inhibition response_inhibition inhibitory_control stop impulsivity\n", - "| #1 (1184) : pd parkinson_disease pd_patient motor dbs\n", - "| #2 (1677) : motivation action effort reward cpt\n", - "| #3 (2314) : auditory visual perception sound sensory\n", - "| #4 (999) : r number inhibition s lie\n", - "| #5 (3681) : model base analysis different datum\n", - "| #6 (2372) : reward decision_making decision choice behavior\n", - "| #7 (1073) : injury tbi head traumatic_brain moral\n", - "| #8 (2837) : age old old_adult adult young\n", - "| #9 (7538) : use provide method tool datum\n", - "| #10 (3016) : 
neuron cell spike channel activity\n", - "| #11 (3482) : reaction_time trial accuracy slow fast\n", - "| #12 (2484) : lesion system nucleus dorsal pathway\n", - "| #13 (4177) : disease disorder onset clinical syndrome\n", - "| #14 (5125) : experiment model process information account\n", - "| #15 (1643) : surgery case cerebral surgical postoperative\n", - "| #16 (2470) : emotional emotion affective negative face\n", - "| #17 (2939) : adhd hyperactivity attention_deficit disorder symptom\n", - "| #18 (3154) : brain volume mri structural lesion\n", - "| #19 (856) : smoker nicotine smoking smoke craving\n", - "| #20 (5716) : p score subject assess scale\n", - "| #21 (1829) : mrna brain expression gut acid\n", - "| #22 (4467) : rat administration dose drug day\n", - "| #23 (12672) : function deficit executive executive_function impairment\n", - "| #24 (3082) : adolescent adult early development age\n", - "| #25 (2214) : attentional information threat bias processing\n", - "| #26 (1315) : sleep pain ds chronic fatigue\n", - "| #27 (3506) : schizophrenia symptom deficit psychosis patient_schizophrenia\n", - "| #28 (915) : epilepsy seizure hd sign tle\n", - "| #29 (1615) : mutation gene variant genetic protein\n", - "| #30 (1509) : use user cocaine drug cannabis\n", - "| #31 (1295) : white_matter fa integrity tract cp\n", - "| #32 (1200) : n hc compare non es\n", - "| #33 (4691) : high low individual level state\n", - "| #34 (1280) : food weight obesity bmi eat\n", - "| #35 (1365) : d c b intensity acute\n", - "| #36 (2645) : depression symptom mood depressive_symptom depressive\n", - "| #37 (3946) : social self people experience support\n", - "| #38 (1357) : tumor glioma treatment brain_tumor brain\n", - "| #39 (5041) : mouse receptor induce protein dopamine\n", - "| #40 (1395) : mindfulness meditation thought experience self\n", - "| #41 (1655) : stress psychological cope caregiver health\n", - "| #42 (1855) : condition dual_task walk gait experiment\n", - "| #43 (1269) 
: asd disorder autism td autistic\n", - "| #44 (1122) : speech suicide vwm noise suicidal\n", - "| #45 (37001) : finding suggest present find different\n", - "| #46 (2553) : motor movement hand action execution\n", - "| #47 (3329) : network connectivity functional brain functional_connectivity\n", - "| #48 (4707) : signal activity process event temporal\n", - "| #49 (11143) : research review approach provide process\n", - "| #50 (1386) : reading read processing sentence word\n", - "| #51 (881) : ms sequence category implicit explicit\n", - "| #52 (4167) : ef parent behavior problem family\n", - "| #53 (4081) : target cue location spatial attentional\n", - "| #54 (697) : figure bias ability matrix complex\n", - "| #55 (1464) : infant maternal mother birth prenatal\n", - "| #56 (2808) : ad dementia mci alzheimer_disease impairment\n", - "| #57 (2911) : object visual face novel processing\n", - "| #58 (2942) : recall retrieval episodic_memory item word\n", - "| #59 (1782) : conflict error trial monitoring congruent\n", - "| #60 (1890) : word interference stroop color present\n", - "| #61 (1189) : capacity functional cbt al domain\n", - "| #62 (10154) : year association score predict low\n", - "| #63 (1339) : genotype association polymorphism genetic gene\n", - "| #64 (766) : mdd apathy fc cortisol axis\n", - "| #65 (1730) : level plasma csf blood β\n", - "| #66 (2108) : learning learn feedback skill new\n", - "| #67 (2546) : eeg activity power theta alpha\n", - "| #68 (1367) : context cost switch cue repeat\n", - "| #69 (7060) : treatment intervention week improve month\n", - "| #70 (1886) : pfc activation dlpfc activity prefrontal_cortex\n", - "| #71 (750) : creativity creative pa thinking clozapine\n", - "| #72 (2488) : risk ci hiv prevalence use\n", - "| #73 (7708) : activation region cortex right area\n", - "| #74 (1017) : aggression social aggressive offender li\n", - "| #75 (1312) : alcohol drinking alcohol_use drink substance_use\n", - "| #76 (2292) : right 
left orientation body space\n", - "| #77 (1210) : rat da se l animal\n", - "| #78 (1888) : language word semantic bilingual fluency\n", - "| #79 (3829) : time delay duration interval day\n", - "| #80 (1177) : male female sex sexual woman\n", - "| #81 (1750) : induce aβ amyloid expression hippocampal\n", - "| #82 (906) : level t woman hormone testosterone\n", - "| #83 (1742) : shift flexibility ability cognitive_flexibility inhibition\n", - "| #84 (4638) : care need service health mental_health\n", - "| #85 (2135) : student school grade academic knowledge\n", - "| #86 (1847) : saccade target eye eye_movement gaze\n", - "| #87 (3539) : cell brain gene neuronal function\n", - "| #88 (1774) : stroke disorder ocd symptom cluster\n", - "| #89 (2929) : factor scale assess questionnaire validity\n", - "| #90 (1279) : target search item distractor visual_search\n", - "| #91 (3367) : amplitude erp potential event_relate component\n", - "| #92 (2958) : work_memory wm load information capacity\n", - "| #93 (2388) : training exercise session improvement train\n", - "| #94 (1379) : stimulation tdcs site nerve tms\n", - "| #95 (1722) : ii score iii sensitivity sample\n", - "| #96 (2905) : study identify include psychosocial review\n", - "| #97 (1866) : anxiety ptsd mood trauma negative\n", - "| #98 (2002) : strategy problem problem_solve skill reasoning\n", - "| #99 (2444) : area cortical pattern cortex spatial\n", - "|\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "\n", - "\n", - "mdl = tp.LDAModel(k=100, min_cf=10, min_df=5, rm_top=20, corpus=corpus)\n", - "# mdl = tp.CTModel(tw=tp.TermWeight.IDF, min_df=5, rm_top=20, k=30, corpus=corpus)\n", - "# mdl.num_beta_sample = 5\n", - "# mdl = tp.HDPModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5)\n", - "# mdl = tp.HLDAModel(tw=tp.TermWeight.ONE, min_df=10, depth=4, rm_top=10)\n", - "\n", - "mdl.train(0)\n", - "\n", - "print('Removed top words:', 
mdl.removed_top_words)\n", - "\n", - "\n", - "for i in tqdm(range(0, 1000, 20), 'training'):\n", - " mdl.train(20)\n", - "\n", - "mdl.summary()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Batches: 100%|██████████| 101/101 [04:01<00:00, 2.40s/it]\n", - "2021-10-14 11:12:34,988 - BERTopic - Transformed documents to Embeddings\n", - "2021-10-14 11:12:45,497 - BERTopic - Reduced dimensionality with UMAP\n", - "2021-10-14 11:12:46,054 - BERTopic - Clustered UMAP embeddings with HDBSCAN\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TopicCountName
0-11075-1_task_memory_attention_cognitive
101400_child_age_work memory_cognitive
211381_adhd_attention deficit hyperactivity_deficit...
321192_schizophrenia_cognitive_symptom_patient schi...
43873_neuron_rat_dopamine_inhibition
............
59581258_schizophrenia_receptor_nmdar_nmda
60591159_inhibition_stop signal_motor inhibition_res...
61601160_categorical perception_categorical_stimulus...
62611161_bilingual_bilingualism_monolingual_bilingua...
63621062_reward_stimulus_prosocial_attention
\n", - "

64 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " Topic Count Name\n", - "0 -1 1075 -1_task_memory_attention_cognitive\n", - "1 0 140 0_child_age_work memory_cognitive\n", - "2 1 138 1_adhd_attention deficit hyperactivity_deficit...\n", - "3 2 119 2_schizophrenia_cognitive_symptom_patient schi...\n", - "4 3 87 3_neuron_rat_dopamine_inhibition\n", - ".. ... ... ...\n", - "59 58 12 58_schizophrenia_receptor_nmdar_nmda\n", - "60 59 11 59_inhibition_stop signal_motor inhibition_res...\n", - "61 60 11 60_categorical perception_categorical_stimulus...\n", - "62 61 11 61_bilingual_bilingualism_monolingual_bilingua...\n", - "63 62 10 62_reward_stimulus_prosocial_attention\n", - "\n", - "[64 rows x 3 columns]" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "!pip install bertopic -Uq\n", - "\n", - "import pandas as pd\n", - "from bertopic import BERTopic\n", - "\n", - "DATA_FRACTION = .2\n", - "PUBMED = pd.read_csv('../cogtext/data/pubmed_abstracts.csv.gz')\n", - "PUBMED['label'] = PUBMED['subcategory']\n", - "PUBMED = PUBMED.groupby('label').sample(frac=DATA_FRACTION)\n", - "PUBMED = PUBMED.dropna(subset=['abstract'])\n", - "\n", - "model = BERTopic(verbose=True, n_gram_range=(1,3), calculate_probabilities=True)\n", - "topics, scores = model.fit_transform(PUBMED['abstracts'])\n", - "model.get_topic_info()" - ] - } - ], - "metadata": { - "interpreter": { - "hash": "266722041ed6426a0a88c0d75e9dd39659f44e3a6fea07300cd13bea36eb387d" - }, - "kernelspec": { - "display_name": "Python 3.9.7 64-bit ('py3': conda)", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tfp_coursera/topic_modeling_playground.ipynb 
b/tfp_coursera/topic_modeling_playground.ipynb deleted file mode 100644 index c4b88fb..0000000 --- a/tfp_coursera/topic_modeling_playground.ipynb +++ /dev/null @@ -1,572 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('4.1.2', '0.12.2')" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "!pip install gensim -Uq\n", - "!pip install tomotopy -Uq\n", - "import gensim\n", - "import tomotopy as tp\n", - "from tqdm import tqdm\n", - "\n", - "gensim.__version__, tp.__version__" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pmiddoiyearjournal_titlejournal_iso_abbreviationtitleabstractcategorysubcategorylabel
15152669080710.1016/j.neuroimage.2015.12.0142016NeuroImageNeuroimageWhen opportunity meets motivation: Neural enga...social reward process dopaminergic mediate bra...CognitiveConstructRewardProcessingRewardProcessing
3167383161041010.1016/j.psyneuen.2019.1044722020PsychoneuroendocrinologyPsychoneuroendocrinologyAssociation between sleep duration and executi...executive function define set cognitive skill ...CognitiveTaskTMT_-_Trail_Making_TaskTMT_-_Trail_Making_Task
1858602172053110.3389/fnevo.2011.000012011Frontiers in evolutionary neuroscienceFront Evol NeurosciCan we measure memes?meme fundamental unit cultural evolution leave...CognitiveConstructInitiationInitiation
\n", - "
" - ], - "text/plain": [ - " pmid doi year \\\n", - "1515 26690807 10.1016/j.neuroimage.2015.12.014 2016 \n", - "316738 31610410 10.1016/j.psyneuen.2019.104472 2020 \n", - "185860 21720531 10.3389/fnevo.2011.00001 2011 \n", - "\n", - " journal_title journal_iso_abbreviation \\\n", - "1515 NeuroImage Neuroimage \n", - "316738 Psychoneuroendocrinology Psychoneuroendocrinology \n", - "185860 Frontiers in evolutionary neuroscience Front Evol Neurosci \n", - "\n", - " title \\\n", - "1515 When opportunity meets motivation: Neural enga... \n", - "316738 Association between sleep duration and executi... \n", - "185860 Can we measure memes? \n", - "\n", - " abstract category \\\n", - "1515 social reward process dopaminergic mediate bra... CognitiveConstruct \n", - "316738 executive function define set cognitive skill ... CognitiveTask \n", - "185860 meme fundamental unit cultural evolution leave... CognitiveConstruct \n", - "\n", - " subcategory label \n", - "1515 RewardProcessing RewardProcessing \n", - "316738 TMT_-_Trail_Making_Task TMT_-_Trail_Making_Task \n", - "185860 Initiation Initiation " - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# data\n", - "\n", - "import pandas as pd\n", - "\n", - "DATA_FRACTION = .01\n", - "PUBMED = pd.read_csv('../cogtext/data/pubmed_abstracts_preprocessed.csv.gz')\n", - "PUBMED['label'] = PUBMED['subcategory']\n", - "PUBMED = PUBMED.groupby('label').sample(frac=DATA_FRACTION)\n", - "PUBMED = PUBMED.dropna(subset=['abstract'])\n", - "\n", - "PUBMED.sample(3)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs = PUBMED.abstract.to_list()\n", - "\n", - "corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer())\n", - "corpus.process(docs)\n", - "phrases = 
corpus.extract_ngrams(min_cf=20, min_df=5, max_len=10, max_cand=1000, normalized=True)\n", - "corpus.concat_ngrams(phrases, delimiter='_')\n", - "corpus[2]" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Removed top words: ['task', 'patient', 'effect', 'test', 'attention', 'memory', 'group', 'cognitive', 'control', 'performance', 'measure', 'response', 'associate', 'child', 'increase', 'result', 'change', 'relate', 'participant', 'stimulus']\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "training: 100%|██████████| 50/50 [00:14<00:00, 3.53it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "| LDAModel (current version: 0.12.2)\n", - "| 3223 docs, 306091 words\n", - "| Total Vocabs: 16648, Used Vocabs: 4511\n", - "| Entropy of words: 7.72616\n", - "| Entropy of term-weighted words: 7.72616\n", - "| Removed Vocabs: task patient effect test attention memory group cognitive control performance measure response associate child increase result change relate participant stimulus\n", - "|\n", - "\n", - "| Iterations: 1000, Burn-in steps: 0\n", - "| Optimization Interval: 10\n", - "| Log-likelihood per word: -7.66864\n", - "|\n", - "\n", - "| tw: TermWeight.ONE\n", - "| min_cf: 10 (minimum collection frequency of words)\n", - "| min_df: 5 (minimum document frequency of words)\n", - "| rm_top: 20 (the number of top words to be removed)\n", - "| k: 100 (the number of topics between 1 ~ 32767)\n", - "| alpha: [0.1] (hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetric prior and as a list with length `k` of `float` in case of asymmetric prior.)\n", - "| eta: 0.01 (hyperparameter of Dirichlet distribution for topic-word)\n", - "| seed: 3272832591 (random seed)\n", - "| trained in version 0.12.2\n", - "|\n", - "\n", - "| alpha (Dirichlet prior on the 
per-document topic distributions)\n", - "| [0.04364107 0.01068732 0.02508776 0.03441986 0.01844739 0.07034776\n", - "| 0.03123164 0.01434407 0.04665712 0.16142593 0.02718705 0.06977499\n", - "| 0.03755865 0.07443139 0.09644692 0.02221145 0.03520738 0.02862341\n", - "| 0.03875916 0.00740024 0.09041092 0.01950175 0.05477431 0.19435379\n", - "| 0.0582406 0.03943058 0.01880877 0.04164614 0.01150252 0.01499599\n", - "| 0.01659403 0.01239366 0.0205057 0.10825807 0.0132186 0.02321242\n", - "| 0.03782552 0.06962249 0.01563781 0.04341785 0.01747116 0.02702065\n", - "| 0.03356255 0.01622541 0.01230451 0.803505 0.03779493 0.03383209\n", - "| 0.09095947 0.19676776 0.01925627 0.01540721 0.04677161 0.05433749\n", - "| 0.01273535 0.01604453 0.02718877 0.04785496 0.04703167 0.02565646\n", - "| 0.02726313 0.01356411 0.17673622 0.01084241 0.01024968 0.02239775\n", - "| 0.03813412 0.02604754 0.023192 0.09352055 0.02686057 0.00997292\n", - "| 0.03537464 0.08052384 0.01368217 0.014366 0.03178743 0.01446226\n", - "| 0.02323432 0.07920136 0.02016016 0.01608375 0.0120501 0.02995271\n", - "| 0.05394103 0.03003465 0.01996591 0.03699729 0.0255818 0.04365746\n", - "| 0.01982757 0.04235343 0.04885878 0.03152624 0.0164482 0.0290758\n", - "| 0.04606192 0.02331102 0.0384533 0.04088598]\n", - "| eta (Dirichlet prior on the per-topic word distribution)\n", - "| 0.01\n", - "|\n", - "\n", - "| #0 (2986) : inhibition response_inhibition inhibitory_control stop impulsivity\n", - "| #1 (1184) : pd parkinson_disease pd_patient motor dbs\n", - "| #2 (1677) : motivation action effort reward cpt\n", - "| #3 (2314) : auditory visual perception sound sensory\n", - "| #4 (999) : r number inhibition s lie\n", - "| #5 (3681) : model base analysis different datum\n", - "| #6 (2372) : reward decision_making decision choice behavior\n", - "| #7 (1073) : injury tbi head traumatic_brain moral\n", - "| #8 (2837) : age old old_adult adult young\n", - "| #9 (7538) : use provide method tool datum\n", - "| #10 (3016) : 
neuron cell spike channel activity\n", - "| #11 (3482) : reaction_time trial accuracy slow fast\n", - "| #12 (2484) : lesion system nucleus dorsal pathway\n", - "| #13 (4177) : disease disorder onset clinical syndrome\n", - "| #14 (5125) : experiment model process information account\n", - "| #15 (1643) : surgery case cerebral surgical postoperative\n", - "| #16 (2470) : emotional emotion affective negative face\n", - "| #17 (2939) : adhd hyperactivity attention_deficit disorder symptom\n", - "| #18 (3154) : brain volume mri structural lesion\n", - "| #19 (856) : smoker nicotine smoking smoke craving\n", - "| #20 (5716) : p score subject assess scale\n", - "| #21 (1829) : mrna brain expression gut acid\n", - "| #22 (4467) : rat administration dose drug day\n", - "| #23 (12672) : function deficit executive executive_function impairment\n", - "| #24 (3082) : adolescent adult early development age\n", - "| #25 (2214) : attentional information threat bias processing\n", - "| #26 (1315) : sleep pain ds chronic fatigue\n", - "| #27 (3506) : schizophrenia symptom deficit psychosis patient_schizophrenia\n", - "| #28 (915) : epilepsy seizure hd sign tle\n", - "| #29 (1615) : mutation gene variant genetic protein\n", - "| #30 (1509) : use user cocaine drug cannabis\n", - "| #31 (1295) : white_matter fa integrity tract cp\n", - "| #32 (1200) : n hc compare non es\n", - "| #33 (4691) : high low individual level state\n", - "| #34 (1280) : food weight obesity bmi eat\n", - "| #35 (1365) : d c b intensity acute\n", - "| #36 (2645) : depression symptom mood depressive_symptom depressive\n", - "| #37 (3946) : social self people experience support\n", - "| #38 (1357) : tumor glioma treatment brain_tumor brain\n", - "| #39 (5041) : mouse receptor induce protein dopamine\n", - "| #40 (1395) : mindfulness meditation thought experience self\n", - "| #41 (1655) : stress psychological cope caregiver health\n", - "| #42 (1855) : condition dual_task walk gait experiment\n", - "| #43 (1269) 
: asd disorder autism td autistic\n", - "| #44 (1122) : speech suicide vwm noise suicidal\n", - "| #45 (37001) : finding suggest present find different\n", - "| #46 (2553) : motor movement hand action execution\n", - "| #47 (3329) : network connectivity functional brain functional_connectivity\n", - "| #48 (4707) : signal activity process event temporal\n", - "| #49 (11143) : research review approach provide process\n", - "| #50 (1386) : reading read processing sentence word\n", - "| #51 (881) : ms sequence category implicit explicit\n", - "| #52 (4167) : ef parent behavior problem family\n", - "| #53 (4081) : target cue location spatial attentional\n", - "| #54 (697) : figure bias ability matrix complex\n", - "| #55 (1464) : infant maternal mother birth prenatal\n", - "| #56 (2808) : ad dementia mci alzheimer_disease impairment\n", - "| #57 (2911) : object visual face novel processing\n", - "| #58 (2942) : recall retrieval episodic_memory item word\n", - "| #59 (1782) : conflict error trial monitoring congruent\n", - "| #60 (1890) : word interference stroop color present\n", - "| #61 (1189) : capacity functional cbt al domain\n", - "| #62 (10154) : year association score predict low\n", - "| #63 (1339) : genotype association polymorphism genetic gene\n", - "| #64 (766) : mdd apathy fc cortisol axis\n", - "| #65 (1730) : level plasma csf blood β\n", - "| #66 (2108) : learning learn feedback skill new\n", - "| #67 (2546) : eeg activity power theta alpha\n", - "| #68 (1367) : context cost switch cue repeat\n", - "| #69 (7060) : treatment intervention week improve month\n", - "| #70 (1886) : pfc activation dlpfc activity prefrontal_cortex\n", - "| #71 (750) : creativity creative pa thinking clozapine\n", - "| #72 (2488) : risk ci hiv prevalence use\n", - "| #73 (7708) : activation region cortex right area\n", - "| #74 (1017) : aggression social aggressive offender li\n", - "| #75 (1312) : alcohol drinking alcohol_use drink substance_use\n", - "| #76 (2292) : right 
left orientation body space\n", - "| #77 (1210) : rat da se l animal\n", - "| #78 (1888) : language word semantic bilingual fluency\n", - "| #79 (3829) : time delay duration interval day\n", - "| #80 (1177) : male female sex sexual woman\n", - "| #81 (1750) : induce aβ amyloid expression hippocampal\n", - "| #82 (906) : level t woman hormone testosterone\n", - "| #83 (1742) : shift flexibility ability cognitive_flexibility inhibition\n", - "| #84 (4638) : care need service health mental_health\n", - "| #85 (2135) : student school grade academic knowledge\n", - "| #86 (1847) : saccade target eye eye_movement gaze\n", - "| #87 (3539) : cell brain gene neuronal function\n", - "| #88 (1774) : stroke disorder ocd symptom cluster\n", - "| #89 (2929) : factor scale assess questionnaire validity\n", - "| #90 (1279) : target search item distractor visual_search\n", - "| #91 (3367) : amplitude erp potential event_relate component\n", - "| #92 (2958) : work_memory wm load information capacity\n", - "| #93 (2388) : training exercise session improvement train\n", - "| #94 (1379) : stimulation tdcs site nerve tms\n", - "| #95 (1722) : ii score iii sensitivity sample\n", - "| #96 (2905) : study identify include psychosocial review\n", - "| #97 (1866) : anxiety ptsd mood trauma negative\n", - "| #98 (2002) : strategy problem problem_solve skill reasoning\n", - "| #99 (2444) : area cortical pattern cortex spatial\n", - "|\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "\n", - "\n", - "mdl = tp.LDAModel(k=100, min_cf=10, min_df=5, rm_top=20, corpus=corpus)\n", - "# mdl = tp.CTModel(tw=tp.TermWeight.IDF, min_df=5, rm_top=20, k=30, corpus=corpus)\n", - "# mdl.num_beta_sample = 5\n", - "# mdl = tp.HDPModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5)\n", - "# mdl = tp.HLDAModel(tw=tp.TermWeight.ONE, min_df=10, depth=4, rm_top=10)\n", - "\n", - "mdl.train(0)\n", - "\n", - "print('Removed top words:', 
mdl.removed_top_words)\n", - "\n", - "\n", - "for i in tqdm(range(0, 1000, 20), 'training'):\n", - " mdl.train(20)\n", - "\n", - "mdl.summary()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Batches: 100%|██████████| 101/101 [04:01<00:00, 2.40s/it]\n", - "2021-10-14 11:12:34,988 - BERTopic - Transformed documents to Embeddings\n", - "2021-10-14 11:12:45,497 - BERTopic - Reduced dimensionality with UMAP\n", - "2021-10-14 11:12:46,054 - BERTopic - Clustered UMAP embeddings with HDBSCAN\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TopicCountName
0-11075-1_task_memory_attention_cognitive
101400_child_age_work memory_cognitive
211381_adhd_attention deficit hyperactivity_deficit...
321192_schizophrenia_cognitive_symptom_patient schi...
43873_neuron_rat_dopamine_inhibition
............
59581258_schizophrenia_receptor_nmdar_nmda
60591159_inhibition_stop signal_motor inhibition_res...
61601160_categorical perception_categorical_stimulus...
62611161_bilingual_bilingualism_monolingual_bilingua...
63621062_reward_stimulus_prosocial_attention
\n", - "

64 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " Topic Count Name\n", - "0 -1 1075 -1_task_memory_attention_cognitive\n", - "1 0 140 0_child_age_work memory_cognitive\n", - "2 1 138 1_adhd_attention deficit hyperactivity_deficit...\n", - "3 2 119 2_schizophrenia_cognitive_symptom_patient schi...\n", - "4 3 87 3_neuron_rat_dopamine_inhibition\n", - ".. ... ... ...\n", - "59 58 12 58_schizophrenia_receptor_nmdar_nmda\n", - "60 59 11 59_inhibition_stop signal_motor inhibition_res...\n", - "61 60 11 60_categorical perception_categorical_stimulus...\n", - "62 61 11 61_bilingual_bilingualism_monolingual_bilingua...\n", - "63 62 10 62_reward_stimulus_prosocial_attention\n", - "\n", - "[64 rows x 3 columns]" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "!pip install bertopic -Uq\n", - "\n", - "import pandas as pd\n", - "from bertopic import BERTopic\n", - "\n", - "DATA_FRACTION = .2\n", - "PUBMED = pd.read_csv('../cogtext/data/pubmed_abstracts.csv.gz')\n", - "PUBMED['label'] = PUBMED['subcategory']\n", - "PUBMED = PUBMED.groupby('label').sample(frac=DATA_FRACTION)\n", - "PUBMED = PUBMED.dropna(subset=['abstract'])\n", - "\n", - "model = BERTopic(verbose=True, n_gram_range=(1,3), calculate_probabilities=True)\n", - "topics, scores = model.fit_transform(PUBMED['abstracts'])\n", - "model.get_topic_info()" - ] - } - ], - "metadata": { - "interpreter": { - "hash": "266722041ed6426a0a88c0d75e9dd39659f44e3a6fea07300cd13bea36eb387d" - }, - "kernelspec": { - "display_name": "Python 3.9.7 64-bit ('py3': conda)", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/topic_modeling_playground.ipynb b/topic_modeling_playground.ipynb new 
file mode 100644 index 0000000..c4b88fb --- /dev/null +++ b/topic_modeling_playground.ipynb @@ -0,0 +1,572 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('4.1.2', '0.12.2')" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "!pip install gensim -Uq\n", + "!pip install tomotopy -Uq\n", + "import gensim\n", + "import tomotopy as tp\n", + "from tqdm import tqdm\n", + "\n", + "gensim.__version__, tp.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pmiddoiyearjournal_titlejournal_iso_abbreviationtitleabstractcategorysubcategorylabel
15152669080710.1016/j.neuroimage.2015.12.0142016NeuroImageNeuroimageWhen opportunity meets motivation: Neural enga...social reward process dopaminergic mediate bra...CognitiveConstructRewardProcessingRewardProcessing
3167383161041010.1016/j.psyneuen.2019.1044722020PsychoneuroendocrinologyPsychoneuroendocrinologyAssociation between sleep duration and executi...executive function define set cognitive skill ...CognitiveTaskTMT_-_Trail_Making_TaskTMT_-_Trail_Making_Task
1858602172053110.3389/fnevo.2011.000012011Frontiers in evolutionary neuroscienceFront Evol NeurosciCan we measure memes?meme fundamental unit cultural evolution leave...CognitiveConstructInitiationInitiation
\n", + "
" + ], + "text/plain": [ + " pmid doi year \\\n", + "1515 26690807 10.1016/j.neuroimage.2015.12.014 2016 \n", + "316738 31610410 10.1016/j.psyneuen.2019.104472 2020 \n", + "185860 21720531 10.3389/fnevo.2011.00001 2011 \n", + "\n", + " journal_title journal_iso_abbreviation \\\n", + "1515 NeuroImage Neuroimage \n", + "316738 Psychoneuroendocrinology Psychoneuroendocrinology \n", + "185860 Frontiers in evolutionary neuroscience Front Evol Neurosci \n", + "\n", + " title \\\n", + "1515 When opportunity meets motivation: Neural enga... \n", + "316738 Association between sleep duration and executi... \n", + "185860 Can we measure memes? \n", + "\n", + " abstract category \\\n", + "1515 social reward process dopaminergic mediate bra... CognitiveConstruct \n", + "316738 executive function define set cognitive skill ... CognitiveTask \n", + "185860 meme fundamental unit cultural evolution leave... CognitiveConstruct \n", + "\n", + " subcategory label \n", + "1515 RewardProcessing RewardProcessing \n", + "316738 TMT_-_Trail_Making_Task TMT_-_Trail_Making_Task \n", + "185860 Initiation Initiation " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# data\n", + "\n", + "import pandas as pd\n", + "\n", + "DATA_FRACTION = .01\n", + "PUBMED = pd.read_csv('../cogtext/data/pubmed_abstracts_preprocessed.csv.gz')\n", + "PUBMED['label'] = PUBMED['subcategory']\n", + "PUBMED = PUBMED.groupby('label').sample(frac=DATA_FRACTION)\n", + "PUBMED = PUBMED.dropna(subset=['abstract'])\n", + "\n", + "PUBMED.sample(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs = PUBMED.abstract.to_list()\n", + "\n", + "corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer())\n", + "corpus.process(docs)\n", + "phrases = 
corpus.extract_ngrams(min_cf=20, min_df=5, max_len=10, max_cand=1000, normalized=True)\n", + "corpus.concat_ngrams(phrases, delimiter='_')\n", + "corpus[2]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Removed top words: ['task', 'patient', 'effect', 'test', 'attention', 'memory', 'group', 'cognitive', 'control', 'performance', 'measure', 'response', 'associate', 'child', 'increase', 'result', 'change', 'relate', 'participant', 'stimulus']\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "training: 100%|██████████| 50/50 [00:14<00:00, 3.53it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "| LDAModel (current version: 0.12.2)\n", + "| 3223 docs, 306091 words\n", + "| Total Vocabs: 16648, Used Vocabs: 4511\n", + "| Entropy of words: 7.72616\n", + "| Entropy of term-weighted words: 7.72616\n", + "| Removed Vocabs: task patient effect test attention memory group cognitive control performance measure response associate child increase result change relate participant stimulus\n", + "|\n", + "\n", + "| Iterations: 1000, Burn-in steps: 0\n", + "| Optimization Interval: 10\n", + "| Log-likelihood per word: -7.66864\n", + "|\n", + "\n", + "| tw: TermWeight.ONE\n", + "| min_cf: 10 (minimum collection frequency of words)\n", + "| min_df: 5 (minimum document frequency of words)\n", + "| rm_top: 20 (the number of top words to be removed)\n", + "| k: 100 (the number of topics between 1 ~ 32767)\n", + "| alpha: [0.1] (hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetric prior and as a list with length `k` of `float` in case of asymmetric prior.)\n", + "| eta: 0.01 (hyperparameter of Dirichlet distribution for topic-word)\n", + "| seed: 3272832591 (random seed)\n", + "| trained in version 0.12.2\n", + "|\n", + "\n", + "| alpha (Dirichlet prior on the 
per-document topic distributions)\n", + "| [0.04364107 0.01068732 0.02508776 0.03441986 0.01844739 0.07034776\n", + "| 0.03123164 0.01434407 0.04665712 0.16142593 0.02718705 0.06977499\n", + "| 0.03755865 0.07443139 0.09644692 0.02221145 0.03520738 0.02862341\n", + "| 0.03875916 0.00740024 0.09041092 0.01950175 0.05477431 0.19435379\n", + "| 0.0582406 0.03943058 0.01880877 0.04164614 0.01150252 0.01499599\n", + "| 0.01659403 0.01239366 0.0205057 0.10825807 0.0132186 0.02321242\n", + "| 0.03782552 0.06962249 0.01563781 0.04341785 0.01747116 0.02702065\n", + "| 0.03356255 0.01622541 0.01230451 0.803505 0.03779493 0.03383209\n", + "| 0.09095947 0.19676776 0.01925627 0.01540721 0.04677161 0.05433749\n", + "| 0.01273535 0.01604453 0.02718877 0.04785496 0.04703167 0.02565646\n", + "| 0.02726313 0.01356411 0.17673622 0.01084241 0.01024968 0.02239775\n", + "| 0.03813412 0.02604754 0.023192 0.09352055 0.02686057 0.00997292\n", + "| 0.03537464 0.08052384 0.01368217 0.014366 0.03178743 0.01446226\n", + "| 0.02323432 0.07920136 0.02016016 0.01608375 0.0120501 0.02995271\n", + "| 0.05394103 0.03003465 0.01996591 0.03699729 0.0255818 0.04365746\n", + "| 0.01982757 0.04235343 0.04885878 0.03152624 0.0164482 0.0290758\n", + "| 0.04606192 0.02331102 0.0384533 0.04088598]\n", + "| eta (Dirichlet prior on the per-topic word distribution)\n", + "| 0.01\n", + "|\n", + "\n", + "| #0 (2986) : inhibition response_inhibition inhibitory_control stop impulsivity\n", + "| #1 (1184) : pd parkinson_disease pd_patient motor dbs\n", + "| #2 (1677) : motivation action effort reward cpt\n", + "| #3 (2314) : auditory visual perception sound sensory\n", + "| #4 (999) : r number inhibition s lie\n", + "| #5 (3681) : model base analysis different datum\n", + "| #6 (2372) : reward decision_making decision choice behavior\n", + "| #7 (1073) : injury tbi head traumatic_brain moral\n", + "| #8 (2837) : age old old_adult adult young\n", + "| #9 (7538) : use provide method tool datum\n", + "| #10 (3016) : 
neuron cell spike channel activity\n", + "| #11 (3482) : reaction_time trial accuracy slow fast\n", + "| #12 (2484) : lesion system nucleus dorsal pathway\n", + "| #13 (4177) : disease disorder onset clinical syndrome\n", + "| #14 (5125) : experiment model process information account\n", + "| #15 (1643) : surgery case cerebral surgical postoperative\n", + "| #16 (2470) : emotional emotion affective negative face\n", + "| #17 (2939) : adhd hyperactivity attention_deficit disorder symptom\n", + "| #18 (3154) : brain volume mri structural lesion\n", + "| #19 (856) : smoker nicotine smoking smoke craving\n", + "| #20 (5716) : p score subject assess scale\n", + "| #21 (1829) : mrna brain expression gut acid\n", + "| #22 (4467) : rat administration dose drug day\n", + "| #23 (12672) : function deficit executive executive_function impairment\n", + "| #24 (3082) : adolescent adult early development age\n", + "| #25 (2214) : attentional information threat bias processing\n", + "| #26 (1315) : sleep pain ds chronic fatigue\n", + "| #27 (3506) : schizophrenia symptom deficit psychosis patient_schizophrenia\n", + "| #28 (915) : epilepsy seizure hd sign tle\n", + "| #29 (1615) : mutation gene variant genetic protein\n", + "| #30 (1509) : use user cocaine drug cannabis\n", + "| #31 (1295) : white_matter fa integrity tract cp\n", + "| #32 (1200) : n hc compare non es\n", + "| #33 (4691) : high low individual level state\n", + "| #34 (1280) : food weight obesity bmi eat\n", + "| #35 (1365) : d c b intensity acute\n", + "| #36 (2645) : depression symptom mood depressive_symptom depressive\n", + "| #37 (3946) : social self people experience support\n", + "| #38 (1357) : tumor glioma treatment brain_tumor brain\n", + "| #39 (5041) : mouse receptor induce protein dopamine\n", + "| #40 (1395) : mindfulness meditation thought experience self\n", + "| #41 (1655) : stress psychological cope caregiver health\n", + "| #42 (1855) : condition dual_task walk gait experiment\n", + "| #43 (1269) 
: asd disorder autism td autistic\n", + "| #44 (1122) : speech suicide vwm noise suicidal\n", + "| #45 (37001) : finding suggest present find different\n", + "| #46 (2553) : motor movement hand action execution\n", + "| #47 (3329) : network connectivity functional brain functional_connectivity\n", + "| #48 (4707) : signal activity process event temporal\n", + "| #49 (11143) : research review approach provide process\n", + "| #50 (1386) : reading read processing sentence word\n", + "| #51 (881) : ms sequence category implicit explicit\n", + "| #52 (4167) : ef parent behavior problem family\n", + "| #53 (4081) : target cue location spatial attentional\n", + "| #54 (697) : figure bias ability matrix complex\n", + "| #55 (1464) : infant maternal mother birth prenatal\n", + "| #56 (2808) : ad dementia mci alzheimer_disease impairment\n", + "| #57 (2911) : object visual face novel processing\n", + "| #58 (2942) : recall retrieval episodic_memory item word\n", + "| #59 (1782) : conflict error trial monitoring congruent\n", + "| #60 (1890) : word interference stroop color present\n", + "| #61 (1189) : capacity functional cbt al domain\n", + "| #62 (10154) : year association score predict low\n", + "| #63 (1339) : genotype association polymorphism genetic gene\n", + "| #64 (766) : mdd apathy fc cortisol axis\n", + "| #65 (1730) : level plasma csf blood β\n", + "| #66 (2108) : learning learn feedback skill new\n", + "| #67 (2546) : eeg activity power theta alpha\n", + "| #68 (1367) : context cost switch cue repeat\n", + "| #69 (7060) : treatment intervention week improve month\n", + "| #70 (1886) : pfc activation dlpfc activity prefrontal_cortex\n", + "| #71 (750) : creativity creative pa thinking clozapine\n", + "| #72 (2488) : risk ci hiv prevalence use\n", + "| #73 (7708) : activation region cortex right area\n", + "| #74 (1017) : aggression social aggressive offender li\n", + "| #75 (1312) : alcohol drinking alcohol_use drink substance_use\n", + "| #76 (2292) : right 
left orientation body space\n", + "| #77 (1210) : rat da se l animal\n", + "| #78 (1888) : language word semantic bilingual fluency\n", + "| #79 (3829) : time delay duration interval day\n", + "| #80 (1177) : male female sex sexual woman\n", + "| #81 (1750) : induce aβ amyloid expression hippocampal\n", + "| #82 (906) : level t woman hormone testosterone\n", + "| #83 (1742) : shift flexibility ability cognitive_flexibility inhibition\n", + "| #84 (4638) : care need service health mental_health\n", + "| #85 (2135) : student school grade academic knowledge\n", + "| #86 (1847) : saccade target eye eye_movement gaze\n", + "| #87 (3539) : cell brain gene neuronal function\n", + "| #88 (1774) : stroke disorder ocd symptom cluster\n", + "| #89 (2929) : factor scale assess questionnaire validity\n", + "| #90 (1279) : target search item distractor visual_search\n", + "| #91 (3367) : amplitude erp potential event_relate component\n", + "| #92 (2958) : work_memory wm load information capacity\n", + "| #93 (2388) : training exercise session improvement train\n", + "| #94 (1379) : stimulation tdcs site nerve tms\n", + "| #95 (1722) : ii score iii sensitivity sample\n", + "| #96 (2905) : study identify include psychosocial review\n", + "| #97 (1866) : anxiety ptsd mood trauma negative\n", + "| #98 (2002) : strategy problem problem_solve skill reasoning\n", + "| #99 (2444) : area cortical pattern cortex spatial\n", + "|\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "\n", + "\n", + "mdl = tp.LDAModel(k=100, min_cf=10, min_df=5, rm_top=20, corpus=corpus)\n", + "# mdl = tp.CTModel(tw=tp.TermWeight.IDF, min_df=5, rm_top=20, k=30, corpus=corpus)\n", + "# mdl.num_beta_sample = 5\n", + "# mdl = tp.HDPModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5)\n", + "# mdl = tp.HLDAModel(tw=tp.TermWeight.ONE, min_df=10, depth=4, rm_top=10)\n", + "\n", + "mdl.train(0)\n", + "\n", + "print('Removed top words:', 
mdl.removed_top_words)\n", + "\n", + "\n", + "for i in tqdm(range(0, 1000, 20), 'training'):\n", + " mdl.train(20)\n", + "\n", + "mdl.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Batches: 100%|██████████| 101/101 [04:01<00:00, 2.40s/it]\n", + "2021-10-14 11:12:34,988 - BERTopic - Transformed documents to Embeddings\n", + "2021-10-14 11:12:45,497 - BERTopic - Reduced dimensionality with UMAP\n", + "2021-10-14 11:12:46,054 - BERTopic - Clustered UMAP embeddings with HDBSCAN\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicCountName
0-11075-1_task_memory_attention_cognitive
101400_child_age_work memory_cognitive
211381_adhd_attention deficit hyperactivity_deficit...
321192_schizophrenia_cognitive_symptom_patient schi...
43873_neuron_rat_dopamine_inhibition
............
59581258_schizophrenia_receptor_nmdar_nmda
60591159_inhibition_stop signal_motor inhibition_res...
61601160_categorical perception_categorical_stimulus...
62611161_bilingual_bilingualism_monolingual_bilingua...
63621062_reward_stimulus_prosocial_attention
\n", + "

64 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " Topic Count Name\n", + "0 -1 1075 -1_task_memory_attention_cognitive\n", + "1 0 140 0_child_age_work memory_cognitive\n", + "2 1 138 1_adhd_attention deficit hyperactivity_deficit...\n", + "3 2 119 2_schizophrenia_cognitive_symptom_patient schi...\n", + "4 3 87 3_neuron_rat_dopamine_inhibition\n", + ".. ... ... ...\n", + "59 58 12 58_schizophrenia_receptor_nmdar_nmda\n", + "60 59 11 59_inhibition_stop signal_motor inhibition_res...\n", + "61 60 11 60_categorical perception_categorical_stimulus...\n", + "62 61 11 61_bilingual_bilingualism_monolingual_bilingua...\n", + "63 62 10 62_reward_stimulus_prosocial_attention\n", + "\n", + "[64 rows x 3 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "!pip install bertopic -Uq\n", + "\n", + "import pandas as pd\n", + "from bertopic import BERTopic\n", + "\n", + "DATA_FRACTION = .2\n", + "PUBMED = pd.read_csv('../cogtext/data/pubmed_abstracts.csv.gz')\n", + "PUBMED['label'] = PUBMED['subcategory']\n", + "PUBMED = PUBMED.groupby('label').sample(frac=DATA_FRACTION)\n", + "PUBMED = PUBMED.dropna(subset=['abstract'])\n", + "\n", + "model = BERTopic(verbose=True, n_gram_range=(1,3), calculate_probabilities=True)\n", + "topics, scores = model.fit_transform(PUBMED['abstract'])\n", + "model.get_topic_info()" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "266722041ed6426a0a88c0d75e9dd39659f44e3a6fea07300cd13bea36eb387d" + }, + "kernelspec": { + "display_name": "Python 3.9.7 64-bit ('py3': conda)", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}