notebooks/topic_modeling_playground.ipynb at 4f7f0bb48fa4e15ea0667d54a3197974f8f31a70

Fork: 0
morteza / notebooks
Find file
Newer
Older
notebooks / topic_modeling_playground.ipynb
Morteza Ansarinia on 14 Oct 2021 24 KB move topic notebook to the root folder
Raw Blame History
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('4.1.2', '0.12.2')"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "!pip install gensim -Uq\n",
    "!pip install tomotopy -Uq\n",
    "import gensim\n",
    "import tomotopy as tp\n",
    "from tqdm import tqdm\n",
    "\n",
    "gensim.__version__, tp.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pmid</th>\n",
       "      <th>doi</th>\n",
       "      <th>year</th>\n",
       "      <th>journal_title</th>\n",
       "      <th>journal_iso_abbreviation</th>\n",
       "      <th>title</th>\n",
       "      <th>abstract</th>\n",
       "      <th>category</th>\n",
       "      <th>subcategory</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1515</th>\n",
       "      <td>26690807</td>\n",
       "      <td>10.1016/j.neuroimage.2015.12.014</td>\n",
       "      <td>2016</td>\n",
       "      <td>NeuroImage</td>\n",
       "      <td>Neuroimage</td>\n",
       "      <td>When opportunity meets motivation: Neural enga...</td>\n",
       "      <td>social reward process dopaminergic mediate bra...</td>\n",
       "      <td>CognitiveConstruct</td>\n",
       "      <td>RewardProcessing</td>\n",
       "      <td>RewardProcessing</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>316738</th>\n",
       "      <td>31610410</td>\n",
       "      <td>10.1016/j.psyneuen.2019.104472</td>\n",
       "      <td>2020</td>\n",
       "      <td>Psychoneuroendocrinology</td>\n",
       "      <td>Psychoneuroendocrinology</td>\n",
       "      <td>Association between sleep duration and executi...</td>\n",
       "      <td>executive function define set cognitive skill ...</td>\n",
       "      <td>CognitiveTask</td>\n",
       "      <td>TMT_-_Trail_Making_Task</td>\n",
       "      <td>TMT_-_Trail_Making_Task</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>185860</th>\n",
       "      <td>21720531</td>\n",
       "      <td>10.3389/fnevo.2011.00001</td>\n",
       "      <td>2011</td>\n",
       "      <td>Frontiers in evolutionary neuroscience</td>\n",
       "      <td>Front Evol Neurosci</td>\n",
       "      <td>Can we measure memes?</td>\n",
       "      <td>meme fundamental unit cultural evolution leave...</td>\n",
       "      <td>CognitiveConstruct</td>\n",
       "      <td>Initiation</td>\n",
       "      <td>Initiation</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            pmid                               doi  year  \\\n",
       "1515    26690807  10.1016/j.neuroimage.2015.12.014  2016   \n",
       "316738  31610410    10.1016/j.psyneuen.2019.104472  2020   \n",
       "185860  21720531          10.3389/fnevo.2011.00001  2011   \n",
       "\n",
       "                                 journal_title  journal_iso_abbreviation  \\\n",
       "1515                                NeuroImage                Neuroimage   \n",
       "316738                Psychoneuroendocrinology  Psychoneuroendocrinology   \n",
       "185860  Frontiers in evolutionary neuroscience       Front Evol Neurosci   \n",
       "\n",
       "                                                    title  \\\n",
       "1515    When opportunity meets motivation: Neural enga...   \n",
       "316738  Association between sleep duration and executi...   \n",
       "185860                              Can we measure memes?   \n",
       "\n",
       "                                                 abstract            category  \\\n",
       "1515    social reward process dopaminergic mediate bra...  CognitiveConstruct   \n",
       "316738  executive function define set cognitive skill ...       CognitiveTask   \n",
       "185860  meme fundamental unit cultural evolution leave...  CognitiveConstruct   \n",
       "\n",
       "                    subcategory                    label  \n",
       "1515           RewardProcessing         RewardProcessing  \n",
       "316738  TMT_-_Trail_Making_Task  TMT_-_Trail_Making_Task  \n",
       "185860               Initiation               Initiation  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# data\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "DATA_FRACTION = .01\n",
    "PUBMED = pd.read_csv('../cogtext/data/pubmed_abstracts_preprocessed.csv.gz')\n",
    "PUBMED['label'] = PUBMED['subcategory']\n",
    "PUBMED = PUBMED.groupby('label').sample(frac=DATA_FRACTION)\n",
    "PUBMED = PUBMED.dropna(subset=['abstract'])\n",
    "\n",
    "PUBMED.sample(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<tomotopy.Document with words=\"startle inhibition weak lead stimulus prepulse inhibition ppi study understand neurobiology information_processing patient community comparison_subject ccs ppi strong genetic basis infrahuman evidence heritability stability reliability human ppi gain increase use endophenotype identify vulnerability gene brain disorder include schizophrenia genetic study employ multiple geographically dispersed test site accommodate need large complex sample assess feasibility ppi multi site study site investigation multiple measure consortium genetic schizophrenia conduct methodological acoustic startle ppi ccs method manualize videotape standardize site intensive person training session equipment acquire program ppi site ucsd stringent quality assurance qa procedure testing complete ccs year primary startle dependent measure eyeblink startle magnitude habituation peak latency latency facilitation ppi analysis_identify significant variability site primary measure determined factor testing process subject characteristic influence number test measure qa procedure identify non standardized practice respect testing method procedural drift particularly relevant multi site study measure thorough oversight qa procedure measure acoustic startle ppi acquire reliably multiple testing site nonetheless site substantial expertise utilize psychophysiological measure multi site study startle ppi dependent measure require careful attention methodological procedure\">"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs = PUBMED.abstract.to_list()\n",
    "\n",
    "corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer())\n",
    "corpus.process(docs)\n",
    "phrases = corpus.extract_ngrams(min_cf=20, min_df=5, max_len=10, max_cand=1000, normalized=True)\n",
    "corpus.concat_ngrams(phrases, delimiter='_')\n",
    "corpus[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Removed top words: ['task', 'patient', 'effect', 'test', 'attention', 'memory', 'group', 'cognitive', 'control', 'performance', 'measure', 'response', 'associate', 'child', 'increase', 'result', 'change', 'relate', 'participant', 'stimulus']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "training: 100%|██████████| 50/50 [00:14<00:00,  3.53it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<Basic Info>\n",
      "| LDAModel (current version: 0.12.2)\n",
      "| 3223 docs, 306091 words\n",
      "| Total Vocabs: 16648, Used Vocabs: 4511\n",
      "| Entropy of words: 7.72616\n",
      "| Entropy of term-weighted words: 7.72616\n",
      "| Removed Vocabs: task patient effect test attention memory group cognitive control performance measure response associate child increase result change relate participant stimulus\n",
      "|\n",
      "<Training Info>\n",
      "| Iterations: 1000, Burn-in steps: 0\n",
      "| Optimization Interval: 10\n",
      "| Log-likelihood per word: -7.66864\n",
      "|\n",
      "<Initial Parameters>\n",
      "| tw: TermWeight.ONE\n",
      "| min_cf: 10 (minimum collection frequency of words)\n",
      "| min_df: 5 (minimum document frequency of words)\n",
      "| rm_top: 20 (the number of top words to be removed)\n",
      "| k: 100 (the number of topics between 1 ~ 32767)\n",
      "| alpha: [0.1] (hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetric prior and as a list with length `k` of `float` in case of asymmetric prior.)\n",
      "| eta: 0.01 (hyperparameter of Dirichlet distribution for topic-word)\n",
      "| seed: 3272832591 (random seed)\n",
      "| trained in version 0.12.2\n",
      "|\n",
      "<Parameters>\n",
      "| alpha (Dirichlet prior on the per-document topic distributions)\n",
      "|  [0.04364107 0.01068732 0.02508776 0.03441986 0.01844739 0.07034776\n",
      "|   0.03123164 0.01434407 0.04665712 0.16142593 0.02718705 0.06977499\n",
      "|   0.03755865 0.07443139 0.09644692 0.02221145 0.03520738 0.02862341\n",
      "|   0.03875916 0.00740024 0.09041092 0.01950175 0.05477431 0.19435379\n",
      "|   0.0582406  0.03943058 0.01880877 0.04164614 0.01150252 0.01499599\n",
      "|   0.01659403 0.01239366 0.0205057  0.10825807 0.0132186  0.02321242\n",
      "|   0.03782552 0.06962249 0.01563781 0.04341785 0.01747116 0.02702065\n",
      "|   0.03356255 0.01622541 0.01230451 0.803505   0.03779493 0.03383209\n",
      "|   0.09095947 0.19676776 0.01925627 0.01540721 0.04677161 0.05433749\n",
      "|   0.01273535 0.01604453 0.02718877 0.04785496 0.04703167 0.02565646\n",
      "|   0.02726313 0.01356411 0.17673622 0.01084241 0.01024968 0.02239775\n",
      "|   0.03813412 0.02604754 0.023192   0.09352055 0.02686057 0.00997292\n",
      "|   0.03537464 0.08052384 0.01368217 0.014366   0.03178743 0.01446226\n",
      "|   0.02323432 0.07920136 0.02016016 0.01608375 0.0120501  0.02995271\n",
      "|   0.05394103 0.03003465 0.01996591 0.03699729 0.0255818  0.04365746\n",
      "|   0.01982757 0.04235343 0.04885878 0.03152624 0.0164482  0.0290758\n",
      "|   0.04606192 0.02331102 0.0384533  0.04088598]\n",
      "| eta (Dirichlet prior on the per-topic word distribution)\n",
      "|  0.01\n",
      "|\n",
      "<Topics>\n",
      "| #0 (2986) : inhibition response_inhibition inhibitory_control stop impulsivity\n",
      "| #1 (1184) : pd parkinson_disease pd_patient motor dbs\n",
      "| #2 (1677) : motivation action effort reward cpt\n",
      "| #3 (2314) : auditory visual perception sound sensory\n",
      "| #4 (999) : r number inhibition s lie\n",
      "| #5 (3681) : model base analysis different datum\n",
      "| #6 (2372) : reward decision_making decision choice behavior\n",
      "| #7 (1073) : injury tbi head traumatic_brain moral\n",
      "| #8 (2837) : age old old_adult adult young\n",
      "| #9 (7538) : use provide method tool datum\n",
      "| #10 (3016) : neuron cell spike channel activity\n",
      "| #11 (3482) : reaction_time trial accuracy slow fast\n",
      "| #12 (2484) : lesion system nucleus dorsal pathway\n",
      "| #13 (4177) : disease disorder onset clinical syndrome\n",
      "| #14 (5125) : experiment model process information account\n",
      "| #15 (1643) : surgery case cerebral surgical postoperative\n",
      "| #16 (2470) : emotional emotion affective negative face\n",
      "| #17 (2939) : adhd hyperactivity attention_deficit disorder symptom\n",
      "| #18 (3154) : brain volume mri structural lesion\n",
      "| #19 (856) : smoker nicotine smoking smoke craving\n",
      "| #20 (5716) : p score subject assess scale\n",
      "| #21 (1829) : mrna brain expression gut acid\n",
      "| #22 (4467) : rat administration dose drug day\n",
      "| #23 (12672) : function deficit executive executive_function impairment\n",
      "| #24 (3082) : adolescent adult early development age\n",
      "| #25 (2214) : attentional information threat bias processing\n",
      "| #26 (1315) : sleep pain ds chronic fatigue\n",
      "| #27 (3506) : schizophrenia symptom deficit psychosis patient_schizophrenia\n",
      "| #28 (915) : epilepsy seizure hd sign tle\n",
      "| #29 (1615) : mutation gene variant genetic protein\n",
      "| #30 (1509) : use user cocaine drug cannabis\n",
      "| #31 (1295) : white_matter fa integrity tract cp\n",
      "| #32 (1200) : n hc compare non es\n",
      "| #33 (4691) : high low individual level state\n",
      "| #34 (1280) : food weight obesity bmi eat\n",
      "| #35 (1365) : d c b intensity acute\n",
      "| #36 (2645) : depression symptom mood depressive_symptom depressive\n",
      "| #37 (3946) : social self people experience support\n",
      "| #38 (1357) : tumor glioma treatment brain_tumor brain\n",
      "| #39 (5041) : mouse receptor induce protein dopamine\n",
      "| #40 (1395) : mindfulness meditation thought experience self\n",
      "| #41 (1655) : stress psychological cope caregiver health\n",
      "| #42 (1855) : condition dual_task walk gait experiment\n",
      "| #43 (1269) : asd disorder autism td autistic\n",
      "| #44 (1122) : speech suicide vwm noise suicidal\n",
      "| #45 (37001) : finding suggest present find different\n",
      "| #46 (2553) : motor movement hand action execution\n",
      "| #47 (3329) : network connectivity functional brain functional_connectivity\n",
      "| #48 (4707) : signal activity process event temporal\n",
      "| #49 (11143) : research review approach provide process\n",
      "| #50 (1386) : reading read processing sentence word\n",
      "| #51 (881) : ms sequence category implicit explicit\n",
      "| #52 (4167) : ef parent behavior problem family\n",
      "| #53 (4081) : target cue location spatial attentional\n",
      "| #54 (697) : figure bias ability matrix complex\n",
      "| #55 (1464) : infant maternal mother birth prenatal\n",
      "| #56 (2808) : ad dementia mci alzheimer_disease impairment\n",
      "| #57 (2911) : object visual face novel processing\n",
      "| #58 (2942) : recall retrieval episodic_memory item word\n",
      "| #59 (1782) : conflict error trial monitoring congruent\n",
      "| #60 (1890) : word interference stroop color present\n",
      "| #61 (1189) : capacity functional cbt al domain\n",
      "| #62 (10154) : year association score predict low\n",
      "| #63 (1339) : genotype association polymorphism genetic gene\n",
      "| #64 (766) : mdd apathy fc cortisol axis\n",
      "| #65 (1730) : level plasma csf blood β\n",
      "| #66 (2108) : learning learn feedback skill new\n",
      "| #67 (2546) : eeg activity power theta alpha\n",
      "| #68 (1367) : context cost switch cue repeat\n",
      "| #69 (7060) : treatment intervention week improve month\n",
      "| #70 (1886) : pfc activation dlpfc activity prefrontal_cortex\n",
      "| #71 (750) : creativity creative pa thinking clozapine\n",
      "| #72 (2488) : risk ci hiv prevalence use\n",
      "| #73 (7708) : activation region cortex right area\n",
      "| #74 (1017) : aggression social aggressive offender li\n",
      "| #75 (1312) : alcohol drinking alcohol_use drink substance_use\n",
      "| #76 (2292) : right left orientation body space\n",
      "| #77 (1210) : rat da se l animal\n",
      "| #78 (1888) : language word semantic bilingual fluency\n",
      "| #79 (3829) : time delay duration interval day\n",
      "| #80 (1177) : male female sex sexual woman\n",
      "| #81 (1750) : induce aβ amyloid expression hippocampal\n",
      "| #82 (906) : level t woman hormone testosterone\n",
      "| #83 (1742) : shift flexibility ability cognitive_flexibility inhibition\n",
      "| #84 (4638) : care need service health mental_health\n",
      "| #85 (2135) : student school grade academic knowledge\n",
      "| #86 (1847) : saccade target eye eye_movement gaze\n",
      "| #87 (3539) : cell brain gene neuronal function\n",
      "| #88 (1774) : stroke disorder ocd symptom cluster\n",
      "| #89 (2929) : factor scale assess questionnaire validity\n",
      "| #90 (1279) : target search item distractor visual_search\n",
      "| #91 (3367) : amplitude erp potential event_relate component\n",
      "| #92 (2958) : work_memory wm load information capacity\n",
      "| #93 (2388) : training exercise session improvement train\n",
      "| #94 (1379) : stimulation tdcs site nerve tms\n",
      "| #95 (1722) : ii score iii sensitivity sample\n",
      "| #96 (2905) : study identify include psychosocial review\n",
      "| #97 (1866) : anxiety ptsd mood trauma negative\n",
      "| #98 (2002) : strategy problem problem_solve skill reasoning\n",
      "| #99 (2444) : area cortical pattern cortex spatial\n",
      "|\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "\n",
    "\n",
    "mdl = tp.LDAModel(k=100, min_cf=10, min_df=5, rm_top=20, corpus=corpus)\n",
    "# mdl = tp.CTModel(tw=tp.TermWeight.IDF, min_df=5, rm_top=20, k=30, corpus=corpus)\n",
    "# mdl.num_beta_sample = 5\n",
    "# mdl = tp.HDPModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5)\n",
    "# mdl = tp.HLDAModel(tw=tp.TermWeight.ONE, min_df=10, depth=4, rm_top=10)\n",
    "\n",
    "mdl.train(0)\n",
    "\n",
    "print('Removed top words:', mdl.removed_top_words)\n",
    "\n",
    "\n",
    "for i in tqdm(range(0, 1000, 20), 'training'):\n",
    "  mdl.train(20)\n",
    "\n",
    "mdl.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Batches: 100%|██████████| 101/101 [04:01<00:00,  2.40s/it]\n",
      "2021-10-14 11:12:34,988 - BERTopic - Transformed documents to Embeddings\n",
      "2021-10-14 11:12:45,497 - BERTopic - Reduced dimensionality with UMAP\n",
      "2021-10-14 11:12:46,054 - BERTopic - Clustered UMAP embeddings with HDBSCAN\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Topic</th>\n",
       "      <th>Count</th>\n",
       "      <th>Name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-1</td>\n",
       "      <td>1075</td>\n",
       "      <td>-1_task_memory_attention_cognitive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>140</td>\n",
       "      <td>0_child_age_work memory_cognitive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>138</td>\n",
       "      <td>1_adhd_attention deficit hyperactivity_deficit...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>119</td>\n",
       "      <td>2_schizophrenia_cognitive_symptom_patient schi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3</td>\n",
       "      <td>87</td>\n",
       "      <td>3_neuron_rat_dopamine_inhibition</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>59</th>\n",
       "      <td>58</td>\n",
       "      <td>12</td>\n",
       "      <td>58_schizophrenia_receptor_nmdar_nmda</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60</th>\n",
       "      <td>59</td>\n",
       "      <td>11</td>\n",
       "      <td>59_inhibition_stop signal_motor inhibition_res...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61</th>\n",
       "      <td>60</td>\n",
       "      <td>11</td>\n",
       "      <td>60_categorical perception_categorical_stimulus...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>62</th>\n",
       "      <td>61</td>\n",
       "      <td>11</td>\n",
       "      <td>61_bilingual_bilingualism_monolingual_bilingua...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63</th>\n",
       "      <td>62</td>\n",
       "      <td>10</td>\n",
       "      <td>62_reward_stimulus_prosocial_attention</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>64 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    Topic  Count                                               Name\n",
       "0      -1   1075                 -1_task_memory_attention_cognitive\n",
       "1       0    140                  0_child_age_work memory_cognitive\n",
       "2       1    138  1_adhd_attention deficit hyperactivity_deficit...\n",
       "3       2    119  2_schizophrenia_cognitive_symptom_patient schi...\n",
       "4       3     87                   3_neuron_rat_dopamine_inhibition\n",
       "..    ...    ...                                                ...\n",
       "59     58     12               58_schizophrenia_receptor_nmdar_nmda\n",
       "60     59     11  59_inhibition_stop signal_motor inhibition_res...\n",
       "61     60     11  60_categorical perception_categorical_stimulus...\n",
       "62     61     11  61_bilingual_bilingualism_monolingual_bilingua...\n",
       "63     62     10             62_reward_stimulus_prosocial_attention\n",
       "\n",
       "[64 rows x 3 columns]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "!pip install bertopic -Uq\n",
    "\n",
    "import pandas as pd\n",
    "from bertopic import BERTopic\n",
    "\n",
    "DATA_FRACTION = .2\n",
    "PUBMED = pd.read_csv('../cogtext/data/pubmed_abstracts.csv.gz')\n",
    "PUBMED['label'] = PUBMED['subcategory']\n",
    "PUBMED = PUBMED.groupby('label').sample(frac=DATA_FRACTION)\n",
    "PUBMED = PUBMED.dropna(subset=['abstract'])\n",
    "\n",
    "model = BERTopic(verbose=True, n_gram_range=(1,3), calculate_probabilities=True)\n",
    "topics, scores = model.fit_transform(PUBMED['abstracts'])\n",
    "model.get_topic_info()"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "266722041ed6426a0a88c0d75e9dd39659f44e3a6fea07300cd13bea36eb387d"
  },
  "kernelspec": {
   "display_name": "Python 3.9.7 64-bit ('py3': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}