{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 7.90kB/s]\n",
      "Downloading: 100%|██████████| 483/483 [00:00<00:00, 239kB/s]\n",
      "Downloading: 100%|██████████| 226k/226k [00:00<00:00, 329kB/s]\n",
      "Downloading: 100%|██████████| 455k/455k [00:01<00:00, 358kB/s]\n",
      "Downloading: 100%|██████████| 256M/256M [01:28<00:00, 3.02MB/s]\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
    "from transformers import TFDistilBertModel, DistilBertConfig\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')\n",
    "model = AutoModelForMaskedLM.from_pretrained('distilbert-base-uncased')"
   ]
  },
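  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity-check sketch (not part of the original notebook): the tokenizer and masked-LM model loaded above can be used to pull sentence embeddings by requesting the hidden states and mean-pooling the last layer over the non-padding tokens. The example sentence is an arbitrary placeholder."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "\n",
    "# Arbitrary example text\n",
    "text = 'Transformers map text to dense vector representations.'\n",
    "inputs = tokenizer(text, return_tensors='pt')\n",
    "\n",
    "with torch.no_grad():\n",
    "    outputs = model(**inputs, output_hidden_states=True)\n",
    "\n",
    "# Last-layer hidden states: (batch_size, sequence_length, hidden_size=768)\n",
    "last_hidden = outputs.hidden_states[-1]\n",
    "\n",
    "# Mean-pool over non-padding tokens to get a single 768-d sentence embedding\n",
    "mask = inputs['attention_mask'].unsqueeze(-1)\n",
    "embedding = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1)\n",
    "print(embedding.shape)  # torch.Size([1, 768])"
   ]
  },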
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import DistilBertTokenizerFast\n",
    "\n",
    "# Instantiate DistilBERT tokenizer...we use the Fast version to optimize runtime\n",
    "tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')\n",
    "\n",
    "# Define the maximum number of words to tokenize (DistilBERT can tokenize up to 512)\n",
    "MAX_LENGTH = 512\n",
    "BATCH_SIZE = 512\n",
    "\n",
    "# Define function to encode text data in batches\n",
    "def batch_encode(tokenizer, texts, batch_size=BATCH_SIZE, max_length=MAX_LENGTH):\n",
    "    \"\"\"\"\"\"\"\"\"\n",
    "    A function that encodes a batch of texts and returns the texts'\n",
    "    corresponding encodings and attention masks that are ready to be fed \n",
    "    into a pre-trained transformer model.\n",
    "    \n",
    "    Input:\n",
    "        - tokenizer:   Tokenizer object from the PreTrainedTokenizer Class\n",
    "        - texts:       List of strings where each string represents a text\n",
    "        - batch_size:  Integer controlling number of texts in a batch\n",
    "        - max_length:  Integer controlling max number of words to tokenize in a given text\n",
    "    Output:\n",
    "        - input_ids:       sequence of texts encoded as a tf.Tensor object\n",
    "        - attention_mask:  the texts' attention mask encoded as a tf.Tensor object\n",
    "    \"\"\"\"\"\"\"\"\"\n",
    "    \n",
    "    input_ids = []\n",
    "    attention_mask = []\n",
    "    \n",
    "    for i in range(0, len(texts), batch_size):\n",
    "        batch = texts[i:i+batch_size]\n",
    "        inputs = tokenizer.batch_encode_plus(batch,\n",
    "                                             max_length=max_length,\n",
    "                                             padding='longest', #implements dynamic padding\n",
    "                                             truncation=True,\n",
    "                                             return_attention_mask=True,\n",
    "                                             return_token_type_ids=False\n",
    "                                             )\n",
    "        input_ids.extend(inputs['input_ids'])\n",
    "        attention_mask.extend(inputs['attention_mask'])\n",
    "    \n",
    "    \n",
    "    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)\n",
    "    \n",
    "    \n",
    "# Encode X_train\n",
    "X_train_ids, X_train_attention = batch_encode(tokenizer, X_train.tolist())\n",
    "\n",
    "# Encode X_valid\n",
    "X_valid_ids, X_valid_attention = batch_encode(tokenizer, X_valid.tolist())\n",
    "\n",
    "# Encode X_test\n",
    "X_test_ids, X_test_attention = batch_encode(tokenizer, X_test.tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import TFDistilBertModel, DistilBertConfig\n",
    "\n",
    "DISTILBERT_DROPOUT = 0.2\n",
    "DISTILBERT_ATT_DROPOUT = 0.2\n",
    " \n",
    "# Configure DistilBERT's initialization\n",
    "config = DistilBertConfig(dropout=DISTILBERT_DROPOUT, \n",
    "                          attention_dropout=DISTILBERT_ATT_DROPOUT, \n",
    "                          output_hidden_states=True)\n",
    "                          \n",
    "# The bare, pre-trained DistilBERT transformer model outputting raw hidden-states \n",
    "# and without any specific head on top.\n",
    "distilBERT = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)\n",
    "\n",
    "# Make DistilBERT layers untrainable\n",
    "for layer in distilBERT.layers:\n",
    "    layer.trainable = False"
   ]
  },
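  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The training cells below compile the model with `focal_loss()`, which is never defined anywhere in this notebook. A minimal binary focal loss sketch is added here so those cells can run; the `gamma` and `alpha` defaults are assumptions, not values from the original."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "\n",
    "def focal_loss(gamma=2.0, alpha=0.25):\n",
    "    \"\"\"Minimal binary focal loss sketch (gamma and alpha defaults are assumed).\"\"\"\n",
    "    def loss_fn(y_true, y_pred):\n",
    "        # Flatten both tensors to shape (batch,) so elementwise ops line up\n",
    "        y_true = tf.reshape(tf.cast(y_true, tf.float32), [-1])\n",
    "        y_pred = tf.reshape(y_pred, [-1])\n",
    "        eps = tf.keras.backend.epsilon()\n",
    "        y_pred = tf.clip_by_value(y_pred, eps, 1.0 - eps)\n",
    "        # p_t is the predicted probability of the true class\n",
    "        p_t = tf.where(tf.equal(y_true, 1.0), y_pred, 1.0 - y_pred)\n",
    "        alpha_t = tf.where(tf.equal(y_true, 1.0), alpha, 1.0 - alpha)\n",
    "        # Down-weight well-classified examples by (1 - p_t)^gamma\n",
    "        return -tf.reduce_mean(alpha_t * tf.pow(1.0 - p_t, gamma) * tf.math.log(p_t))\n",
    "    return loss_fn"
   ]
  },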
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "MAX_LENGTH = 128\n",
    "LAYER_DROPOUT = 0.2\n",
    "LEARNING_RATE = 5e-5\n",
    "RANDOM_STATE = 42\n",
    "\n",
    "def build_model(transformer, max_length=MAX_LENGTH):\n",
    "    \"\"\"\"\"\"\"\"\"\n",
    "    Template for building a model off of the BERT or DistilBERT architecture\n",
    "    for a binary classification task.\n",
    "    \n",
    "    Input:\n",
    "      - transformer:  a base Hugging Face transformer model object (BERT or DistilBERT)\n",
    "                      with no added classification head attached.\n",
    "      - max_length:   integer controlling the maximum number of encoded tokens \n",
    "                      in a given sequence.\n",
    "    \n",
    "    Output:\n",
    "      - model:        a compiled tf.keras.Model with added classification layers \n",
    "                      on top of the base pre-trained model architecture.\n",
    "    \"\"\"\"\"\"\"\"\"\"\n",
    "    \n",
    "    # Define weight initializer with a random seed to ensure reproducibility\n",
    "    weight_initializer = tf.keras.initializers.GlorotNormal(seed=RANDOM_STATE) \n",
    "    \n",
    "    # Define input layers\n",
    "    input_ids_layer = tf.keras.layers.Input(shape=(max_length,), \n",
    "                                            name='input_ids', \n",
    "                                            dtype='int32')\n",
    "    input_attention_layer = tf.keras.layers.Input(shape=(max_length,), \n",
    "                                                  name='input_attention', \n",
    "                                                  dtype='int32')\n",
    "    \n",
    "    # DistilBERT outputs a tuple where the first element at index 0\n",
    "    # represents the hidden-state at the output of the model's last layer.\n",
    "    # It is a tf.Tensor of shape (batch_size, sequence_length, hidden_size=768).\n",
    "    last_hidden_state = transformer([input_ids_layer, input_attention_layer])[0]\n",
    "    \n",
    "    # We only care about DistilBERT's output for the [CLS] token, \n",
    "    # which is located at index 0 of every encoded sequence.  \n",
    "    # Splicing out the [CLS] tokens gives us 2D data.\n",
    "    cls_token = last_hidden_state[:, 0, :]\n",
    "    \n",
    "    ##                                                 ##\n",
    "    ## Define additional dropout and dense layers here ##\n",
    "    ##                                                 ##\n",
    "    \n",
    "    # Define a single node that makes up the output layer (for binary classification)\n",
    "    output = tf.keras.layers.Dense(1, \n",
    "                                   activation='sigmoid',\n",
    "                                   kernel_initializer=weight_initializer,  \n",
    "                                   kernel_constraint=None,\n",
    "                                   bias_initializer='zeros'\n",
    "                                   )(cls_token)\n",
    "    \n",
    "    # Define the model\n",
    "    model = tf.keras.Model([input_ids_layer, input_attention_layer], output)\n",
    "    \n",
    "    # Compile the model\n",
    "    model.compile(tf.keras.optimizers.Adam(lr=LEARNING_RATE), \n",
    "                  loss=focal_loss(),\n",
    "                  metrics=['accuracy'])\n",
    "    \n",
    "    return model"
   ]
  },
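  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The notebook never shows where `model` is created before `model.fit` is called below; presumably it is built from the frozen DistilBERT defined above. A sketch of that step:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build and compile the classifier on top of the frozen DistilBERT base\n",
    "model = build_model(distilBERT)\n",
    "model.summary()"
   ]
  },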
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EPOCHS = 6\n",
    "BATCH_SIZE = 64\n",
    "NUM_STEPS = len(X_train.index) // BATCH_SIZE\n",
    "\n",
    "# Train the model\n",
    "train_history1 = model.fit(\n",
    "    x = [X_train_ids, X_train_attention],\n",
    "    y = y_train.to_numpy(),\n",
    "    epochs = EPOCHS,\n",
    "    batch_size = BATCH_SIZE,\n",
    "    steps_per_epoch = NUM_STEPS,\n",
    "    validation_data = ([X_valid_ids, X_valid_attention], y_valid.to_numpy()),\n",
    "    verbose=2\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "FT_EPOCHS = 4\n",
    "BATCH_SIZE = 64\n",
    "NUM_STEPS = len(X_train.index)\n",
    "\n",
    "# Unfreeze distilBERT layers and make available for training\n",
    "for layer in distilBERT.layers:\n",
    "    layer.trainable = True\n",
    "    \n",
    "# Recompile model after unfreezing\n",
    "model.compile(optimizer=tf.keras.optimizers.Adam(lr=2e-5), \n",
    "              loss=focal_loss(),\n",
    "              metrics=['accuracy'])\n",
    "\n",
    "# Train the model\n",
    "train_history2 = model.fit(\n",
    "    x = [X_train_ids, X_train_attention],\n",
    "    y = y_train.to_numpy(),\n",
    "    epochs = FT_EPOCHS,\n",
    "    batch_size = BATCH_SIZE,\n",
    "    steps_per_epoch = NUM_STEPS,\n",
    "    validation_data = ([X_valid_ids, X_valid_attention], y_valid.to_numpy()),\n",
    "    verbose=2\n",
    ")"
   ]
  },
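  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The test-set encodings (`X_test_ids`, `X_test_attention`) produced earlier are never used. A sketch of a final held-out evaluation, assuming a `y_test` label Series exists alongside `X_test`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate the fine-tuned model on the held-out test set\n",
    "# (y_test is assumed to be a pandas Series of binary labels, like y_train and y_valid)\n",
    "test_loss, test_accuracy = model.evaluate([X_test_ids, X_test_attention],\n",
    "                                          y_test.to_numpy(),\n",
    "                                          batch_size=BATCH_SIZE,\n",
    "                                          verbose=0)\n",
    "print(f'Test loss: {test_loss:.4f} | Test accuracy: {test_accuracy:.4f}')"
   ]
  }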
 ],
 "metadata": {
  "interpreter": {
   "hash": "4d4c55ad0dd25f9ca95e4d49a929aa3f71bfb37020ae570a9996c3e164818202"
  },
  "kernelspec": {
   "display_name": "Python 3.9.7 64-bit ('py3': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}