{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 7.90kB/s]\n", "Downloading: 100%|██████████| 483/483 [00:00<00:00, 239kB/s]\n", "Downloading: 100%|██████████| 226k/226k [00:00<00:00, 329kB/s]\n", "Downloading: 100%|██████████| 455k/455k [00:01<00:00, 358kB/s]\n", "Downloading: 100%|██████████| 256M/256M [01:28<00:00, 3.02MB/s]\n" ] } ], "source": [ "from transformers import AutoTokenizer, AutoModelForMaskedLM\n", "from transformers import TFDistilBertModel, DistilBertConfig\n", "\n", "tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')\n", "model = AutoModelForMaskedLM.from_pretrained('distilbert-base-uncased')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import DistilBertTokenizerFast\n", "\n", "# Instantiate DistilBERT tokenizer...we use the Fast version to optimize runtime\n", "tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')\n", "\n", "# Define the maximum number of words to tokenize (DistilBERT can tokenize up to 512)\n", "MAX_LENGTH = 512\n", "BATCH_SIZE = 512\n", "\n", "# Define function to encode text data in batches\n", "def batch_encode(tokenizer, texts, batch_size=BATCH_SIZE, max_length=MAX_LENGTH):\n", " \"\"\"\"\"\"\"\"\"\n", " A function that encodes a batch of texts and returns the texts'\n", " corresponding encodings and attention masks that are ready to be fed \n", " into a pre-trained transformer model.\n", " \n", " Input:\n", " - tokenizer: Tokenizer object from the PreTrainedTokenizer Class\n", " - texts: List of strings where each string represents a text\n", " - batch_size: Integer controlling number of texts in a batch\n", " - max_length: Integer controlling max number of words to tokenize in a given text\n", " Output:\n", " - input_ids: sequence of texts encoded as a tf.Tensor object\n", " - attention_mask: the texts' attention mask encoded as a tf.Tensor object\n", " \"\"\"\"\"\"\"\"\"\n", " \n", " input_ids = []\n", " attention_mask = []\n", " \n", " for i in range(0, len(texts), batch_size):\n", " batch = texts[i:i+batch_size]\n", " inputs = tokenizer.batch_encode_plus(batch,\n", " max_length=max_length,\n", " padding='longest', #implements dynamic padding\n", " truncation=True,\n", " return_attention_mask=True,\n", " return_token_type_ids=False\n", " )\n", " input_ids.extend(inputs['input_ids'])\n", " attention_mask.extend(inputs['attention_mask'])\n", " \n", " \n", " return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)\n", " \n", " \n", "# Encode X_train\n", "X_train_ids, X_train_attention = batch_encode(tokenizer, X_train.tolist())\n", "\n", "# Encode X_valid\n", "X_valid_ids, X_valid_attention = batch_encode(tokenizer, X_valid.tolist())\n", "\n", "# Encode X_test\n", "X_test_ids, X_test_attention = batch_encode(tokenizer, X_test.tolist())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import TFDistilBertModel, DistilBertConfig\n", "\n", "DISTILBERT_DROPOUT = 0.2\n", "DISTILBERT_ATT_DROPOUT = 0.2\n", " \n", "# Configure DistilBERT's initialization\n", "config = DistilBertConfig(dropout=DISTILBERT_DROPOUT, \n", " attention_dropout=DISTILBERT_ATT_DROPOUT, \n", " output_hidden_states=True)\n", " \n", "# The bare, pre-trained DistilBERT transformer model outputting raw hidden-states \n", "# and without any specific head on top.\n", "distilBERT = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)\n", "\n", "# Make DistilBERT layers untrainable\n", "for layer in distilBERT.layers:\n", " layer.trainable = False" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "MAX_LENGTH = 128\n", "LAYER_DROPOUT = 0.2\n", "LEARNING_RATE = 5e-5\n", "RANDOM_STATE = 42\n", "\n", "def build_model(transformer, max_length=MAX_LENGTH):\n", " \"\"\"\"\"\"\"\"\"\n", " Template for building a model off of the BERT or DistilBERT architecture\n", " for a binary classification task.\n", " \n", " Input:\n", " - transformer: a base Hugging Face transformer model object (BERT or DistilBERT)\n", " with no added classification head attached.\n", " - max_length: integer controlling the maximum number of encoded tokens \n", " in a given sequence.\n", " \n", " Output:\n", " - model: a compiled tf.keras.Model with added classification layers \n", " on top of the base pre-trained model architecture.\n", " \"\"\"\"\"\"\"\"\"\"\n", " \n", " # Define weight initializer with a random seed to ensure reproducibility\n", " weight_initializer = tf.keras.initializers.GlorotNormal(seed=RANDOM_STATE) \n", " \n", " # Define input layers\n", " input_ids_layer = tf.keras.layers.Input(shape=(max_length,), \n", " name='input_ids', \n", " dtype='int32')\n", " input_attention_layer = tf.keras.layers.Input(shape=(max_length,), \n", " name='input_attention', \n", " dtype='int32')\n", " \n", " # DistilBERT outputs a tuple where the first element at index 0\n", " # represents the hidden-state at the output of the model's last layer.\n", " # It is a tf.Tensor of shape (batch_size, sequence_length, hidden_size=768).\n", " last_hidden_state = transformer([input_ids_layer, input_attention_layer])[0]\n", " \n", " # We only care about DistilBERT's output for the [CLS] token, \n", " # which is located at index 0 of every encoded sequence. \n", " # Splicing out the [CLS] tokens gives us 2D data.\n", " cls_token = last_hidden_state[:, 0, :]\n", " \n", " ## ##\n", " ## Define additional dropout and dense layers here ##\n", " ## ##\n", " \n", " # Define a single node that makes up the output layer (for binary classification)\n", " output = tf.keras.layers.Dense(1, \n", " activation='sigmoid',\n", " kernel_initializer=weight_initializer, \n", " kernel_constraint=None,\n", " bias_initializer='zeros'\n", " )(cls_token)\n", " \n", " # Define the model\n", " model = tf.keras.Model([input_ids_layer, input_attention_layer], output)\n", " \n", " # Compile the model\n", " model.compile(tf.keras.optimizers.Adam(lr=LEARNING_RATE), \n", " loss=focal_loss(),\n", " metrics=['accuracy'])\n", " \n", " return model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "EPOCHS = 6\n", "BATCH_SIZE = 64\n", "NUM_STEPS = len(X_train.index) // BATCH_SIZE\n", "\n", "# Train the model\n", "train_history1 = model.fit(\n", " x = [X_train_ids, X_train_attention],\n", " y = y_train.to_numpy(),\n", " epochs = EPOCHS,\n", " batch_size = BATCH_SIZE,\n", " steps_per_epoch = NUM_STEPS,\n", " validation_data = ([X_valid_ids, X_valid_attention], y_valid.to_numpy()),\n", " verbose=2\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "FT_EPOCHS = 4\n", "BATCH_SIZE = 64\n", "NUM_STEPS = len(X_train.index)\n", "\n", "# Unfreeze distilBERT layers and make available for training\n", "for layer in distilBERT.layers:\n", " layer.trainable = True\n", " \n", "# Recompile model after unfreezing\n", "model.compile(optimizer=tf.keras.optimizers.Adam(lr=2e-5), \n", " loss=focal_loss(),\n", " metrics=['accuracy'])\n", "\n", "# Train the model\n", "train_history2 = model.fit(\n", " x = [X_train_ids, X_train_attention],\n", " y = y_train.to_numpy(),\n", " epochs = FT_EPOCHS,\n", " batch_size = BATCH_SIZE,\n", " steps_per_epoch = NUM_STEPS,\n", " validation_data = ([X_valid_ids, X_valid_attention], y_valid.to_numpy()),\n", " verbose=2\n", ")" ] } ], "metadata": { "interpreter": { "hash": "4d4c55ad0dd25f9ca95e4d49a929aa3f71bfb37020ae570a9996c3e164818202" }, "kernelspec": { "display_name": "Python 3.9.7 64-bit ('py3': conda)", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }