diff --git a/Dockerfile b/Dockerfile
index f2f87c9..7d929d8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,6 +13,8 @@ COPY api/api.py /app
 COPY api/pyproject.toml /app
 COPY api/poetry.lock /app
 COPY dataset/${dataset} /app
+COPY training/sql_tokenizer.py /app/
+COPY training/sql_tokenizer_vocab.json /app/
 COPY sqli_model/ /app/sqli_model/
 RUN pip install --disable-pip-version-check poetry
 RUN poetry install --no-root
diff --git a/api/api.py b/api/api.py
index 3feb28e..8f57dbc 100644
--- a/api/api.py
+++ b/api/api.py
@@ -1,9 +1,8 @@
 from flask import Flask, jsonify, request
 import tensorflow as tf
-from tensorflow.keras.preprocessing.sequence import pad_sequences
-from tensorflow.keras.preprocessing.text import Tokenizer
 import pandas as pd
 import os
+from sql_tokenizer import SQLTokenizer  # Import SQLTokenizer

 app = Flask(__name__)

@@ -12,11 +11,11 @@
 MAX_LEN = 100
 DATASET_PATH = os.getenv("DATASET_PATH", "dataset/sqli_dataset1.csv")
 MODEL_PATH = os.getenv("MODEL_PATH", "/app/sqli_model/3/")

-DATASET = pd.read_csv(DATASET_PATH)
-# Tokenizer setup
-TOKENIZER = Tokenizer(num_words=MAX_WORDS, filters="")
-TOKENIZER.fit_on_texts(DATASET["Query"])
+# Load dataset and initialize SQLTokenizer
+DATASET = pd.read_csv(DATASET_PATH)
+sql_tokenizer = SQLTokenizer(max_words=MAX_WORDS, max_len=MAX_LEN)
+sql_tokenizer.fit_on_texts(DATASET["Query"])  # Fit tokenizer on dataset

 # Load the model using tf.saved_model.load and get the serving signature
 loaded_model = tf.saved_model.load(MODEL_PATH)

@@ -26,9 +25,8 @@ def warm_up_model():
     """Sends a dummy request to the model to 'warm it up'."""
     dummy_query = "SELECT * FROM users WHERE id = 1"

-    query_seq = TOKENIZER.texts_to_sequences([dummy_query])
-    query_vec = pad_sequences(query_seq, maxlen=MAX_LEN)
-    input_tensor = tf.convert_to_tensor(query_vec, dtype=tf.float32)
+    query_seq = sql_tokenizer.texts_to_sequences([dummy_query])
+    input_tensor = tf.convert_to_tensor(query_seq, dtype=tf.float32)
     _ = model_predict(input_tensor)  # Make a dummy prediction to initialize the model
     print("Model warmed up and ready to serve requests.")

@@ -39,27 +37,26 @@ def predict():
         return jsonify({"error": "No query provided"}), 400

     try:
-        # Tokenize and pad the input query
+        # Tokenize and pad the input query using SQLTokenizer
         query = request.json["query"]
-        query_seq = TOKENIZER.texts_to_sequences([query])
-        query_vec = pad_sequences(query_seq, maxlen=MAX_LEN)
-
-        # Convert input to tensor
-        input_tensor = tf.convert_to_tensor(query_vec, dtype=tf.float32)
+        query_seq = sql_tokenizer.texts_to_sequences([query])
+        input_tensor = tf.convert_to_tensor(query_seq, dtype=tf.float32)

         # Use the loaded model's serving signature to make the prediction
         prediction = model_predict(input_tensor)

+        # Check for valid output and extract the result
         if "output_0" not in prediction or prediction["output_0"].get_shape() != [1, 1]:
             return jsonify({"error": "Invalid model output"}), 500

+        # Extract confidence and return the response
         return jsonify(
             {
                 "confidence": float("%.4f" % prediction["output_0"].numpy()[0][0]),
             }
         )

     except Exception as e:
-        # TODO: Log the error and return a proper error message
+        # Log the error and return a proper error message
         return jsonify({"error": str(e)}), 500
diff --git a/sqli_model/3/fingerprint.pb b/sqli_model/3/fingerprint.pb
index 0a5767e..606cf5b 100644
--- a/sqli_model/3/fingerprint.pb
+++ b/sqli_model/3/fingerprint.pb
@@ -1 +1 @@
-=ʒ月 Ϗ(2
\ No newline at end of file
+ℨ־鿶月 Ϗ(2
\ No newline at end of file
diff --git a/sqli_model/3/saved_model.pb b/sqli_model/3/saved_model.pb
index e199558..8d7e174 100644
Binary files a/sqli_model/3/saved_model.pb and b/sqli_model/3/saved_model.pb differ
diff --git a/sqli_model/3/variables/variables.data-00000-of-00001 b/sqli_model/3/variables/variables.data-00000-of-00001
index de702e4..b21ae7a 100644
Binary files a/sqli_model/3/variables/variables.data-00000-of-00001 and b/sqli_model/3/variables/variables.data-00000-of-00001 differ
diff --git a/sqli_model/3/variables/variables.index b/sqli_model/3/variables/variables.index
index 4ee74a8..d8e86d1 100644
Binary files a/sqli_model/3/variables/variables.index and b/sqli_model/3/variables/variables.index differ
diff --git a/training/sql_tokenizer.py b/training/sql_tokenizer.py
new file mode 100644
index 0000000..7c4a5e7
--- /dev/null
+++ b/training/sql_tokenizer.py
@@ -0,0 +1,44 @@
+# sql_tokenizer.py
+import re
+import json
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+
+class SQLTokenizer:
+    def __init__(self, max_words=10000, max_len=100):
+        self.max_words = max_words
+        self.max_len = max_len
+        self.token_index = {}
+
+    def tokenize(self, query):
+        # Define a regex pattern for SQL tokens (operators, punctuation, keywords)
+        pattern = r"[\w']+|[=>")
diff --git a/training/train_v3.py b/training/train_v3.py
--- a/training/train_v3.py
+++ b/training/train_v3.py
+    """Tokenize and pad text data using SQLTokenizer."""
+    tokenizer = SQLTokenizer(max_words=max_words)
     tokenizer.fit_on_texts(data["Query"])
     sequences = tokenizer.texts_to_sequences(data["Query"])
-    return pad_sequences(sequences, maxlen=max_len), tokenizer
+    return sequences, tokenizer


 def build_model(input_dim, output_dim=128):
@@ -97,10 +96,9 @@ def plot_history(history):
 if __name__ == "__main__":
     if len(sys.argv) != 3:
-        print("Usage: python train.py ")
+        print("Usage: python train_v3.py ")
         sys.exit(1)

-    # Constants
     MAX_WORDS = 10000
     MAX_LEN = 100
     EPOCHS = 50
@@ -108,8 +106,8 @@ def plot_history(history):
     # Load and preprocess data
     data = load_data(sys.argv[1])
-    X, tokenizer = preprocess_text(data)
-    y = data["Label"].values  # Convert to NumPy array to avoid KeyError in KFold
+    X, tokenizer = preprocess_text(data, max_words=MAX_WORDS)
+    y = data["Label"].values  # Convert to NumPy array for compatibility with KFold

     # Initialize cross-validation
     k_folds = 5
@@ -120,7 +118,7 @@ def plot_history(history):
     print(f"Training fold {fold}/{k_folds}")

     # Split the data
-    X_train, X_val = X[train_idx], X[val_idx]
+    X_train, X_val = np.array(X)[train_idx], np.array(X)[val_idx]
     y_train, y_val = y[train_idx], y[val_idx]

     # Compute class weights to handle imbalance
@@ -130,7 +128,7 @@ def plot_history(history):
     class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

     # Build and train the model
-    model = build_model(input_dim=len(tokenizer.word_index) + 1)
+    model = build_model(input_dim=len(tokenizer.token_index) + 1)
     early_stopping = EarlyStopping(
         monitor="val_loss", patience=5, restore_best_weights=True
     )
diff --git a/training/training_history.png b/training/training_history.png
index a016308..8d286b6 100644
Binary files a/training/training_history.png and b/training/training_history.png differ
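
For reference, below is a minimal sketch of a tokenizer that exposes the interface the rest of this diff relies on: fit_on_texts, texts_to_sequences returning padded, fixed-length sequences, and a token_index mapping used for the embedding input dimension. The regex, the unknown-token handling, and the vocab save/load helpers are assumptions for illustration, not the exact contents of the committed training/sql_tokenizer.py; the vocab helpers are included only because the Dockerfile copies training/sql_tokenizer_vocab.json.

# Minimal sketch (assumed implementation), matching the interface used by
# api/api.py and the training script in this diff.
import json
import re

from tensorflow.keras.preprocessing.sequence import pad_sequences


class SQLTokenizer:
    def __init__(self, max_words=10000, max_len=100):
        self.max_words = max_words
        self.max_len = max_len
        self.token_index = {}  # token -> integer id, built by fit_on_texts

    def tokenize(self, query):
        # Split into word-like tokens and single-character SQL operators/punctuation.
        # This pattern is an assumption; the committed file defines its own.
        pattern = r"[\w']+|[=<>!%*()+\-,;]"
        return re.findall(pattern, str(query).lower())

    def fit_on_texts(self, texts):
        # Assign ids in order of first occurrence, capped at max_words;
        # id 0 is reserved for padding and unknown tokens.
        for text in texts:
            for token in self.tokenize(text):
                if token not in self.token_index and len(self.token_index) < self.max_words:
                    self.token_index[token] = len(self.token_index) + 1

    def texts_to_sequences(self, texts):
        # Map tokens to ids (unknown -> 0) and pad/truncate to max_len so callers
        # can pass the result straight to tf.convert_to_tensor.
        sequences = [
            [self.token_index.get(token, 0) for token in self.tokenize(text)]
            for text in texts
        ]
        return pad_sequences(sequences, maxlen=self.max_len)

    def save_vocab(self, path="sql_tokenizer_vocab.json"):
        # Persist the learned vocabulary (assumed format).
        with open(path, "w") as f:
            json.dump(self.token_index, f)

    def load_vocab(self, path="sql_tokenizer_vocab.json"):
        # Restore a previously saved vocabulary (assumed format).
        with open(path) as f:
            self.token_index = json.load(f)

Padding inside texts_to_sequences is what lets api.py drop its direct pad_sequences call and feed the returned array straight to tf.convert_to_tensor, and token_index is what train_v3.py measures with len(tokenizer.token_index) + 1 when building the model's input dimension.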