Skip to content

Commit

Permalink
Use a custom SQL tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
mostafa committed Oct 28, 2024
1 parent 95f2f41 commit 096859a
Show file tree
Hide file tree
Showing 11 changed files with 100 additions and 37 deletions.
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ COPY api/api.py /app
COPY api/pyproject.toml /app
COPY api/poetry.lock /app
COPY dataset/${dataset} /app
COPY training/sql_tokenizer.py /app/
COPY training/sql_tokenizer_vocab.json /app/
COPY sqli_model/ /app/sqli_model/
RUN pip install --disable-pip-version-check poetry
RUN poetry install --no-root
Expand Down
29 changes: 13 additions & 16 deletions api/api.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from flask import Flask, jsonify, request
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
import os
from sql_tokenizer import SQLTokenizer # Import SQLTokenizer

app = Flask(__name__)

Expand All @@ -12,11 +11,11 @@
MAX_LEN = 100
DATASET_PATH = os.getenv("DATASET_PATH", "dataset/sqli_dataset1.csv")
MODEL_PATH = os.getenv("MODEL_PATH", "/app/sqli_model/3/")
DATASET = pd.read_csv(DATASET_PATH)

# Tokenizer setup
TOKENIZER = Tokenizer(num_words=MAX_WORDS, filters="")
TOKENIZER.fit_on_texts(DATASET["Query"])
# Load dataset and initialize SQLTokenizer
DATASET = pd.read_csv(DATASET_PATH)
sql_tokenizer = SQLTokenizer(max_words=MAX_WORDS, max_len=MAX_LEN)
sql_tokenizer.fit_on_texts(DATASET["Query"]) # Fit tokenizer on dataset

# Load the model using tf.saved_model.load and get the serving signature
loaded_model = tf.saved_model.load(MODEL_PATH)
Expand All @@ -26,9 +25,8 @@
def warm_up_model():
"""Sends a dummy request to the model to 'warm it up'."""
dummy_query = "SELECT * FROM users WHERE id = 1"
query_seq = TOKENIZER.texts_to_sequences([dummy_query])
query_vec = pad_sequences(query_seq, maxlen=MAX_LEN)
input_tensor = tf.convert_to_tensor(query_vec, dtype=tf.float32)
query_seq = sql_tokenizer.texts_to_sequences([dummy_query])
input_tensor = tf.convert_to_tensor(query_seq, dtype=tf.float32)
_ = model_predict(input_tensor) # Make a dummy prediction to initialize the model
print("Model warmed up and ready to serve requests.")

Expand All @@ -39,27 +37,26 @@ def predict():
return jsonify({"error": "No query provided"}), 400

try:
# Tokenize and pad the input query
# Tokenize and pad the input query using SQLTokenizer
query = request.json["query"]
query_seq = TOKENIZER.texts_to_sequences([query])
query_vec = pad_sequences(query_seq, maxlen=MAX_LEN)

# Convert input to tensor
input_tensor = tf.convert_to_tensor(query_vec, dtype=tf.float32)
query_seq = sql_tokenizer.texts_to_sequences([query])
input_tensor = tf.convert_to_tensor(query_seq, dtype=tf.float32)

# Use the loaded model's serving signature to make the prediction
prediction = model_predict(input_tensor)

# Check for valid output and extract the result
if "output_0" not in prediction or prediction["output_0"].get_shape() != [1, 1]:
return jsonify({"error": "Invalid model output"}), 500

# Extract confidence and return the response
return jsonify(
{
"confidence": float("%.4f" % prediction["output_0"].numpy()[0][0]),
}
)
except Exception as e:
# TODO: Log the error and return a proper error message
# Log the error and return a proper error message
return jsonify({"error": str(e)}), 500


Expand Down
2 changes: 1 addition & 1 deletion sqli_model/3/fingerprint.pb
Original file line number Diff line number Diff line change
@@ -1 +1 @@
��������=�������ʒ���月�� �����Ϗ�(���������2
���־���������月�� �����Ϗ�(�������2
Binary file modified sqli_model/3/saved_model.pb
Binary file not shown.
Binary file modified sqli_model/3/variables/variables.data-00000-of-00001
Binary file not shown.
Binary file modified sqli_model/3/variables/variables.index
Binary file not shown.
44 changes: 44 additions & 0 deletions training/sql_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# sql_tokenizer.py
import re
import json
from tensorflow.keras.preprocessing.sequence import pad_sequences


class SQLTokenizer:
def __init__(self, max_words=10000, max_len=100):
self.max_words = max_words
self.max_len = max_len
self.token_index = {}

def tokenize(self, query):
# Define a regex pattern for SQL tokens (operators, punctuation, keywords)
pattern = r"[\w']+|[=><!]+|--|/\*|\*/|;|\(|\)|,|\*|\||\s+"
tokens = re.findall(pattern, query.lower())
return tokens

def fit_on_texts(self, queries):
# Build a token index based on the provided queries
all_tokens = set()
for query in queries:
tokens = self.tokenize(query)
all_tokens.update(tokens)
# Limit to max_words
all_tokens = list(all_tokens)[: self.max_words]
self.token_index = {token: i + 1 for i, token in enumerate(all_tokens)}

def texts_to_sequences(self, queries):
# Convert queries to sequences of token IDs
sequences = []
for query in queries:
tokens = self.tokenize(query)
sequence = [self.token_index.get(token, 0) for token in tokens]
sequences.append(sequence)
return pad_sequences(sequences, maxlen=self.max_len)

def save_token_index(self, filepath):
with open(filepath, "w") as f:
json.dump(self.token_index, f)

def load_token_index(self, filepath):
with open(filepath, "r") as f:
self.token_index = json.load(f)
1 change: 1 addition & 0 deletions training/sql_tokenizer_vocab.json

Large diffs are not rendered by default.

39 changes: 30 additions & 9 deletions training/test_train.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,32 @@
import os
import pandas as pd
import pytest
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import TFSMLayer

from sql_tokenizer import SQLTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer # For old tokenizer

MAX_WORDS = 10000
MAX_LEN = 100
TOKENIZER_VOCAB_PATH = "sql_tokenizer_vocab.json" # Path to saved vocabulary

MODELV1 = {
"dataset": "dataset/sqli_dataset1.csv",
"model_path": "sqli_model/1",
"index": 0,
"use_sql_tokenizer": False,
}
MODELV2 = {
"dataset": "dataset/sqli_dataset2.csv",
"model_path": "sqli_model/2",
"index": 1,
"use_sql_tokenizer": False,
}
MODELV3 = {
"dataset": "dataset/sqli_dataset2.csv",
"model_path": "sqli_model/3",
"index": 2,
"use_sql_tokenizer": True,
}


Expand All @@ -46,9 +52,23 @@ def model(request):
model_path = prefix + request.param["model_path"]
sqli_model = TFSMLayer(model_path, call_endpoint="serving_default")

# Tokenizer setup
tokenizer = Tokenizer(num_words=MAX_WORDS, filters="")
tokenizer.fit_on_texts(data["Query"])
# Select the appropriate tokenizer
if request.param["use_sql_tokenizer"]:
# Use SQLTokenizer for MODELV3
tokenizer = SQLTokenizer(max_words=MAX_WORDS, max_len=MAX_LEN)

# Load saved vocabulary if available
if os.path.exists(TOKENIZER_VOCAB_PATH):
tokenizer.load_token_index(TOKENIZER_VOCAB_PATH)
else:
tokenizer.fit_on_texts(data["Query"])
tokenizer.save_token_index(
TOKENIZER_VOCAB_PATH
) # Save for future consistency
else:
# Use the old Keras Tokenizer for MODELV1 and MODELV2
tokenizer = Tokenizer(num_words=MAX_WORDS, filters="")
tokenizer.fit_on_texts(data["Query"])

return {
"tokenizer": tokenizer,
Expand All @@ -60,10 +80,10 @@ def model(request):
@pytest.mark.parametrize(
"sample",
[
("select * from users where id=1 or 1=1;", [0.9202, 0.974, 0.0022]),
("select * from users where id='1' or 1=1--", [0.9202, 0.974, 0.0022]),
("select * from users where id=1 or 1=1;", [0.9202, 0.974, 0.3179]),
("select * from users where id='1' or 1=1--", [0.9202, 0.974, 0.3179]),
("select * from users", [0.00077, 0.0015, 0.0231]),
("select * from users where id=10000", [0.1483, 0.8893, 0.0008]),
("select * from users where id=10000", [0.1483, 0.8893, 0.7307]),
("select '1' union select 'a'; -- -'", [0.9999, 0.9732, 0.0139]),
(
"select '' union select 'malicious php code' \\g /var/www/test.php; -- -';",
Expand All @@ -76,7 +96,7 @@ def model(request):
],
)
def test_sqli_model(model, sample):
# Vectorize the sample
# Tokenize and pad the sample using the selected tokenizer
sample_seq = model["tokenizer"].texts_to_sequences([sample[0]])
sample_vec = pad_sequences(sample_seq, maxlen=MAX_LEN)

Expand All @@ -91,4 +111,5 @@ def test_sqli_model(model, sample):
f"Predicted: {predicted_value:.4f}, Expected: {sample[1][model['index']]:.4f}"
)

# Check that prediction matches expected value within tolerance
assert predicted_value == pytest.approx(sample[1][model["index"]], abs=0.05)
20 changes: 9 additions & 11 deletions training/train_v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
Bidirectional,
Expand All @@ -21,6 +19,7 @@
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import matplotlib.pyplot as plt
from sql_tokenizer import SQLTokenizer


def load_data(file_path):
Expand All @@ -33,11 +32,11 @@ def load_data(file_path):


def preprocess_text(data, max_words=10000, max_len=100):
"""Tokenize and pad text data."""
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
"""Tokenize and pad text data using SQLTokenizer."""
tokenizer = SQLTokenizer(max_words=max_words)
tokenizer.fit_on_texts(data["Query"])
sequences = tokenizer.texts_to_sequences(data["Query"])
return pad_sequences(sequences, maxlen=max_len), tokenizer
return sequences, tokenizer


def build_model(input_dim, output_dim=128):
Expand Down Expand Up @@ -97,19 +96,18 @@ def plot_history(history):

if __name__ == "__main__":
if len(sys.argv) != 3:
print("Usage: python train.py <input_file> <output_dir>")
print("Usage: python train_v3.py <input_file> <output_dir>")
sys.exit(1)

# Constants
MAX_WORDS = 10000
MAX_LEN = 100
EPOCHS = 50
BATCH_SIZE = 32

# Load and preprocess data
data = load_data(sys.argv[1])
X, tokenizer = preprocess_text(data)
y = data["Label"].values # Convert to NumPy array to avoid KeyError in KFold
X, tokenizer = preprocess_text(data, max_words=MAX_WORDS)
y = data["Label"].values # Convert to NumPy array for compatibility with KFold

# Initialize cross-validation
k_folds = 5
Expand All @@ -120,7 +118,7 @@ def plot_history(history):
print(f"Training fold {fold}/{k_folds}")

# Split the data
X_train, X_val = X[train_idx], X[val_idx]
X_train, X_val = np.array(X)[train_idx], np.array(X)[val_idx]
y_train, y_val = y[train_idx], y[val_idx]

# Compute class weights to handle imbalance
Expand All @@ -130,7 +128,7 @@ def plot_history(history):
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Build and train the model
model = build_model(input_dim=len(tokenizer.word_index) + 1)
model = build_model(input_dim=len(tokenizer.token_index) + 1)
early_stopping = EarlyStopping(
monitor="val_loss", patience=5, restore_best_weights=True
)
Expand Down
Binary file modified training/training_history.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 096859a

Please sign in to comment.