Skip to content

Commit

Permalink
IA-3793 validate xlsform question names uniquess
Browse files Browse the repository at this point in the history
  • Loading branch information
mestachs committed Dec 18, 2024
1 parent e1d0797 commit da00c3e
Show file tree
Hide file tree
Showing 6 changed files with 1,307 additions and 1 deletion.
9 changes: 8 additions & 1 deletion iaso/api/form_versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from rest_framework.fields import Field

from iaso.models import Form, FormVersion, FeatureFlag, Project
from iaso.odk import parsing
from iaso.odk import parsing, validate_xls_form
from iaso.api.query_params import APP_ID
from .common import ModelViewSet, TimestampField, DynamicFieldsModelSerializer
from .forms import HasFormPermission
Expand Down Expand Up @@ -99,6 +99,13 @@ def validate(self, data: typing.MutableMapping):
# if update skip the rest of check
return data

validation_errors = validate_xls_form(data["xls_file"])

if len(validation_errors):
raise serializers.ValidationError(
{"xls_file": "Form id is invalid.", "xls_file_validation_errors": validation_errors}
)

# handle xls to xml conversion
try:
previous_form_version = FormVersion.objects.latest_version(form)
Expand Down
1 change: 1 addition & 0 deletions iaso/odk/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .parsing import parse_xls_form, Survey, ParsingError, to_questions_by_name
from .validator import validate_xls_form
167 changes: 167 additions & 0 deletions iaso/odk/validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
import pandas as pd
import re


#
# Example usage
# original_expression = "${a} + ${b} + ${a}"
# print(get_formula_dependencies(original_expression)) => a b
def get_formula_dependencies(original_expression):
dependencies = []
expression = str(original_expression) # Ensure the expression is a string

# Regex to find all expressions like ${variable}
regex = r"\${(.*?)}"
variables_found = re.findall(regex, expression)

if not variables_found:
return dependencies

# Add each found variable to dependencies
for var in variables_found:
dependencies.append(var)

# Return unique records, but keep the order of insertion
return list(dict.fromkeys(dependencies))


def parse_sheet(excel_file, sheet_name):
if sheet_name not in excel_file.sheet_names:
return []

excel_data = excel_file.parse(sheet_name=sheet_name, keep_default_na=False)
excel_data = excel_data.reset_index()
excel_data.rename(columns={"index": "line_number"}, inplace=True)
rows = excel_data.to_dict(orient="records")

return rows


def is_end_group(question):
return question.get("type") == "end group" or question.get("type") == "end_group"


def is_end_repeat(question):
return question.get("type") == "end repeat" or question.get("type") == "end_repeat"


def is_select_one(question):
return question.get("type") and (
question.get("type").startswith("select_one ") or question.get("type").startswith("select one ")
)


def is_repeat_group(question):
return question.get("type") == "begin repeat" or question.get("type") == "begin_repeat"


def get_list_name(question):
q_type = question.get("type")
if "select_one" in q_type:
return q_type.split("select_one ")[1]
if "select one" in q_type:
return q_type.split("select one ")[1]
return None


def index_by_field(collection, field_name):
collection_by_name = {}

for q in collection:
if q.get(field_name):
collection_by_name.setdefault(q[field_name], []).append(q)

return collection_by_name


def validate_xls_form(xls_file):
excel_file = pd.ExcelFile(xls_file, engine="openpyxl")

question_rows = parse_sheet(excel_file, "survey")

choices_rows = parse_sheet(excel_file, "choices")

questions = [q for q in question_rows if q.get("type") and not is_end_group(q) and not is_end_repeat(q)]
choices = [c for c in choices_rows if c.get("list_name")]

validation_errors = []

questions_by_name = index_by_field(questions, "name")

choices_by_list_name = index_by_field(choices, "list_name")

has_some_repeat_groups = len([q for q in question_rows if is_repeat_group(q)]) > 0

for name, questions_list in questions_by_name.items():
if len(questions_list) > 1 and not has_some_repeat_groups:
# TODO : don't block if questions are in repeat groups / only if same repeat_groups
# see odk_instance_repeat_group_form.xlsx
# if too hard only do this validation for the moment it has no repeat_groups ?
validation_errors.append(
{
"message": "duplicated question name '" + name + "'",
"question": questions_list[0],
"questions_list": questions_list,
"sheet": "survey",
"severity": "error",
}
)
if " " in name:
validation_errors.append(
{
"message": "avoid blanks in question name '" + name + "'",
"question": questions_list[0],
"sheet": "survey",
"severity": "error",
}
)

select_one_questions = [q for q in questions if is_select_one(q)]

for select_one_q in select_one_questions:
list_name = get_list_name(select_one_q)
if list_name not in choices_by_list_name:
validation_errors.append(
{
"message": "choices not for '" + select_one_q["name"] + "' and list_name '" + list_name + "'",
"question": questions_list[0],
"sheet": "survey",
"severity": "error",
}
)

formula_columns = ["relevant", "calculation", "constraint", "required", "default", "trigger"]

for q in questions:
for col in formula_columns:
formula = q.get(col)
if formula:
dependencies = get_formula_dependencies(formula)
for dependency in dependencies:
if dependency not in questions_by_name:
validation_errors.append(
{
"message": f"unknown reference to '{dependency}' in '{q['name']}' {col} = '{formula}'",
"question": q,
"sheet": "survey",
"severity": "error",
}
)

# TODO more advanced validations
# - choices
# - name : no blanks
# - calculate should have a calculation
# - formulas : see "relevant","calculation", "constraint", "required", "default", "trigger" columns
# - [x] formula's should used "known" question names (migth be done with string tokenizer)
# - [ ] formula's should used "known" functions (harder too, need ast to be acurate)
# - [ ] formula's should be valid (harder, need ast)
# - notes should contain valid reference to question's name
# - non matching "begin group" "end group"
# - non matching "begin repeat group" "end repeat group"
# - non matching nested groups or repeat groups
# - limit types to known/supported one ?
# - select_multiple verify list_name
# - select_*_from_file ??

return validation_errors
Binary file added iaso/tests/fixtures/odk_invalid_xlsform.xlsx
Binary file not shown.
Loading

0 comments on commit da00c3e

Please sign in to comment.