Skip to content

Commit

Permalink
Remove byte-order mark from JSON stream and first CSV chunk (#53)
Browse files Browse the repository at this point in the history
* Remove BOM from JSON stream

When JSON input is a readable stream, a byte-order mark may be present
at the beginning of the stream. When reading the first chunk of the
stream, check for the presence of the byte-order mark. Remove it if it
is present.

* Remove BOM when parsing CSV

When parsing a CSV from a source that includes a byte-order mark (BOM),
the BOM is present at the time the parser attempts to determine if the
first column name is quoted. When the BOM is present, the parser does
not recognize that the column is quoted, resulting in a failure to match
an expected column name. Remove the BOM (if present) from the first
chunk so that a quoted column name will be recognized and parsed as a
quoted value.

* Minor refactor to DRY up the BOM-removal logic


---------
  • Loading branch information
mint-thompson authored Aug 20, 2024
1 parent cac349f commit d20f463
Show file tree
Hide file tree
Showing 7 changed files with 258 additions and 2 deletions.
5 changes: 4 additions & 1 deletion src/csv.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import {
} from "./versions/common/csv.js"
import { CsvValidatorOneOne } from "./versions/1.1/csv.js"
import { CsvValidatorTwoZero } from "./versions/2.0/csv.js"
import { addErrorsToList } from "./utils.js"
import { addErrorsToList, removeBOM } from "./utils.js"

import Papa from "papaparse"

Expand Down Expand Up @@ -184,6 +184,9 @@ export async function validateCsv(
Papa.parse(input, {
header: false,
// chunkSize: 64 * 1024,
beforeFirstChunk: (chunk) => {
return removeBOM(chunk)
},
step: (row: Papa.ParseStepResult<string[]>, parser: Papa.Parser) => {
try {
handleParseStep(row, resolve, parser)
Expand Down
14 changes: 14 additions & 0 deletions src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,17 @@ export function addErrorsToList<T extends { warning?: boolean | undefined }>(

return counts
}

/**
 * Remove a leading byte-order mark (BOM) from a chunk of decoded text.
 *
 * A UTF-8 BOM (bytes EF BB BF) decodes to the single code unit U+FEFF, so the
 * check inspects the first code unit of the string directly instead of
 * re-encoding the entire chunk into a Buffer. Only the BOM itself is dropped;
 * any whitespace that follows it is part of the data and is preserved.
 *
 * @param chunk - Decoded text; intended for the first chunk of an input.
 * @returns `chunk` without its leading BOM, or `chunk` unchanged when it does
 *   not start with one (including the empty string).
 */
export function removeBOM(chunk: string): string {
  // strip utf-8 BOM: see https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8
  // The typeof guard lets a value that is not actually a string at runtime
  // (for example a Buffer from a stream with no encoding set) pass through
  // unchanged instead of throwing on a missing string method.
  if (typeof chunk === "string" && chunk.charCodeAt(0) === 0xfeff) {
    return chunk.slice(1)
  }
  return chunk
}
11 changes: 10 additions & 1 deletion src/versions/common/json.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { ErrorObject } from "ajv"
import { ValidationError } from "../../types.js"
import { JSONParser } from "@streamparser/json"
import { removeBOM } from "../../utils.js"

export async function parseJson(
jsonInput: File | NodeJS.ReadableStream,
Expand All @@ -25,12 +26,20 @@ export async function parseJson(

readChunk()
} else {
let firstChunk = true
const jsonStream = jsonInput as NodeJS.ReadableStream
jsonStream.on("end", () => parser.end())
jsonStream.on("error", (e) => {
throw e
})
jsonStream.on("data", (data) => parser.write(data))
jsonStream.on("data", (data: string) => {
// strip utf-8 BOM: see https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8
if (firstChunk) {
data = removeBOM(data)
firstChunk = false
}
parser.write(data)
})
}
}

Expand Down
10 changes: 10 additions & 0 deletions test/2.0/csv.e2e.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,16 @@ test("validateCsvTall", async (t) => {
t.deepEqual(result.errors.length, 0)
})

test("validateCsvTall quoted column name", async (t) => {
  // Per the original note: the input file contains a BOM and its first column
  // name is quoted; validation is expected to succeed with no errors.
  const fixture = loadFixtureStream("/2.0/sample-tall-valid-quoted.csv")
  const { valid, errors } = await validateCsv(fixture, "v2.0")
  t.is(valid, true)
  t.deepEqual(errors.length, 0)
})

test("validateCsvWide", async (t) => {
const result = await validateCsv(
loadFixtureStream("/2.0/sample-wide-valid.csv"),
Expand Down
9 changes: 9 additions & 0 deletions test/2.0/json.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,15 @@ test("validateJson", async (t) => {
t.deepEqual(result.errors.length, 0)
})

test("validateJson BOM", async (t) => {
  // The fixture name indicates a JSON document preceded by a BOM; validation
  // is expected to succeed with no errors.
  const fixture = loadFixtureStream("/2.0/sample-valid-bom.json")
  const { valid, errors } = await validateJson(fixture, "v2.0")
  t.is(valid, true)
  t.deepEqual(errors.length, 0)
})
test("validateJson empty", async (t) => {
const result = await validateJson(
loadFixtureStream("/2.0/sample-empty.json"),
Expand Down
16 changes: 16 additions & 0 deletions test/fixtures/2.0/sample-tall-valid-quoted.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"hospital_name",last_updated_on,version,hospital_location,hospital_address,license_number|CA,"To the best of its knowledge and belief, the hospital has included all applicable standard charge information in accordance with the requirements of 45 CFR 180.50, and the information encoded is true, accurate, and complete as of the date indicated.",,,,,,,,,,,,,,
West Mercy Hospital,2024-07-01,2.0,West Mercy Hospital|West Mercy Surgical Center,"12 Main Street, Fullerton, CA 92832|23 Ocean Ave, San Jose, CA 94088",50056,true,,,,,,,,,,,,,,
Description,code|1,code|1|type,code|2,code|2|type,modifiers,setting,drug_unit_of_measurement,drug_type_of_measurement,standard_charge|gross,standard_charge|discounted_cash,payer_name,plan_name,standard_charge|negotiated_dollar,standard_charge|negotiated_percentage,standard_charge|negotiated_algorithm,estimated_amount,standard_charge|methodology,standard_charge|min,standard_charge|max,additional_generic_notes
Major hip and knee joint replacement or reattachment of lower extremity without mcc,470,MS-DRG,175869,LOCAL,,inpatient,,,,,Platform Health Insurance,PPO,20000,,MS-DRG,22243.34,case rate,20000,20000,
Major hip and knee joint replacement or reattachment of lower extremity without mcc,470,MS-DRG,175869,LOCAL,,inpatient,,,,,Platform Health Insurance,PPO,20000,,https://www.cms.gov/Outreach-and-Education/Medicare-Learning-Network-MLN/MLNProducts/html/images/OP.jpg,22243.34,case rate,20000,20000,
Major hip and knee joint replacement or reattachment of lower extremity without mcc,470,MS-DRG,175869,LOCAL,,inpatient,,,,,Platform Health Insurance,PPO,20000,,The adjusted base payment rate indicated in the standard_charge|negotiated_dollar data element may be further adjusted for additional factors including transfers and outliers.,22243.34,case rate,20000,20000,
Major hip and knee joint replacement or reattachment of lower extremity without mcc,470,MS-DRG,175869,LOCAL,,inpatient,,,,,Region Health Insurance,HMO,,50,,23145.98,percent of total billed charges,20000,20000,
"Evaluation of hearing function to determine candidacy for, or postoperative status of, surgically implanted hearing device; first hour",92626,CPT,,,,outpatient,,,150,125,Platform Health Insurance,PPO,98.98,,,,fee schedule,98.98,98.98,110% of the Medicare fee schedule
"Evaluation of hearing function to determine candidacy for, or postoperative status of, surgically implanted hearing device; first hour",92626,CPT,,,,outpatient,,,150,125,Region Health Insurance,HMO,,115,,105.34,fee schedule,98.98,98.98,115% of the state's workers' compensation amount
"Behavioral health; residential (hospital residential treatment program), without room and board, per diem",H0017,HCPCS,,,,inpatient,,,2500,2250,Platform Health Insurance,PPO,1500,,,,per diem,1500,1500,
"Behavioral health; residential (hospital residential treatment program), without room and board, per diem, days 1-3",H0017,hcpcs,,,,inpatient,,,2500,2250,Region Health Insurance,HMO,2000,,,,per diem,2000,2000,
"Behavioral health; residential (hospital residential treatment program), without room and board, per diem, days 4-5",H0017,HCPCS,,,,inpatient,,,2500,2250,Region Health Insurance,HMO,1800,,,,per diem,1800,1800,
"Behavioral health; residential (hospital residential treatment program), without room and board, per diem, days 6+",H0017,HCPCS,,,,inpatient,,,2500,2250,Region Health Insurance,HMO,1200,,,,per diem,1200,1200,
Treatment or observation room — observation room,762,RC,,,,outpatient,,,13000,12000,Platform Health Insurance,PPO,8000,,,,case rate,8000,10000,Negotiated standard charge without surgery and without rule out myocardial infarction
Treatment or observation room — observation room,762,RC,,,,outpatient,,,13000,12000,Platform Health Insurance,PPO,10000,,,,case rate,8000,10000,Negotiated standard charge without surgery and with rule out myocardial infarction
Treatment or observation room — observation room,762,RC,,,,outpatient,,,13000,12000,Region Health Insurance,HMO,9000,,,,case rate,8000,10000,
195 changes: 195 additions & 0 deletions test/fixtures/2.0/sample-valid-bom.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
{
"hospital_name": "West Mercy Hospital",
"last_updated_on": "2024-07-01",
"version": "2.0.0",
"hospital_location": ["West Mercy Hospital", "West Mercy Surgical Center"],
"hospital_address": [
"12 Main Street, Fullerton, CA 92832",
"23 Ocean Ave, San Jose, CA 94088"
],
"license_information": {
"license_number": "50056",
"state": "CA"
},
"affirmation": {
"affirmation": "To the best of its knowledge and belief, the hospital has included all applicable standard charge information in accordance with the requirements of 45 CFR 180.50, and the information encoded is true, accurate, and complete as of the date indicated.",
"confirm_affirmation": true
},
"standard_charge_information": [
{
"description": "Major hip and knee joint replacement or reattachment of lower extremity without mcc",
"code_information": [
{
"code": "470",
"type": "MS-DRG"
},
{
"code": "175869",
"type": "LOCAL"
}
],
"standard_charges": [
{
"minimum": 20000,
"maximum": 20000,
"setting": "inpatient",
"payers_information": [
{
"payer_name": "Platform Health Insurance",
"plan_name": "PPO",
"standard_charge_dollar": 20000,
"standard_charge_algorithm": "MS-DRG",
"estimated_amount": 22243.34,
"methodology": "case rate"
},
{
"payer_name": "Platform Health Insurance",
"plan_name": "PPO",
"standard_charge_dollar": 20000,
"standard_charge_algorithm": "https://www.cms.gov/Outreach-and-Education/Medicare-Learning-Network-MLN/MLNProducts/html/images/OP.jpg",
"estimated_amount": 22243.34,
"methodology": "case rate"
},
{
"payer_name": "Platform Health Insurance",
"plan_name": "PPO",
"standard_charge_dollar": 20000,
"standard_charge_algorithm": "The adjusted base payment rate indicated in the standard_charge|negotiated_dollar data element may be further adjusted for additional factors including transfers and outliers.",
"estimated_amount": 22243.34,
"methodology": "case rate"
},
{
"payer_name": "Region Health Insurance",
"plan_name": "HMO",
"standard_charge_percentage": 50,
"estimated_amount": 23145.98,
"methodology": "percent of total billed charges"
}
]
}
]
},
{
"description": "Evaluation of hearing function to determine candidacy for, or postoperative status of, surgically implanted hearing device; first hour",
"code_information": [
{
"code": "92626",
"type": "CPT"
}
],
"standard_charges": [
{
"setting": "outpatient",
"gross_charge": 150,
"discounted_cash": 125,
"minimum": 98.98,
"maximum": 98.98,
"payers_information": [
{
"payer_name": "Platform Health Insurance",
"plan_name": "PPO",
"standard_charge_dollar": 98.98,
"methodology": "fee schedule",
"additional_payer_notes": "110% of the Medicare fee schedule"
},
{
"payer_name": "Region Health Insurance",
"plan_name": "HMO",
"standard_charge_percentage": 115,
"estimated_amount": 105.34,
"methodology": "fee schedule",
"additional_payer_notes": "115% of the state's workers' compensation amount"
}
]
}
]
},
{
"description": "Behavioral health; residential (hospital residential treatment program), without room and board, per diem",
"code_information": [
{
"code": "H0017",
"type": "HCPCS"
}
],
"standard_charges": [
{
"gross_charge": 2500,
"discounted_cash": 2250,
"minimum": 1200,
"maximum": 2000,
"setting": "inpatient",
"payers_information": [
{
"payer_name": "Platform Health Insurance",
"plan_name": "PPO",
"standard_charge_dollar": 1500,
"methodology": "per diem"
},
{
"payer_name": "Region Health Insurance",
"plan_name": "HMO",
"standard_charge_dollar": 2000,
"methodology": "per diem",
"additional_payer_notes": "per diem, days 1-3"
},
{
"payer_name": "Region Health Insurance",
"plan_name": "HMO",
"standard_charge_dollar": 1800,
"methodology": "per diem",
"additional_payer_notes": "per diem, days 4-5"
},
{
"payer_name": "Region Health Insurance",
"plan_name": "HMO",
"standard_charge_dollar": 1200,
"methodology": "per diem",
"additional_payer_notes": "per diem, days 6+"
}
]
}
]
},
{
"description": "Treatment or observation room — observation room",
"code_information": [
{
"code": "762",
"type": "RC"
}
],
"standard_charges": [
{
"gross_charge": 13000,
"discounted_cash": 12000,
"minimum": 8000,
"maximum": 10000,
"setting": "outpatient",
"payers_information": [
{
"payer_name": "Platform Health Insurance",
"plan_name": "PPO",
"standard_charge_dollar": 8000,
"methodology": "case rate",
"additional_payer_notes": "Negotiated standard charge without surgery and without rule out myocardial infarction"
},
{
"payer_name": "Platform Health Insurance",
"plan_name": "PPO",
"standard_charge_dollar": 10000,
"methodology": "case rate",
"additional_payer_notes": "Negotiated standard charge without surgery and with rule out myocardial infarction"
},
{
"payer_name": "Region Health Insurance",
"plan_name": "HMO",
"standard_charge_dollar": 9000,
"methodology": "case rate"
}
]
}
]
}
]
}

0 comments on commit d20f463

Please sign in to comment.