diff --git a/.typos.toml b/.typos.toml index 6cf9781affdc..d6c6fcd09e1d 100644 --- a/.typos.toml +++ b/.typos.toml @@ -12,6 +12,7 @@ "ser" = "ser" "Ser" = "Ser" "flate" = "flate" +"Tke" = "Tke" [files] extend-exclude = [ diff --git a/Cargo.lock b/Cargo.lock index fa69ca151cd2..1aed75fd3f8e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3656,6 +3656,8 @@ dependencies = [ "chrono-tz 0.8.6", "databend-common-base", "databend-common-exception", + "enquote", + "enumflags2", "ethnum", "geo", "geos", diff --git a/src/common/io/Cargo.toml b/src/common/io/Cargo.toml index 9af4a6b20b91..b32564997b6b 100644 --- a/src/common/io/Cargo.toml +++ b/src/common/io/Cargo.toml @@ -18,6 +18,8 @@ chrono = { workspace = true } chrono-tz = { workspace = true } databend-common-base = { workspace = true } databend-common-exception = { workspace = true } +enquote = "1.1.0" +enumflags2 = { workspace = true } ethnum = { workspace = true } geo = { workspace = true } geos = { workspace = true } diff --git a/src/common/io/src/lib.rs b/src/common/io/src/lib.rs index 7b337d75595f..5836169e42f8 100644 --- a/src/common/io/src/lib.rs +++ b/src/common/io/src/lib.rs @@ -26,6 +26,7 @@ pub mod constants; pub mod format_diagnostic; +pub mod number; pub mod prelude; mod binary_read; diff --git a/src/common/io/src/number.rs b/src/common/io/src/number.rs new file mode 100644 index 000000000000..4806a3a2acc7 --- /dev/null +++ b/src/common/io/src/number.rs @@ -0,0 +1,1035 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_exception::ErrorCode; +use databend_common_exception::Result; +use enumflags2::bitflags; +use enumflags2::BitFlags; + +// Template Patterns for Numeric Formatting +// https://github.com/postgres/postgres/blob/master/src/backend/utils/adt/formatting.c + +#[derive(Debug, Clone)] +struct KeyWord { + name: &'static str, + id: NumPoz, + // is_digit: bool, + // date_mode: FromCharDateMode, +} + +#[derive(Debug, Clone, Copy)] +#[expect(dead_code)] +enum NumPoz { + TkComma, + TkDec, + Tk0, + Tk9, + TkB, + TkC, + TkD, + TkE, + TkFM, + TkG, + TkL, + TkMI, + TkPL, + TkPR, + TkRN, + TkSG, + TkSP, + TkS, + TkTH, + TkV, + Tkb, + Tkc, + Tkd, + Tke, + Tkfm, + Tkg, + Tkl, + Tkmi, + Tkpl, + Tkpr, + Tkrn, + Tksg, + Tksp, + Tks, + Tkth, + Tkv, +} + +const NUM_KEYWORDS: [KeyWord; 36] = [ + KeyWord::new(",", NumPoz::TkComma), + KeyWord::new(".", NumPoz::TkDec), + KeyWord::new("0", NumPoz::Tk0), + KeyWord::new("9", NumPoz::Tk9), + KeyWord::new("B", NumPoz::TkB), + KeyWord::new("C", NumPoz::TkC), + KeyWord::new("D", NumPoz::TkD), + KeyWord::new("EEEE", NumPoz::TkE), + KeyWord::new("FM", NumPoz::TkFM), + KeyWord::new("G", NumPoz::TkG), + KeyWord::new("L", NumPoz::TkL), + KeyWord::new("MI", NumPoz::TkMI), + KeyWord::new("PL", NumPoz::TkPL), + KeyWord::new("PR", NumPoz::TkPR), + KeyWord::new("RN", NumPoz::TkRN), + KeyWord::new("SG", NumPoz::TkSG), + KeyWord::new("SP", NumPoz::TkSP), + KeyWord::new("S", NumPoz::TkS), + KeyWord::new("TH", NumPoz::TkTH), + KeyWord::new("V", NumPoz::TkV), + KeyWord::new("b", NumPoz::Tkb), + KeyWord::new("c", NumPoz::Tkc), + KeyWord::new("d", NumPoz::TkD), + KeyWord::new("eeee", NumPoz::TkE), + KeyWord::new("fm", NumPoz::TkFM), + KeyWord::new("g", NumPoz::TkG), + KeyWord::new("l", NumPoz::TkL), + KeyWord::new("mi", NumPoz::TkMI), + KeyWord::new("pl", NumPoz::TkPL), + KeyWord::new("pr", NumPoz::TkPR), + KeyWord::new("rn", NumPoz::Tkrn), 
+ KeyWord::new("sg", NumPoz::TkSG), + KeyWord::new("sp", NumPoz::TkSP), + KeyWord::new("s", NumPoz::TkS), + KeyWord::new("th", NumPoz::Tkth), + KeyWord::new("v", NumPoz::TkV), +]; + +// ---------- +// Flags for NUMBER version +// ---------- +#[bitflags] +#[repr(u16)] +#[derive(Clone, Copy, Debug)] +enum NumFlag { + Decimal, + LDecimal, + Zero, + Blank, + FillMode, + LSign, + Bracket, + Minus, + Plus, + Roman, + Multi, + PlusPost, + MinusPost, + Eeee, +} + +#[derive(Debug, Clone, Copy)] +enum NumLSign { + Pre, + Post, +} + +impl KeyWord { + const fn new(name: &'static str, id: NumPoz) -> KeyWord { + KeyWord { name, id } + } +} + +#[derive(Debug)] +#[expect(dead_code)] + +enum FormatNode { + End, + Action(KeyWord), + Char(String), + Separator, + Space, +} + +// ---------- +// Number description struct +// ---------- +#[derive(Default, Debug, Clone)] +struct NumDesc { + pre: usize, // (count) numbers before decimal + post: usize, // (count) numbers after decimal + lsign: Option, // want locales sign + flag: BitFlags, // number parameters + pre_lsign_num: usize, // tmp value for lsign + multi: usize, // multiplier for 'V' + zero_start: usize, // position of first zero + zero_end: usize, // position of last zero + need_locale: bool, // needs it locale +} + +impl NumDesc { + fn prepare(&mut self, n: &FormatNode) -> std::result::Result<(), &'static str> { + if let FormatNode::Action(key) = n { + if self.flag.contains(NumFlag::Eeee) && !matches!(key.id, NumPoz::TkE) { + return Err("\"EEEE\" must be the last pattern used"); + } + + match key.id { + NumPoz::Tk9 => { + if self.flag.contains(NumFlag::Bracket) { + return Err("\"9\" must be ahead of \"PR\""); + } + + if self.flag.contains(NumFlag::Multi) { + self.multi += 1; + return Ok(()); + } + + if self.flag.contains(NumFlag::Decimal) { + self.post += 1; + } else { + self.pre += 1; + } + Ok(()) + } + + NumPoz::Tk0 => { + if self.flag.contains(NumFlag::Bracket) { + return Err("\"0\" must be ahead of \"PR\""); + } + + if 
!self.flag.intersects(NumFlag::Zero | NumFlag::Decimal) { + self.flag.insert(NumFlag::Zero); + self.zero_start = self.pre + 1; + } + + if !self.flag.contains(NumFlag::Decimal) { + self.pre += 1; + } else { + self.post += 1; + } + + self.zero_end = self.pre + self.post; + Ok(()) + } + + NumPoz::TkB => { + if self.pre == 0 && self.post == 0 && !self.flag.contains(NumFlag::Zero) { + self.flag.insert(NumFlag::Blank) + } + Ok(()) + } + + NumPoz::TkD => { + self.flag.insert(NumFlag::LDecimal); + self.need_locale = true; + + if self.flag.contains(NumFlag::Decimal) { + return Err("multiple decimal points"); + } + if self.flag.contains(NumFlag::Multi) { + return Err("cannot use \"V\" and decimal point together"); + } + + self.flag.insert(NumFlag::Decimal); + Ok(()) + } + + NumPoz::TkDec => { + if self.flag.contains(NumFlag::Decimal) { + return Err("multiple decimal points"); + } + if self.flag.contains(NumFlag::Multi) { + return Err("cannot use \"V\" and decimal point together"); + } + + self.flag.insert(NumFlag::Decimal); + Ok(()) + } + + NumPoz::TkFM => { + self.flag.insert(NumFlag::FillMode); + Ok(()) + } + + NumPoz::TkS => { + if self.flag.contains(NumFlag::LSign) { + return Err("cannot use \"S\" twice"); + } + if self + .flag + .intersects(NumFlag::Plus | NumFlag::Minus | NumFlag::Bracket) + { + return Err("cannot use \"S\" and \"PL\"/\"MI\"/\"SG\"/\"PR\" together"); + } + + if self.flag.contains(NumFlag::Decimal) { + self.lsign = Some(NumLSign::Pre); + self.pre_lsign_num = self.pre; + self.need_locale = true; + self.flag.insert(NumFlag::LSign); + return Ok(()); + } + + if self.lsign.is_none() { + self.lsign = Some(NumLSign::Post); + self.need_locale = true; + self.flag.insert(NumFlag::LSign); + } + Ok(()) + } + + NumPoz::TkMI => { + if self.flag.contains(NumFlag::LSign) { + return Err("cannot use \"S\" and \"MI\" together"); + } + + self.flag.insert(NumFlag::Minus); + if self.flag.contains(NumFlag::Decimal) { + self.flag.insert(NumFlag::MinusPost) + } + Ok(()) + } + + 
NumPoz::TkPL => { + if self.flag.contains(NumFlag::LSign) { + return Err("cannot use \"S\" and \"PL\" together"); + } + + self.flag.insert(NumFlag::Plus); + if self.flag.contains(NumFlag::Decimal) { + self.flag.insert(NumFlag::PlusPost) + } + Ok(()) + } + + NumPoz::TkSG => { + if self.flag.contains(NumFlag::LSign) { + return Err("cannot use \"S\" and \"SG\" together"); + } + self.flag.insert(NumFlag::Plus | NumFlag::Minus); + Ok(()) + } + + NumPoz::TkPR => { + if self + .flag + .intersects(NumFlag::LSign | NumFlag::Plus | NumFlag::Minus) + { + return Err("cannot use \"PR\" and \"S\"/\"PL\"/\"MI\"/\"SG\" together"); + } + + self.flag.insert(NumFlag::Bracket); + Ok(()) + } + + NumPoz::Tkrn | NumPoz::TkRN => { + self.flag.insert(NumFlag::Roman); + Ok(()) + } + + NumPoz::TkL | NumPoz::TkG => { + self.need_locale = true; + Ok(()) + } + + NumPoz::TkV => { + if self.flag.contains(NumFlag::Decimal) { + return Err("cannot use \"V\" and decimal point together"); + } + self.flag.insert(NumFlag::Multi); + Ok(()) + } + + NumPoz::TkE => { + if self.flag.contains(NumFlag::Eeee) { + return Err("cannot use \"EEEE\" twice"); + } + + if self.flag.intersects( + NumFlag::Blank + | NumFlag::FillMode + | NumFlag::LSign + | NumFlag::Bracket + | NumFlag::Minus + | NumFlag::Plus + | NumFlag::Roman + | NumFlag::Multi, + ) { + return Err("\"EEEE\" is incompatible with other formats"); + } + + self.flag.insert(NumFlag::Eeee); + Ok(()) + } + + NumPoz::TkComma => Ok(()), + // Known keywords this port does not implement yet (C, SP, TH, b, c, th): a user error, not a bug. + _ => Err("Currently this format pattern is not supported"), + } + } else { + unreachable!() + } + } + + fn i64_to_num_part(&self, value: i64) -> Result<NumPart> { + if self.flag.contains(NumFlag::Roman) { + return Err(ErrorCode::Unimplemented("to_char RN (Roman numeral)")); + } + + if self.flag.contains(NumFlag::Eeee) { + // we can do it easily because f64 won't lose any precision + let number = format!("{:+.*e}", self.post, value as f64); + + // Swap a leading positive sign for a space. 
+ let number = number.replace("+", " "); + + return Ok(NumPart { + sign: value >= 0, + number, + out_pre_spaces: 0, + }); + } + + if self.flag.contains(NumFlag::Multi) { + return Err(ErrorCode::Unimplemented("to_char V (multiplies)")); + } + + let mut orgnum = if value == i64::MIN { + format!("{}", -(i64::MIN as i128)) + } else { + format!("{}", value.abs()) + }; + + let numstr_pre_len = orgnum.len(); + + // post-decimal digits? Pad out with zeros. + if self.post > 0 { + orgnum.push('.'); + orgnum.push_str(&"0".repeat(self.post)) + } + + let (number, out_pre_spaces) = match numstr_pre_len.cmp(&self.pre) { + // needs padding? + std::cmp::Ordering::Less => (orgnum, self.pre - numstr_pre_len), + // overflowed prefix digit format? + std::cmp::Ordering::Greater => { + (["#".repeat(self.pre), "#".repeat(self.post)].join("."), 0) + } + std::cmp::Ordering::Equal => (orgnum, 0), + }; + + Ok(NumPart { + sign: value >= 0, + number, + out_pre_spaces, + }) + } + + fn f64_to_num_part(&mut self, value: f64) -> Result<NumPart> { + if self.flag.contains(NumFlag::Roman) { + return Err(ErrorCode::Unimplemented("to_char RN (Roman numeral)")); + } + + if self.flag.contains(NumFlag::Eeee) { + let number = if value.is_finite() { + let orgnum = format!("{:+.*e}", self.post, value); + // Swap a leading positive sign for a space. + orgnum.replace("+", " ") + } else { + // NaN or infinity: allow 6 characters for the leading sign, the decimal point, + // "e", the exponent's sign and two exponent digits. 
+ let mut orgnum = String::with_capacity(self.pre + self.post + 6); + orgnum.push(' '); + orgnum.push_str(&"#".repeat(self.pre)); + orgnum.push('.'); + orgnum.push_str(&"#".repeat(self.post + 4)); + orgnum + }; + return Ok(NumPart { + sign: !value.is_sign_negative(), + number, + out_pre_spaces: 0, + }); + } + + if self.flag.contains(NumFlag::Multi) { + return Err(ErrorCode::Unimplemented("to_char V (multiplies)")); + } + + let orgnum = format!("{:.0}", value.abs()); + let numstr_pre_len = orgnum.len(); + + const DBL_DIG: usize = f64::DIGITS as usize; + // adjust post digits to fit the significant decimal digits of an f64 (15, not the 6 of an f32) + if numstr_pre_len >= DBL_DIG { + self.post = 0; + } else if numstr_pre_len + self.post > DBL_DIG { + self.post = DBL_DIG - numstr_pre_len; + } + let orgnum = format!("{:.*}", self.post, value.abs()); + + let numstr_pre_len = match orgnum.find('.') { + Some(p) => p, + None => orgnum.len(), + }; + + let (number, out_pre_spaces) = match numstr_pre_len.cmp(&self.pre) { + // needs padding? + std::cmp::Ordering::Less => (orgnum, self.pre - numstr_pre_len), + // overflowed prefix digit format? 
+ std::cmp::Ordering::Greater => { + (["#".repeat(self.pre), "#".repeat(self.post)].join("."), 0) + } + std::cmp::Ordering::Equal => (orgnum, 0), + }; + + Ok(NumPart { + sign: !value.is_sign_negative(), + number, + out_pre_spaces, + }) + } +} + +struct NumPart { + sign: bool, + number: String, + out_pre_spaces: usize, +} + +fn parse_format( + mut str: &str, + kw: &[KeyWord], + mut num: Option<&mut NumDesc>, +) -> Result> { + let mut nodes = Vec::new(); + while !str.is_empty() { + if let Some(remain) = str.strip_prefix(' ') { + str = remain; + nodes.push(FormatNode::Space); + continue; + } + + if str.starts_with('"') { + let (offset, literal) = + parse_literal_string(str).map_err(|e| ErrorCode::SyntaxException(e.to_string()))?; + nodes.push(FormatNode::Char(literal)); + str = &str[offset..]; + continue; + } + + if let Some(k) = kw.iter().find(|k| str.starts_with(k.name)) { + let n = FormatNode::Action(k.clone()); + + if let Some(num) = num.as_mut() { + num.prepare(&n).map_err(ErrorCode::SyntaxException)?; + } + str = &str[k.name.len()..]; + + nodes.push(n); + continue; + } + + Err(ErrorCode::SyntaxException( + "Currently only key words are supported".to_string(), + ))?; + } + Ok(nodes) +} + +fn parse_literal_string(data: &str) -> std::result::Result<(usize, String), enquote::Error> { + let mut escape = false; + for (i, ch) in data.char_indices() { + if i == 0 { + continue; + } + match ch { + '"' if !escape => { + let end = i + 1; + return enquote::unquote(&data[..end]).map(|s| (end, s)); + } + '\\' if !escape => escape = true, + _ if escape => escape = false, + _ => {} + } + } + Err(enquote::Error::UnexpectedEOF) +} + +struct NumProc { + desc: NumDesc, // number description + + sign: bool, // '-' or '+' + sign_wrote: bool, // was sign write + num_count: usize, // number of write digits + num_in: bool, // is inside number + num_curr: usize, // current position in number + out_pre_spaces: usize, // spaces before first digit + + _read_dec: bool, // to_number - was read 
dec. point + _read_post: usize, // to_number - number of dec. digit + _read_pre: usize, // to_number - number non-dec. digit + + number: Vec, + number_p: usize, + + inout: String, + + last_relevant: Option<(char, usize)>, // last relevant number after decimal point + + decimal: String, + loc_negative_sign: String, + loc_positive_sign: String, + _loc_thousands_sep: String, + _loc_currency_symbol: String, +} + +impl NumProc { + // ---------- + // Add digit or sign to number-string + // ---------- + fn numpart_to_char(&mut self, id: NumPoz) { + // Write sign if real number will write to output Note: IS_PREDEC_SPACE() + // handle "9.9" --> " .1" + if !self.sign_wrote + && (self.num_curr >= self.out_pre_spaces + || self.desc.flag.contains(NumFlag::Zero) && self.desc.zero_start == self.num_curr) + && (!self.is_predec_space() || self.last_relevant.is_some()) + { + if self.desc.flag.contains(NumFlag::LSign) { + if matches!(self.desc.lsign, Some(NumLSign::Pre)) { + if self.sign { + self.inout.push_str(&self.loc_positive_sign) + } else { + self.inout.push_str(&self.loc_negative_sign) + } + self.sign_wrote = true; + } + } else if self.desc.flag.contains(NumFlag::Bracket) { + if self.sign { + self.inout.push(' ') + } else { + self.inout.push('<') + } + self.sign_wrote = true; + } else if self.sign { + if !self.desc.flag.contains(NumFlag::FillMode) { + self.inout.push(' '); /* Write + */ + } + self.sign_wrote = true; + } else { + self.inout.push('-'); /* Write - */ + self.sign_wrote = true; + } + } + + // digits / FM / Zero / Dec. 
point + if matches!(id, NumPoz::Tk9 | NumPoz::Tk0 | NumPoz::TkDec | NumPoz::TkD) { + if self.num_curr < self.out_pre_spaces + && (self.desc.zero_start > self.num_curr || !self.desc.flag.contains(NumFlag::Zero)) + { + // Write blank space + if !self.desc.flag.contains(NumFlag::FillMode) { + self.inout.push(' ') /* Write ' ' */ + } + } else if self.desc.flag.contains(NumFlag::Zero) + && self.num_curr < self.out_pre_spaces + && self.desc.zero_start <= self.num_curr + { + // Write ZERO + self.inout.push('0'); /* Write '0' */ + self.num_in = true + } else { + // Write Decimal point + if self.number.get(self.number_p).is_some_and(|c| *c == '.') { + if !self.last_relevant_is_dot() + || self.desc.flag.contains(NumFlag::FillMode) && self.last_relevant_is_dot() + // Ora 'n' -- FM9.9 --> 'n.'s + { + self.inout.push_str(&self.decimal) /* Write DEC/D */ + } + } else if self.last_relevant.is_some_and(|(_, i)| self.number_p > i) + && !matches!(id, NumPoz::Tk0) + { + } + // '0.1' -- 9.9 --> ' .1' + else if self.is_predec_space() { + if !self.desc.flag.contains(NumFlag::FillMode) { + self.inout.push(' '); + } + // '0' -- FM9.9 --> '0.' 
+ else if self.last_relevant_is_dot() { + self.inout.push('0') + } + } else if self.number_p < self.number.len() { + self.inout.push(self.number[self.number_p]); /* Write DIGIT */ + self.num_in = true + } + if self.number_p < self.number.len() { + self.number_p += 1; + } + } + + let end = self.num_count + + if self.out_pre_spaces > 0 { 1 } else { 0 } + + if self.desc.flag.contains(NumFlag::Decimal) { + 1 + } else { + 0 + }; + + let end = if self.last_relevant.is_some_and(|(_, i)| i == self.number_p) { + self.num_curr + } else { + end + }; + + if self.num_curr + 1 == end { + if self.sign_wrote && self.desc.flag.contains(NumFlag::Bracket) { + self.inout.push(if self.sign { ' ' } else { '>' }) + } else if self.desc.flag.contains(NumFlag::LSign) + && matches!(self.desc.lsign, Some(NumLSign::Post)) + { + self.inout.push_str(if self.sign { + &self.loc_positive_sign + } else { + &self.loc_negative_sign + }) + } + } + } + + self.num_curr += 1; + } + + fn is_predec_space(&self) -> bool { + !self.desc.flag.contains(NumFlag::Zero) + && self.number_p == 0 + && self.number[0] == '0' + && self.desc.post != 0 + } + + fn calc_last_relevant_decnum(&mut self) { + let mut n = None; + for (i, c) in self.number.iter().enumerate() { + match n.as_ref() { + Some(_) if *c != '0' => n = Some(i), + None if *c == '.' 
=> n = Some(i), + _ => {} + } + } + self.last_relevant = n.map(|n| (*self.number.get(n).unwrap(), n)); + } + + fn last_relevant_is_dot(&self) -> bool { + self.last_relevant.is_some_and(|(c, _)| c == '.') + } +} + +fn num_processor(nodes: &[FormatNode], desc: NumDesc, num_part: NumPart) -> Result { + let NumPart { + sign, + number, + out_pre_spaces, + } = num_part; + let mut np = NumProc { + desc, + sign, + sign_wrote: false, + num_count: 0, + num_in: false, + num_curr: 0, + out_pre_spaces, + _read_dec: false, + _read_post: 0, + _read_pre: 0, + number: number.chars().collect(), + number_p: 0, + inout: String::new(), + last_relevant: None, + decimal: ".".to_string(), + loc_negative_sign: String::new(), + loc_positive_sign: String::new(), + _loc_thousands_sep: String::new(), + _loc_currency_symbol: String::new(), + }; + + if np.desc.zero_start > 0 { + np.desc.zero_start -= 1; + } + + if np.desc.flag.contains(NumFlag::Eeee) { + return Ok(String::from_iter(np.number.iter())); + } + + // Roman correction + if np.desc.flag.contains(NumFlag::Roman) { + unimplemented!() + } + + // Sign + + // MI/PL/SG - write sign itself and not in number + if np.desc.flag.intersects(NumFlag::Plus | NumFlag::Minus) { + // if np.desc.flag.contains(NumFlag::Plus) && !np.desc.flag.contains(NumFlag::Minus) { + // np.sign_wrote = false; /* need sign */ + // } else { + // TODO: Why is this not the same as the postgres implementation? 
+ np.sign_wrote = true; /* needn't sign */ + // } + } else { + if np.sign && np.desc.flag.contains(NumFlag::FillMode) { + np.desc.flag.remove(NumFlag::Bracket) + } + + if np.sign + && np.desc.flag.contains(NumFlag::FillMode) + && !np.desc.flag.contains(NumFlag::LSign) + { + np.sign_wrote = true /* needn't sign */ + } else { + np.sign_wrote = false /* need sign */ + } + if matches!(np.desc.lsign, Some(NumLSign::Pre)) && np.desc.pre == np.desc.pre_lsign_num { + np.desc.lsign = Some(NumLSign::Post) + } + } + + // Count: start from the digit total; the unsigned value must never go below zero + np.num_count = np.desc.post + np.desc.pre; + + if np.desc.flag.contains(NumFlag::FillMode) && np.desc.flag.contains(NumFlag::Decimal) { + np.calc_last_relevant_decnum(); + + // If any '0' specifiers are present, make sure we don't strip + // those digits. But don't advance last_relevant beyond the last + // character of the np.number string, which is a hazard if the + // number got shortened due to precision limitations. + if let Some(last_relevant) = np.last_relevant { + if np.desc.zero_end > np.out_pre_spaces { + // note that np.number cannot be zero-length here + let last_zero_pos = np.number.len() - 1; + let last_zero_pos = last_zero_pos.min(np.desc.zero_end - np.out_pre_spaces); + + if last_relevant.1 < last_zero_pos { + let ch = np.number[last_zero_pos]; + np.last_relevant = Some((ch, last_zero_pos)) + } + } + } + } + + if np.sign_wrote || np.out_pre_spaces != 0 { + np.num_count = np.num_count.saturating_sub(1); + } + + // Locale + if np.desc.need_locale { + // NUM_prepare_locale(Np); + return Err(ErrorCode::Unimplemented("to_char uses locale S/L/D/G")); + } + + // Processor direct cycle + for n in nodes.iter() { + match n { + // Format pictures actions + FormatNode::Action(key) => match key.id { + id @ (NumPoz::Tk9 | NumPoz::Tk0 | NumPoz::TkDec | NumPoz::TkD) => { + np.numpart_to_char(id) + } + + NumPoz::TkComma => { + if np.num_in { + np.inout.push(','); + continue; + } + if !np.desc.flag.contains(NumFlag::FillMode) { + np.inout.push(' ') + } + } + + NumPoz::TkMI => 
{ + if np.sign { + if !np.desc.flag.contains(NumFlag::FillMode) { + np.inout.push(' '); + } + } else { + np.inout.push('-'); + } + } + + NumPoz::TkPL => { + if np.sign { + np.inout.push('+'); + } else if !np.desc.flag.contains(NumFlag::FillMode) { + np.inout.push(' '); + } + } + + NumPoz::TkSG => np.inout.push(if np.sign { '+' } else { '-' }), + // PR, FM and B only set flags while the format is parsed; they emit nothing here. + NumPoz::TkPR => (), + NumPoz::TkFM | NumPoz::TkB => (), + _ => return Err(ErrorCode::Unimplemented("to_char format pattern")), + }, + FormatNode::End => break, + FormatNode::Char(character) => { + // In TO_CHAR, non-pattern characters in the format are copied to + // the output. + np.inout.push_str(character) + } + FormatNode::Space => np.inout.push(' '), + _ => return Err(ErrorCode::Unimplemented("to_char format node")), + } + } + + Ok(np.inout) +} + +pub fn i64_to_char(value: i64, fmt: &str) -> Result<String> { + // TODO: We should cache FormatNode + let mut desc = NumDesc::default(); + let nodes = parse_format(fmt, &NUM_KEYWORDS, Some(&mut desc))?; + + let num_part = desc.i64_to_num_part(value)?; + + num_processor(&nodes, desc, num_part) +} + +pub fn f64_to_char(value: f64, fmt: &str) -> Result<String> { + // TODO: We should cache FormatNode + let mut desc = NumDesc::default(); + let nodes = parse_format(fmt, &NUM_KEYWORDS, Some(&mut desc))?; + + let num_part = desc.f64_to_num_part(value)?; + + num_processor(&nodes, desc, num_part) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_i64() -> Result<()> { + assert_eq!(" 123", i64_to_char(123, "999")?); + assert_eq!("-123", i64_to_char(-123, "999")?); + + assert_eq!(" 0123", i64_to_char(123, "0999")?); + assert_eq!("-0123", i64_to_char(-123, "0999")?); + + assert_eq!(" 123", i64_to_char(123, "99999")?); + assert_eq!(" -123", i64_to_char(-123, "99999")?); + + assert_eq!(" 0123", i64_to_char(123, "9990999")?); + assert_eq!(" -0123", i64_to_char(-123, "9990999")?); + + assert_eq!(" 0123 ", i64_to_char(123, "9990999PR")?); + assert_eq!(" <0123>", i64_to_char(-123, "9990999PR")?); + + assert_eq!(" 12345", i64_to_char(12345, "9990999")?); + assert_eq!(" -12345", 
i64_to_char(-12345, "9990999")?); + + assert_eq!(" 0012.0", i64_to_char(12, "9990999.9")?); + assert_eq!(" -0012.0", i64_to_char(-12, "9990999.9")?); + assert_eq!("0012.", i64_to_char(12, "FM9990999.9")?); + assert_eq!("-0012.", i64_to_char(-12, "FM9990999.9")?); + + assert_eq!(" ##", i64_to_char(123, "99")?); + assert_eq!("-##", i64_to_char(-123, "99")?); + + assert_eq!(" ##.", i64_to_char(123, "99.")?); + assert_eq!("-##.", i64_to_char(-123, "99.")?); + + assert_eq!(" ##.#", i64_to_char(123, "99.0")?); + assert_eq!("-##.#", i64_to_char(-123, "99.0")?); + + assert_eq!( + " 9223372036854775807", + i64_to_char(i64::MAX, "99999999999999999999")? + ); + assert_eq!( + " -9223372036854775808", + i64_to_char(i64::MIN, "99999999999999999999")? + ); + assert_eq!( + " -9223372036854775807", + i64_to_char(i64::MIN + 1, "99999999999999999999")? + ); + + // Regarding the way the exponent part of the scientific notation is formatted, + // there is a slight difference between the rust implementation and the c implementation. 
+ // 1.23456000e+05 + assert_eq!(" 1.23456000e5", i64_to_char(123456, "9.99999999EEEE")?); + assert_eq!("-1.23456e5", i64_to_char(-123456, "9.99999EEEE")?); + + assert_eq!(" 4 8 5", i64_to_char(485, "9 9 9")?); + assert_eq!(" 1,485", i64_to_char(1485, "9,999")?); + + assert_eq!("Good number: 485", i64_to_char(485, "\"Good number:\"999")?); + + assert_eq!("+485", i64_to_char(485, "SG999")?); + assert_eq!("-485", i64_to_char(-485, "SG999")?); + assert_eq!("4-85", i64_to_char(-485, "9SG99")?); + + assert_eq!("+485", i64_to_char(485, "PL999")?); + assert_eq!(" 485", i64_to_char(-485, "PL999")?); + + assert_eq!("48+5", i64_to_char(485, "99PL9")?); + assert_eq!("48 5", i64_to_char(-485, "99PL9")?); + + assert_eq!("485-", i64_to_char(-485, "999MI")?); + assert_eq!("485 ", i64_to_char(485, "999MI")?); + assert_eq!("485", i64_to_char(485, "FM999MI")?); + + // assert_eq!(" 1 485", i64_to_char(1485, "9G999")?); + + Ok(()) + } + + #[test] + fn test_f64() -> Result<()> { + assert_eq!(" 12.34", f64_to_char(12.34, "99.99")?); + assert_eq!("-12.34", f64_to_char(-12.34, "99.99")?); + assert_eq!(" .10", f64_to_char(0.1, "99.99")?); + assert_eq!(" -.10", f64_to_char(-0.1, "99.99")?); + + assert_eq!(" 4.86e-4", f64_to_char(0.0004859, "9.99EEEE")?); + assert_eq!("-4.86e-4", f64_to_char(-0.0004859, "9.99EEEE")?); + + assert_eq!(" 0.1", f64_to_char(0.1, "0.9")?); + assert_eq!("-.1", f64_to_char(-0.1, "FM9.99")?); + assert_eq!("-0.1", f64_to_char(-0.1, "FM90.99")?); + + assert_eq!(" 148.500", f64_to_char(148.5, "999.999")?); + assert_eq!("148.5", f64_to_char(148.5, "FM999.999")?); + assert_eq!("148.500", f64_to_char(148.5, "FM999.990")?); + + assert_eq!( + "Pre: 485 Post: .800", + f64_to_char(485.8, "\"Pre:\"999\" Post:\" .999")? 
+ ); + + // assert_eq!(" 148,500", f64_to_char(148.5, "999D999")?); + // assert_eq!(" 3 148,500", f64_to_char(3148.5, "9G999D999")?); + // assert_eq!("485-", f64_to_char(-485, "999S")?); + + // assert_eq!("DM 485", f64_to_char(485, "L999")?); + + // assert_eq!(" CDLXXXV", f64_to_char(485, "RN")?); + // assert_eq!("CDLXXXV", f64_to_char(485, "FMRN")?); + // assert_eq!("V", f64_to_char(5.2, "FMRN")?); + + // assert_eq!(" 482nd", f64_to_char(482, "999th")?); + + // assert_eq!(" 12000", f64_to_char(12, "99V999")?); + // assert_eq!(" 12400", f64_to_char(12.4, "99V999")?); + // assert_eq!(" 125", f64_to_char(12.45, "99V9")?); + + Ok(()) + } +} diff --git a/src/query/functions/src/scalars/other.rs b/src/query/functions/src/scalars/other.rs index 9adaf5529775..9d1c98b934e7 100644 --- a/src/query/functions/src/scalars/other.rs +++ b/src/query/functions/src/scalars/other.rs @@ -45,6 +45,7 @@ use databend_common_expression::types::StringType; use databend_common_expression::types::TimestampType; use databend_common_expression::types::ValueType; use databend_common_expression::vectorize_with_builder_1_arg; +use databend_common_expression::vectorize_with_builder_2_arg; use databend_common_expression::Column; use databend_common_expression::Domain; use databend_common_expression::EvalContext; @@ -58,6 +59,8 @@ use databend_common_expression::Scalar; use databend_common_expression::ScalarRef; use databend_common_expression::Value; use databend_common_expression::ValueRef; +use databend_common_io::number::f64_to_char; +use databend_common_io::number::i64_to_char; use rand::Rng; use rand::SeedableRng; @@ -73,6 +76,7 @@ pub fn register(registry: &mut FunctionRegistry) { register_inet_ntoa(registry); register_run_diff(registry); register_grouping(registry); + register_num_to_char(registry); registry.properties.insert( "rand".to_string(), @@ -384,6 +388,60 @@ fn register_grouping(registry: &mut FunctionRegistry) { }) } +fn register_num_to_char(registry: &mut FunctionRegistry) { + 
registry.register_passthrough_nullable_2_arg::( + "to_char", + |_, _, _| FunctionDomain::MayThrow, + vectorize_with_builder_2_arg::( + |value, fmt, builder, ctx| { + if let Some(validity) = &ctx.validity { + if !validity.get_bit(builder.len()) { + builder.commit_row(); + return; + } + } + + match i64_to_char(value, fmt) { + Ok(s) => { + builder.put_str(&s); + builder.commit_row() + } + Err(e) => { + ctx.set_error(builder.len(), e.to_string()); + builder.commit_row() + } + } + }, + ), + ); + + registry.register_passthrough_nullable_2_arg::( + "to_char", + |_, _, _| FunctionDomain::MayThrow, + vectorize_with_builder_2_arg::( + |value, fmt, builder, ctx| { + if let Some(validity) = &ctx.validity { + if !validity.get_bit(builder.len()) { + builder.commit_row(); + return; + } + } + + match f64_to_char(*value, fmt) { + Ok(s) => { + builder.put_str(&s); + builder.commit_row() + } + Err(e) => { + ctx.set_error(builder.len(), e.to_string()); + builder.commit_row() + } + } + }, + ), + ) +} + /// Compute `grouping` by `grouping_id` and `cols`. /// /// `cols` are indices of the column represented in `_grouping_id`. 
diff --git a/src/query/functions/tests/it/scalars/testdata/function_list.txt b/src/query/functions/tests/it/scalars/testdata/function_list.txt index 862816ad1205..5f4c60400817 100644 --- a/src/query/functions/tests/it/scalars/testdata/function_list.txt +++ b/src/query/functions/tests/it/scalars/testdata/function_list.txt @@ -3726,6 +3726,10 @@ Functions overloads: 21 to_boolean(Float32 NULL) :: Boolean NULL 22 to_boolean(Float64) :: Boolean 23 to_boolean(Float64 NULL) :: Boolean NULL +0 to_char(Int64, String) :: String +1 to_char(Int64 NULL, String NULL) :: String NULL +2 to_char(Float64, String) :: String +3 to_char(Float64 NULL, String NULL) :: String NULL 0 to_date(Variant) :: Date 1 to_date(Variant NULL) :: Date NULL 2 to_date(String, String) :: Date NULL diff --git a/tests/sqllogictests/suites/query/functions/02_0078_function_to_char.test b/tests/sqllogictests/suites/query/functions/02_0078_function_to_char.test new file mode 100644 index 000000000000..b73f95316b44 --- /dev/null +++ b/tests/sqllogictests/suites/query/functions/02_0078_function_to_char.test @@ -0,0 +1,177 @@ +# https://github.com/postgres/postgres/blob/master/src/test/regress/expected/int8.out + +statement ok +CREATE OR REPLACE TABLE INT64_TBL(q1 int64, q2 int64); + +statement ok +INSERT INTO INT64_TBL VALUES + ('123','456'), + ('123','4567890123456789'), + ('4567890123456789','123'), + (+4567890123456789,'4567890123456789'), + ('+4567890123456789','-4567890123456789'); + +# query T +# SELECT to_char(q1, '9G999G999G999G999G999'), to_char(q2, '9,999,999,999,999,999') FROM INT64_TBL; +#------------------------+------------------------ +# 123 | 456 +# 123 | 4,567,890,123,456,789 +# 4,567,890,123,456,789 | 123 +# 4,567,890,123,456,789 | 4,567,890,123,456,789 +# 4,567,890,123,456,789 | -4,567,890,123,456,789 + +# SELECT to_char(q1, '9G999G999G999G999G999D999G999'), to_char(q2, '9,999,999,999,999,999.999,999') FROM INT64_TBL; +query T +SELECT to_char(q1, '9,999,999,999,999,999.999,999'), to_char(q2, 
'9,999,999,999,999,999.999,999') FROM INT64_TBL; +---- + 123.000,000 456.000,000 + 123.000,000 4,567,890,123,456,789.000,000 + 4,567,890,123,456,789.000,000 123.000,000 + 4,567,890,123,456,789.000,000 4,567,890,123,456,789.000,000 + 4,567,890,123,456,789.000,000 -4,567,890,123,456,789.000,000 + +query T +SELECT to_char( (q1 * -1), '9999999999999999PR'), to_char( (q2 * -1), '9999999999999999.999PR') FROM INT64_TBL; +---- + <123> <456.000> + <123> <4567890123456789.000> + <4567890123456789> <123.000> + <4567890123456789> <4567890123456789.000> + <4567890123456789> 4567890123456789.000 + +# query T +# SELECT to_char( (q1 * -1), '9999999999999999S'), to_char( (q2 * -1), 'S9999999999999999') FROM INT64_TBL; +#-------------------+------------------- +# 123- | -456 +# 123- | -4567890123456789 +# 4567890123456789- | -123 +# 4567890123456789- | -4567890123456789 +# 4567890123456789- | +4567890123456789 + +query T +SELECT to_char(q2, 'MI9999999999999999') FROM INT64_TBL; +---- + 456 + 4567890123456789 + 123 + 4567890123456789 + -4567890123456789 + +# SELECT to_char(q2, 'FMS9999999999999999') FROM INT64_TBL; +query T +SELECT to_char(q2, 'FMSG9999999999999999') FROM INT64_TBL; +---- + +456 + +4567890123456789 + +123 + +4567890123456789 + -4567890123456789 + +# query T +# SELECT to_char(q2, 'FM9999999999999999THPR') FROM INT64_TBL; +#-------------------- +# 456TH +# 4567890123456789TH +# 123RD +# 4567890123456789TH +# <4567890123456789> + +# query T +# SELECT to_char(q2, 'SG9999999999999999th') FROM INT64_TBL; +#--------------------- +# + 456th +# +4567890123456789th +# + 123rd +# +4567890123456789th +# -4567890123456789 + +query T +SELECT to_char(q2, '0999999999999999') FROM INT64_TBL; +---- + 0000000000000456 + 4567890123456789 + 0000000000000123 + 4567890123456789 + -4567890123456789 + +# query T +# SELECT to_char(q2, 'S0999999999999999') FROM INT64_TBL; +#------------------- +# +0000000000000456 +# +4567890123456789 +# +0000000000000123 +# +4567890123456789 +# 
-4567890123456789 + +query T +SELECT to_char(q2, 'FM0999999999999999') FROM INT64_TBL; +---- + 0000000000000456 + 4567890123456789 + 0000000000000123 + 4567890123456789 + -4567890123456789 + +query T +SELECT to_char(q2, 'FM9999999999999999.000') FROM INT64_TBL; +---- + 456.000 + 4567890123456789.000 + 123.000 + 4567890123456789.000 + -4567890123456789.000 + +# query T +# SELECT to_char(q2, 'L9999999999999999.000') FROM INT64_TBL; +#------------------------ +# 456.000 +# 4567890123456789.000 +# 123.000 +# 4567890123456789.000 +# -4567890123456789.000 + +query T +SELECT to_char(q2, 'FM9999999999999999.999') FROM INT64_TBL; +---- + 456. + 4567890123456789. + 123. + 4567890123456789. + -4567890123456789. + + +# SELECT to_char(q2, 'S 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 . 9 9 9') FROM INT64_TBL; +query T +SELECT to_char(q2, '9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 . 9 9 9') FROM INT64_TBL; +---- + 4 5 6 . 0 0 0 + 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 . 0 0 0 + 1 2 3 . 0 0 0 + 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 . 0 0 0 + -4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 . 0 0 0 + +query T +SELECT to_char(q2, '99999 "text" 9999 "9999" 999 "\\"text between quote marks\\"" 9999') FROM INT64_TBL; +---- + text 9999 "text between quote marks" 456 + 45678 text 9012 9999 345 "text between quote marks" 6789 + text 9999 "text between quote marks" 123 + 45678 text 9012 9999 345 "text between quote marks" 6789 + -45678 text 9012 9999 345 "text between quote marks" 6789 + +query T +SELECT to_char(q2, '999999SG9999999999') FROM INT64_TBL; +---- + + 456 + 456789+0123456789 + + 123 + 456789+0123456789 + 456789-0123456789 + +query T +select to_char(123,'0099'); +---- + 0123 + +statement ok +DROP TABLE IF EXISTS INT64_TBL;