diff --git a/mathtranslate/config.py b/mathtranslate/config.py index 069cbca..ffb232c 100644 --- a/mathtranslate/config.py +++ b/mathtranslate/config.py @@ -45,3 +45,9 @@ def reread(): tencent_secret_key = read_variable(tencent_secret_key_path, tencent_secret_key_default) math_code = 'XMATHX' + +if os.path.exists(f'{ROOT}/TEST'): + test_environment = True + print('This is a test environment!') +else: + test_environment = False diff --git a/mathtranslate/process_latex.py b/mathtranslate/process_latex.py index ffec42a..1c5e436 100644 --- a/mathtranslate/process_latex.py +++ b/mathtranslate/process_latex.py @@ -1,13 +1,40 @@ import re import regex -from .config import math_code +from .config import math_code, test_environment match_code = r"(" + math_code + r"_\d+(?:_\d+)*)" match_code_replace = math_code + r"_(\d+(?:_\d+)*)*" -pattern_env = r"\\begin\{(.*?)\}(.*?)\\end\{\1\}" # \begin{xxx} \end{xxx}, group 1: name, group 2: content + +pattern_env = r"\\begin\{(.*?)\}(\[[a-zA-Z\s,]*?\])?(.*?)\\end\{\1\}" # \begin{xxx} \end{xxx}, group 1: name, group 2: option, group 3: content pattern_command_full = r"\\([a-zA-Z]+\*?)(\[[a-zA-Z\s,]*?\])?(\{((?:[^{}]++|(?3))++)\})" # \xxx[xxx]{xxx} and \xxx{xxx}, group 1: name, group 2: option, group 4: content pattern_command_simple = r"\\([a-zA-Z]+)" # \xxx, group 1: name pattern_brace = r"\{((?:[^{}]++|(?0))++)\}" # {xxx}, group 1: content +pattern_theorem = r"\\newtheorem\{(.+?)\}" # \newtheorem{xxx}, group 1: name +pattern_accent = r"\\([`'\"^~=.])(?:\{([a-zA-Z])\}|([a-zA-Z]))" # match special characters with accents, group 1: accent, group 2/3: normal character +match_code_accent = rf'{math_code}([A-Z]{{2}})([a-zA-Z])' # group 1: accent name, group 2: normal character +list_special = ['\\', '%', '&', '#', '$', '{', '}', ' '] # all special characters in form of \x + +special_character_forward = { + '\\': 'BS', + '%': 'PC', + '&': 'AD', + '#': 'NB', + '$': 'DL', + '{': 'LB', + '}': 'RB', + '^': 'UT', + ' ': 'SP', + '`': 'BQ', + '~': 'TD', + "'": 'SQ', + '"': 'DQ', + '=': 'EQ', + '.': 'DT', + '*': 'ST', + '@': 'AT', +} +special_character_backward = {special_character_forward[key]: key for key in special_character_forward} +assert len(set(special_character_forward.values())) == len(special_character_forward) def variable_code(count): @@ -44,8 +71,8 @@ def modify_after(text): def replace_latex_objects(text): r""" - Replaces all LaTeX objects in a given text with the format "XMATH_{digit1}_{digit2}_..._{digit_last}", - applies a given function to the resulting text (excluding the "XMATH_{digit1}_{digit2}_..._{digit_last}" parts), + Replaces all LaTeX objects in a given text with the format "{math_code}_{digit1}_{digit2}_..._{digit_last}", + applies a given function to the resulting text (excluding the "{math_code}_{digit1}_{digit2}_..._{digit_last}" parts), and returns both the processed text and a list of replaced LaTeX objects. Supported LaTeX objects: \[ xxx \], \begin{xxx} \end{xxx}, $$ $$, $ $, \( xxx \), \xxx[xxx]{xxx}, \xxx{xxx}, and \xxx. @@ -64,7 +91,7 @@ def replace_latex_objects(text): pattern_brace, # {xxx} ] - # iterate through each LaTeX object and replace with "XMATH_{digit1}_{digit2}_..._{digit_last}" + # iterate through each LaTeX object and replace with "{math_code}_{digit1}_{digit2}_..._{digit_last}" count = 0 replaced_objs = [] for regex_symbol in latex_obj_regex: @@ -89,7 +116,8 @@ def get_obj(digit_str): if index < nobjs: return replaced_objs[index] else: - #assert final + if test_environment: + assert final return '???' text = modify_text(text, modify_after) @@ -143,10 +171,13 @@ def process_specific_env(latex, function, env_names): def process_function(match): # \begin{env_name} content \end{env_name} env_name = match.group(1) - content = match.group(2) + options = match.group(2) + if options is None: + options = '' + content = match.group(3) if env_name in env_names: processed_content = function(content) - return rf'\begin{{{env_name}}}{processed_content}\end{{{env_name}}}' + return rf'\begin{{{env_name}}}{options}{processed_content}\end{{{env_name}}}' else: return match.group(0) return pattern.sub(process_function, latex) @@ -214,3 +245,55 @@ def is_complete(latex_code): return False return True + + +def get_theorems(text): + pattern = re.compile(pattern_theorem, re.DOTALL) + matches = re.finditer(pattern, text) + theorems = [match.group(1) for match in matches] + return theorems + + +def replace_special(text): + for special in list_special: + # add space around + text = text.replace(f'\\{special}', f' {math_code}{special_character_forward[special]} ') + + return text + + +def recover_special(text): + for special in list_special: + text = text.replace(math_code + special_character_forward[special], f'\\{special}') + + return text + + +def replace_accent(text): + def replace_function(match): + special = match.group(1) + char1 = match.group(2) + char2 = match.group(3) + if char1 is None: + assert char2 is not None + char = char2 + else: + assert char2 is None + char = char1 + # do not add space around + return math_code + special_character_forward[special] + f'{char}' + + text = re.compile(pattern_accent).sub(replace_function, text) + + return text + + +def recover_accent(text): + def replace_function(match): + special = special_character_backward[match.group(1)] + char = match.group(2) + return rf'\{special}{{{char}}}' + + text = re.compile(match_code_accent).sub(replace_function, text) + + return text diff --git a/mathtranslate/translate.py b/mathtranslate/translate.py index 08ca964..2267056 100644 --- a/mathtranslate/translate.py +++ b/mathtranslate/translate.py @@ -150,7 +150,12 @@ def translate_latex_commands(self, latex_original, names, complete): def translate_full_latex(self, latex_original, loadmain=False): latex_original = process_latex.remove_tex_comments(latex_original) + + latex_original = process_latex.replace_accent(latex_original) + latex_original = process_latex.replace_special(latex_original) + complete = process_latex.is_complete(latex_original) + theorems = process_latex.get_theorems(latex_original) if complete: print('It is a full latex document') latex_original, tex_begin, tex_end = process_latex.split_latex_document(latex_original, r'\begin{document}', r'\end{document}') @@ -163,13 +168,6 @@ def translate_full_latex(self, latex_original, loadmain=False): tex_begin = default_begin tex_end = default_end - # It is difficult for regex to exclude \{ during match so I replace it to something else and then replace back - latex_original = latex_original.replace(r'\{', f'{math_code}LB') - latex_original = latex_original.replace(r'\}', f'{math_code}RB') - # The following two can probably be put somewhere else - latex_original = latex_original.replace(r'\%', f'{math_code}PC') - latex_original = latex_original.replace(r'\ ', f'{math_code}SP') - if loadmain: latex_translated = open('text_after_main.txt').read() else: @@ -183,31 +181,29 @@ def translate_full_latex(self, latex_original, loadmain=False): latex_translated_paragraphs.append(latex_translated_paragraph) print(num, '/', len(latex_original_paragraphs)) num += 1 + latex_translated = '\n\n'.join(latex_translated_paragraphs) if self.debug: print(latex_translated, file=open('text_after_main.txt', 'w')) # TODO: add more here - environment_list = ['abstract', 'acknowledgments', 'itemize', 'enumerate', 'description', 'list'] - # addition: ['theorem', 'proposition', 'conjecture', 'lemma', 'claim', 'fact', 'corollary', 'remark', 'definition', 'example', 'proof'] + environment_list = ['abstract', 'acknowledgments', 'itemize', 'enumerate', 'description', 'list', 'proof'] print('processing latex environments') - latex_translated = self.translate_latex_env(latex_translated, environment_list, complete) + latex_translated = self.translate_latex_env(latex_translated, environment_list + theorems, complete) command_list = ['section', 'subsection', 'subsubsection', 'caption', 'subcaption', 'footnote', 'paragraph'] print('processing latex commands') latex_translated = self.translate_latex_commands(latex_translated, command_list, complete) - latex_translated = latex_translated.replace(f'{math_code}LB', r'\{') - latex_translated = latex_translated.replace(f'{math_code}RB', r'\}') - latex_translated = latex_translated.replace(f'{math_code}PC', r'\%') - latex_translated = latex_translated.replace(f'{math_code}SP', r'\ ') - latex_translated = tex_begin + '\n' + latex_translated + '\n' + tex_end # Title is probably outside the body part print('processing title') latex_translated = self.translate_latex_commands(latex_translated, ['title'], complete) + latex_translated = process_latex.recover_special(latex_translated) + latex_translated = process_latex.recover_accent(latex_translated) + self.close() return latex_translated diff --git a/test/fmt1_google.tex b/test/fmt1_google.tex index 03d4a3b..e797dce 100644 --- a/test/fmt1_google.tex +++ b/test/fmt1_google.tex @@ -6,7 +6,7 @@ 因为当 \( D=0 \) 时 DS 方程是代数的,我们可以分析地推导出这种渐近行为:我们代入 \( G_{2 n}=(-1)^{n+1}(2 n-1) ! g_{2 n} \) ,将 \( 2 n \) th DS 方程乘以 \( x^{2 n} \) ,从 \( n=1 \) 到 \( \infty \) 求和,并定义生成函数 \( u(x) \equiv x g_{2}+x^{3} g_{4}+x^{5} g_{6}+\cdots \) 。 \( u(x) \) 满足的微分方程是非线性的: \[u^{\prime \prime}(x)=3 u^{\prime}(x) u(x)-u^{3}(x)-x \] 其中 \( u(0)=0 \) 和 \( u^{\prime}(0)=G_{2} \) 。我们通过代入 \( u(x)=-y^{\prime}(x) / y(x) \) 线性化 (5) 并得到 \( y^{\prime \prime \prime}(x)=x y(x) \) ,其中 \( y(0)=1, y^{\prime}(0)=0, y^{\prime \prime}(0)=-G_{2} \) 。满足这些初始条件的精确解是 \[y(x)=\frac{2 \sqrt{2}}{\Gamma(1 / 4)} \int_{0}^{\infty} d t \cos (x t) e^{-t^{4} / 4} . -\] 如果 \( y(x)=0 \) ,则生成函数 \( u(x) \) 变为无穷大,因此 \( |x| \) 的最小值 \( y(x)=0 \) 是 \( u(x) \) 级数的收敛半径。一个简单的图显示 \( y(x) \) 在 \( x_{0}= \pm 2.4419682 \ldots \) [9] 处消失。因此, \( r=1 / x_{0}=0.409506 \ldots \) 证实了 (4)。 +\] 如果 \( y(x)=0 \) ,生成函数 \( u(x) \) 变为无穷大,因此 \( |x| \) 的最小值 \( y(x)=0 \) 是 \( u(x) \) 级数的收敛半径。一个简单的图显示 \( y(x) \) 在 \( x_{0}= \pm 2.4419682 \ldots \) [9] 处消失。因此, \( r=1 / x_{0}=0.409506 \ldots \) 证实了 (4)。 (4) 中的渐近行为表明 \( G_{2 n} \) 比 \( \gamma_{2 n} \) 增长得更快,因为 \( n \rightarrow \infty \) : \[ \gamma_{2 n}=\frac{\int_{-\infty}^{\infty} d x x^{2 n} e^{-x^{4} / 4}}{\int_{-\infty}^{\infty} d x e^{-x^{4} / 4}} \sim 2^{n} \frac{\Gamma(n / 2+1 / 4)}{\Gamma(1 / 4)} . diff --git a/test/fmt2_google.tex b/test/fmt2_google.tex index ebc166a..a8d4560 100644 --- a/test/fmt2_google.tex +++ b/test/fmt2_google.tex @@ -6,7 +6,7 @@ 因为当 $D=0$ 时 DS 方程是代数的,我们可以分析地推导出这种渐近行为:我们代入 $G_{2 n}=(-1)^{n+1}(2 n-1) ! g_{2 n}$ ,将 $2 n$ th DS 方程乘以 $x^{2 n}$ ,从 $n=1$ 到 $\infty$ 求和,并定义生成函数 $u(x) \equiv x g_2+x^3 g_4+x^5 g_6+\cdots$ 。 $u(x)$ 满足的微分方程是非线性的: $$u^{\prime \prime}(x)=3 u^{\prime}(x) u(x)-u^3(x)-x $$ 其中 $u(0)=0$ 和 $u^{\prime}(0)=G_2$ 。我们通过代入 $u(x)=-y^{\prime}(x) / y(x)$ 线性化 (5) 并得到 $y^{\prime \prime \prime}(x)=x y(x)$ ,其中 $y(0)=1, y^{\prime}(0)=0, y^{\prime \prime}(0)=-G_2$ 。满足这些初始条件的精确解是 $$y(x)=\frac{2 \sqrt{2}}{\Gamma(1 / 4)} \int_0^{\infty} d t \cos (x t) e^{-t^4 / 4} . -$$ 如果 $y(x)=0$ ,则生成函数 $u(x)$ 变为无穷大,因此 $|x|$ 的最小值 $y(x)=0$ 是 $u(x)$ 级数的收敛半径。一个简单的图显示 $y(x)$ 在 $x_0= \pm 2.4419682 \ldots$ 处消失。 [9].因此, $r=1 / x_0=0.409506 \ldots$ 证实了 (4)。 +$$ 如果 $y(x)=0$ ,生成函数 $u(x)$ 变为无穷大,因此 $|x|$ 的最小值 $y(x)=0$ 是 $u(x)$ 级数的收敛半径。一个简单的图显示 $y(x)$ 在 $x_0= \pm 2.4419682 \ldots$ 处消失。 [9].因此, $r=1 / x_0=0.409506 \ldots$ 证实了 (4)。 (4) 中的渐近行为表明 $G_{2 n}$ 比 $\gamma_{2 n}$ 增长得更快,因为 $n \rightarrow \infty$ : $$ \gamma_{2 n}=\frac{\int_{-\infty}^{\infty} d x x^{2 n} e^{-x^4 / 4}}{\int_{-\infty}^{\infty} d x e^{-x^4 / 4}} \sim 2^n \frac{\Gamma(n / 2+1 / 4)}{\Gamma(1 / 4)} . diff --git a/test/process.py b/test/process.py index 5311dc6..e01a8c8 100644 --- a/test/process.py +++ b/test/process.py @@ -1,43 +1,23 @@ import mathtranslate +mathtranslate.process_latex.test_environment = False +import mathtranslate.process_latex as pl import re -old = r""" -This is a paragraph with some L_a_T_e_X environments: -\begin{enumerate} -\item First item -\item Second item -\end{enumerate} -And some math: $E=mc^2$ and $$\int_0^\infty e^{-x^2} dx = \frac{\sqrt{\pi}}{2}$$ -$ H \psi = E \psi$ -$ H \psi = E \psi$ -$ H \psi = E \psi$ -$ H \psi = E \psi$ -$ H \psi = E \psi$ -$ H \psi = E \psi$ -$ H \psi = E \psi$ -$ H \psi = E \psi$ -$ H \psi = E \psi$ -$ H \psi = E \psi$ -""" -new = r""" -THIS IS A PARAGRAPH WITH SOME L\_A\_T\_E\_X ENVIRONMENTS: -\begin{enumerate} -\item First item -\item Second item -\end{enumerate} -AND SOME MATH: $E=mc^2$ AND $$\int_0^\infty e^{-x^2} dx = \frac{\sqrt{\pi}}{2}$$ -??? -$ H \psi = E \psi$ -$ H \psi = E \psi$ -$ H \psi = E \psi$ -$ H \psi = E \psi$ -$ H \psi = E \psi$ -$ H \psi = E \psi$ -$ H \psi = E \psi$ -$ H \psi = E \psi$ -$ H \psi = E \psi$ -""" -text1, eqs = mathtranslate.process_latex.replace_latex_envs(old) +old = open("./old.txt").read() +new = open("./new.txt").read() +text1, eqs = mathtranslate.process_latex.replace_latex_objects(old) text2 = re.sub(r'XMATHX_2(?![\d_])', 'XMATHX_2_2', text1) text3 = text2.upper() -text4 = mathtranslate.process_latex.recover_latex_envs(text3, eqs) +text4 = mathtranslate.process_latex.recover_latex_objects(text3, eqs) assert text4 == new + +old_special = r'\\ \ \& \%' +new_special = r' \\ \ \& \% ' +intermediate = pl.replace_special(old_special) +assert intermediate.count(mathtranslate.config.math_code) == 4 +assert pl.recover_special(intermediate) == new_special + +old_accent = r'\^{o} \^o \"{o} \"o' +new_accent = r'\^{o} \^{o} \"{o} \"{o}' +intermediate = pl.replace_accent(old_accent) +assert intermediate.count(mathtranslate.config.math_code) == 4 +assert pl.recover_accent(pl.replace_accent(old_accent)) == new_accent