Skip to content

Commit

Permalink
Merge pull request #22 from SUSYUSTC/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
SUSYUSTC authored Mar 30, 2023
2 parents 4d1dad0 + f10ea80 commit 105d205
Show file tree
Hide file tree
Showing 7 changed files with 129 additions and 64 deletions.
2 changes: 1 addition & 1 deletion mathtranslate/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "2.1.3"
__version__ = "2.1.4"
__author__ = "Jiace Sun"

import os
Expand Down
6 changes: 6 additions & 0 deletions mathtranslate/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,9 @@ def reread():
tencent_secret_key = read_variable(tencent_secret_key_path, tencent_secret_key_default)

math_code = 'XMATHX'

# test_environment is True exactly when a path named TEST exists under ROOT;
# the notice is printed only in that case.
test_environment = os.path.exists(f'{ROOT}/TEST')
if test_environment:
    print('This is a test environment!')
99 changes: 91 additions & 8 deletions mathtranslate/process_latex.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,40 @@
import re
import regex
from .config import math_code
from .config import math_code, test_environment

match_code = r"(" + math_code + r"_\d+(?:_\d+)*)"
match_code_replace = math_code + r"_(\d+(?:_\d+)*)*"
pattern_env = r"\\begin\{(.*?)\}(.*?)\\end\{\1\}" # \begin{xxx} \end{xxx}, group 1: name, group 2: content

# Regular-expression building blocks for locating LaTeX constructs.
# NOTE(review): pattern_command_full and pattern_brace rely on sub-pattern
# recursion ((?3), (?0)), which the stdlib `re` module does not provide;
# presumably they are compiled with the third-party `regex` module imported
# above -- confirm at the call sites.

# The optional group 2 only accepts options made of letters, whitespace and
# commas; it is None when the environment has no such option.
pattern_env = r"\\begin\{(.*?)\}(\[[a-zA-Z\s,]*?\])?(.*?)\\end\{\1\}" # \begin{xxx} \end{xxx}, group 1: name, group 2: option, group 3: content
# Group 3 is the braced argument including its braces; (?3) recurses into it so
# that nested {...} stay balanced, and group 4 is the text between the braces.
pattern_command_full = r"\\([a-zA-Z]+\*?)(\[[a-zA-Z\s,]*?\])?(\{((?:[^{}]++|(?3))++)\})" # \xxx[xxx]{xxx} and \xxx{xxx}, group 1: name, group 2: option, group 4: content
pattern_command_simple = r"\\([a-zA-Z]+)" # \xxx, group 1: name
# (?0) recurses into the whole pattern, again to keep nested braces balanced.
pattern_brace = r"\{((?:[^{}]++|(?0))++)\}" # {xxx}, group 1: content
pattern_theorem = r"\\newtheorem\{(.+?)\}" # \newtheorem{xxx}, group 1: name
# Group 2 holds the letter when it is written in braces (\"{o}), group 3 when
# it is bare (\"o); only a single ASCII letter is accepted in either form.
pattern_accent = r"\\([`'\"^~=.])(?:\{([a-zA-Z])\}|([a-zA-Z]))" # match special characters with accents, group 1: accent, group 2/3: normal character
# The doubled braces are f-string escapes: the resulting regex contains {2},
# i.e. exactly two uppercase letters after math_code.
match_code_accent = rf'{math_code}([A-Z]{{2}})([a-zA-Z])' # group 1: accent name, group 2: normal character
# replace_special / recover_special walk this list in order, so '\\' (the
# two-backslash sequence) is handled before the single-character escapes.
list_special = ['\\', '%', '&', '#', '$', '{', '}', ' '] # all special characters in form of \x

# Two-letter codes for the characters that can follow a backslash in LaTeX
# (escapes such as \% and accent marks such as \").  The codes are appended to
# math_code to build placeholders, so each one has to be unique.
special_character_forward = {
    '\\': 'BS',
    '%': 'PC',
    '&': 'AD',
    '#': 'NB',
    '$': 'DL',
    '{': 'LB',
    '}': 'RB',
    '^': 'UT',
    ' ': 'SP',
    '`': 'BQ',
    '~': 'TD',
    "'": 'SQ',
    '"': 'DQ',
    '=': 'EQ',
    '.': 'DT',
    '*': 'ST',
    '@': 'AT',
}
# Inverse lookup: two-letter code -> original character.
special_character_backward = {
    code: char for char, code in special_character_forward.items()
}
# A duplicated code would collapse two entries into one in the inverse mapping.
assert len(special_character_backward) == len(special_character_forward)


def variable_code(count):
Expand Down Expand Up @@ -44,8 +71,8 @@ def modify_after(text):

def replace_latex_objects(text):
r"""
Replaces all LaTeX objects in a given text with the format "XMATH_{digit1}_{digit2}_..._{digit_last}",
applies a given function to the resulting text (excluding the "XMATH_{digit1}_{digit2}_..._{digit_last}" parts),
Replaces all LaTeX objects in a given text with the format "{math_code}_{digit1}_{digit2}_..._{digit_last}",
applies a given function to the resulting text (excluding the "{math_code}_{digit1}_{digit2}_..._{digit_last}" parts),
and returns both the processed text and a list of replaced LaTeX objects.
Supported LaTeX objects: \[ xxx \], \begin{xxx} \end{xxx}, $$ $$,
$ $, \( xxx \), \xxx[xxx]{xxx}, \xxx{xxx}, and \xxx.
Expand All @@ -64,7 +91,7 @@ def replace_latex_objects(text):
pattern_brace, # {xxx}
]

# iterate through each LaTeX object and replace with "XMATH_{digit1}_{digit2}_..._{digit_last}"
# iterate through each LaTeX object and replace with "{math_code}_{digit1}_{digit2}_..._{digit_last}"
count = 0
replaced_objs = []
for regex_symbol in latex_obj_regex:
Expand All @@ -89,7 +116,8 @@ def get_obj(digit_str):
if index < nobjs:
return replaced_objs[index]
else:
#assert final
if test_environment:
assert final
return '???'

text = modify_text(text, modify_after)
Expand Down Expand Up @@ -143,10 +171,13 @@ def process_specific_env(latex, function, env_names):
def process_function(match):
# \begin{env_name} content \end{env_name}
env_name = match.group(1)
content = match.group(2)
options = match.group(2)
if options is None:
options = ''
content = match.group(3)
if env_name in env_names:
processed_content = function(content)
return rf'\begin{{{env_name}}}{processed_content}\end{{{env_name}}}'
return rf'\begin{{{env_name}}}{options}{processed_content}\end{{{env_name}}}'
else:
return match.group(0)
return pattern.sub(process_function, latex)
Expand Down Expand Up @@ -214,3 +245,55 @@ def is_complete(latex_code):
return False

return True


def get_theorems(text):
    r"""Return the names declared with ``\newtheorem{name}`` in *text*.

    The names are returned as a list, in order of appearance; the list is
    empty when *text* contains no match for ``pattern_theorem``.
    """
    # pattern_theorem has a single capture group, so findall yields exactly
    # the captured names.
    return re.compile(pattern_theorem, re.DOTALL).findall(text)


def replace_special(text):
    r"""Replace backslash-escaped special characters with placeholders.

    For every character in ``list_special`` (processed in list order), each
    occurrence of ``\<char>`` becomes ``math_code`` followed by the
    character's two-letter code, with one space on each side.  Returns the
    rewritten text.
    """
    for char in list_special:
        # add space around
        placeholder = ' ' + math_code + special_character_forward[char] + ' '
        text = text.replace('\\' + char, placeholder)
    return text


def recover_special(text):
    r"""Turn special-character placeholders back into ``\<char>`` escapes.

    Every occurrence of ``math_code`` followed by the two-letter code of a
    character in ``list_special`` is replaced by the backslash escape.  Only
    the placeholder itself is replaced; surrounding whitespace is left as is.
    Returns the rewritten text.
    """
    for char in list_special:
        placeholder = math_code + special_character_forward[char]
        text = text.replace(placeholder, '\\' + char)
    return text


def replace_accent(text):
    r"""Replace LaTeX accent commands with placeholders.

    Each match of ``pattern_accent`` (for example ``\"{o}`` or ``\"o``) becomes
    ``math_code`` + the accent's two-letter code + the accented letter, with no
    surrounding spaces.  Returns the rewritten text.
    """
    def encode(match):
        accent, braced, bare = match.group(1, 2, 3)
        # The letter is captured either in braces or bare, never both.
        assert (braced is None) != (bare is None)
        letter = bare if braced is None else braced
        # do not add space around
        return f'{math_code}{special_character_forward[accent]}{letter}'

    return re.sub(pattern_accent, encode, text)


def recover_accent(text):
    r"""Turn accent placeholders back into LaTeX accent commands.

    Every match of ``match_code_accent`` (``math_code`` + two uppercase
    letters + one letter) whose two-letter code is a key of
    ``special_character_backward`` is rewritten as ``\<accent>{<letter>}``;
    the braced form is always used.

    A match whose two-letter code is not a known key is returned unchanged
    instead of raising ``KeyError``: ``match_code_accent`` accepts any pair of
    uppercase letters, and one unexpected token should not abort the whole
    recovery pass.

    Returns the rewritten text.
    """
    def restore(match):
        code, letter = match.group(1, 2)
        accent = special_character_backward.get(code)
        if accent is None:
            # Unknown code: keep the original text rather than guessing.
            return match.group(0)
        return '\\' + accent + '{' + letter + '}'

    return re.sub(match_code_accent, restore, text)
26 changes: 11 additions & 15 deletions mathtranslate/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,12 @@ def translate_latex_commands(self, latex_original, names, complete):

def translate_full_latex(self, latex_original, loadmain=False):
latex_original = process_latex.remove_tex_comments(latex_original)

latex_original = process_latex.replace_accent(latex_original)
latex_original = process_latex.replace_special(latex_original)

complete = process_latex.is_complete(latex_original)
theorems = process_latex.get_theorems(latex_original)
if complete:
print('It is a full latex document')
latex_original, tex_begin, tex_end = process_latex.split_latex_document(latex_original, r'\begin{document}', r'\end{document}')
Expand All @@ -163,13 +168,6 @@ def translate_full_latex(self, latex_original, loadmain=False):
tex_begin = default_begin
tex_end = default_end

# It is difficult for regex to exclude \{ during match so I replace it to something else and then replace back
latex_original = latex_original.replace(r'\{', f'{math_code}LB')
latex_original = latex_original.replace(r'\}', f'{math_code}RB')
# The following two can probably be put somewhere else
latex_original = latex_original.replace(r'\%', f'{math_code}PC')
latex_original = latex_original.replace(r'\ ', f'{math_code}SP')

if loadmain:
latex_translated = open('text_after_main.txt').read()
else:
Expand All @@ -183,31 +181,29 @@ def translate_full_latex(self, latex_original, loadmain=False):
latex_translated_paragraphs.append(latex_translated_paragraph)
print(num, '/', len(latex_original_paragraphs))
num += 1

latex_translated = '\n\n'.join(latex_translated_paragraphs)

if self.debug:
print(latex_translated, file=open('text_after_main.txt', 'w'))

# TODO: add more here
environment_list = ['abstract', 'acknowledgments', 'itemize', 'enumerate', 'description', 'list']
# addition: ['theorem', 'proposition', 'conjecture', 'lemma', 'claim', 'fact', 'corollary', 'remark', 'definition', 'example', 'proof']
environment_list = ['abstract', 'acknowledgments', 'itemize', 'enumerate', 'description', 'list', 'proof']
print('processing latex environments')
latex_translated = self.translate_latex_env(latex_translated, environment_list, complete)
latex_translated = self.translate_latex_env(latex_translated, environment_list + theorems, complete)

command_list = ['section', 'subsection', 'subsubsection', 'caption', 'subcaption', 'footnote', 'paragraph']
print('processing latex commands')
latex_translated = self.translate_latex_commands(latex_translated, command_list, complete)

latex_translated = latex_translated.replace(f'{math_code}LB', r'\{')
latex_translated = latex_translated.replace(f'{math_code}RB', r'\}')
latex_translated = latex_translated.replace(f'{math_code}PC', r'\%')
latex_translated = latex_translated.replace(f'{math_code}SP', r'\ ')

latex_translated = tex_begin + '\n' + latex_translated + '\n' + tex_end

# Title is probably outside the body part
print('processing title')
latex_translated = self.translate_latex_commands(latex_translated, ['title'], complete)

latex_translated = process_latex.recover_special(latex_translated)
latex_translated = process_latex.recover_accent(latex_translated)

self.close()
return latex_translated
2 changes: 1 addition & 1 deletion test/fmt1_google.tex
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

因为当 \( D=0 \) 时 DS 方程是代数的,我们可以分析地推导出这种渐近行为:我们代入 \( G_{2 n}=(-1)^{n+1}(2 n-1) ! g_{2 n} \) ,将 \( 2 n \) th DS 方程乘以 \( x^{2 n} \) ,从 \( n=1 \)\( \infty \) 求和,并定义生成函数 \( u(x) \equiv x g_{2}+x^{3} g_{4}+x^{5} g_{6}+\cdots \)\( u(x) \) 满足的微分方程是非线性的: \[u^{\prime \prime}(x)=3 u^{\prime}(x) u(x)-u^{3}(x)-x
\] 其中 \( u(0)=0 \)\( u^{\prime}(0)=G_{2} \) 。我们通过代入 \( u(x)=-y^{\prime}(x) / y(x) \) 线性化 (5) 并得到 \( y^{\prime \prime \prime}(x)=x y(x) \) ,其中 \( y(0)=1, y^{\prime}(0)=0, y^{\prime \prime}(0)=-G_{2} \) 。满足这些初始条件的精确解是 \[y(x)=\frac{2 \sqrt{2}}{\Gamma(1 / 4)} \int_{0}^{\infty} d t \cos (x t) e^{-t^{4} / 4} .
\] 如果 \( y(x)=0 \)则生成函数 \( u(x) \) 变为无穷大,因此 \( |x| \) 的最小值 \( y(x)=0 \)\( u(x) \) 级数的收敛半径。一个简单的图显示 \( y(x) \)\( x_{0}= \pm 2.4419682 \ldots \) [9] 处消失。因此, \( r=1 / x_{0}=0.409506 \ldots \) 证实了 (4)。
\] 如果 \( y(x)=0 \)生成函数 \( u(x) \) 变为无穷大,因此 \( |x| \) 的最小值 \( y(x)=0 \)\( u(x) \) 级数的收敛半径。一个简单的图显示 \( y(x) \)\( x_{0}= \pm 2.4419682 \ldots \) [9] 处消失。因此, \( r=1 / x_{0}=0.409506 \ldots \) 证实了 (4)。

(4) 中的渐近行为表明 \( G_{2 n} \)\( \gamma_{2 n} \) 增长得更快,因为 \( n \rightarrow \infty \) : \[
\gamma_{2 n}=\frac{\int_{-\infty}^{\infty} d x x^{2 n} e^{-x^{4} / 4}}{\int_{-\infty}^{\infty} d x e^{-x^{4} / 4}} \sim 2^{n} \frac{\Gamma(n / 2+1 / 4)}{\Gamma(1 / 4)} .
Expand Down
2 changes: 1 addition & 1 deletion test/fmt2_google.tex
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

因为当 $D=0$ 时 DS 方程是代数的,我们可以分析地推导出这种渐近行为:我们代入 $G_{2 n}=(-1)^{n+1}(2 n-1) ! g_{2 n}$ ,将 $2 n$ th DS 方程乘以 $x^{2 n}$ ,从 $n=1$$\infty$ 求和,并定义生成函数 $u(x) \equiv x g_2+x^3 g_4+x^5 g_6+\cdots$$u(x)$ 满足的微分方程是非线性的: $$u^{\prime \prime}(x)=3 u^{\prime}(x) u(x)-u^3(x)-x
$$ 其中 $u(0)=0$$u^{\prime}(0)=G_2$ 。我们通过代入 $u(x)=-y^{\prime}(x) / y(x)$ 线性化 (5) 并得到 $y^{\prime \prime \prime}(x)=x y(x)$ ,其中 $y(0)=1, y^{\prime}(0)=0, y^{\prime \prime}(0)=-G_2$ 。满足这些初始条件的精确解是 $$y(x)=\frac{2 \sqrt{2}}{\Gamma(1 / 4)} \int_0^{\infty} d t \cos (x t) e^{-t^4 / 4} .
$$ 如果 $y(x)=0$则生成函数 $u(x)$ 变为无穷大,因此 $|x|$ 的最小值 $y(x)=0$$u(x)$ 级数的收敛半径。一个简单的图显示 $y(x)$$x_0= \pm 2.4419682 \ldots$ 处消失。 [9].因此, $r=1 / x_0=0.409506 \ldots$ 证实了 (4)。
$$ 如果 $y(x)=0$生成函数 $u(x)$ 变为无穷大,因此 $|x|$ 的最小值 $y(x)=0$$u(x)$ 级数的收敛半径。一个简单的图显示 $y(x)$$x_0= \pm 2.4419682 \ldots$ 处消失。 [9].因此, $r=1 / x_0=0.409506 \ldots$ 证实了 (4)。

(4) 中的渐近行为表明 $G_{2 n}$$\gamma_{2 n}$ 增长得更快,因为 $n \rightarrow \infty$ : $$
\gamma_{2 n}=\frac{\int_{-\infty}^{\infty} d x x^{2 n} e^{-x^4 / 4}}{\int_{-\infty}^{\infty} d x e^{-x^4 / 4}} \sim 2^n \frac{\Gamma(n / 2+1 / 4)}{\Gamma(1 / 4)} .
Expand Down
56 changes: 18 additions & 38 deletions test/process.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,23 @@
import mathtranslate
mathtranslate.process_latex.test_environment = False
import mathtranslate.process_latex as pl
import re
old = r"""
This is a paragraph with some L_a_T_e_X environments:
\begin{enumerate}
\item First item
\item Second item
\end{enumerate}
And some math: $E=mc^2$ and $$\int_0^\infty e^{-x^2} dx = \frac{\sqrt{\pi}}{2}$$
$ H \psi = E \psi$
$ H \psi = E \psi$
$ H \psi = E \psi$
$ H \psi = E \psi$
$ H \psi = E \psi$
$ H \psi = E \psi$
$ H \psi = E \psi$
$ H \psi = E \psi$
$ H \psi = E \psi$
$ H \psi = E \psi$
"""
new = r"""
THIS IS A PARAGRAPH WITH SOME L\_A\_T\_E\_X ENVIRONMENTS:
\begin{enumerate}
\item First item
\item Second item
\end{enumerate}
AND SOME MATH: $E=mc^2$ AND $$\int_0^\infty e^{-x^2} dx = \frac{\sqrt{\pi}}{2}$$
???
$ H \psi = E \psi$
$ H \psi = E \psi$
$ H \psi = E \psi$
$ H \psi = E \psi$
$ H \psi = E \psi$
$ H \psi = E \psi$
$ H \psi = E \psi$
$ H \psi = E \psi$
$ H \psi = E \psi$
"""
text1, eqs = mathtranslate.process_latex.replace_latex_envs(old)
old = open("./old.txt").read()
new = open("./new.txt").read()
text1, eqs = mathtranslate.process_latex.replace_latex_objects(old)
text2 = re.sub(r'XMATHX_2(?![\d_])', 'XMATHX_2_2', text1)
text3 = text2.upper()
text4 = mathtranslate.process_latex.recover_latex_envs(text3, eqs)
text4 = mathtranslate.process_latex.recover_latex_objects(text3, eqs)
assert text4 == new

# Round trip for backslash escapes.  replace_special pads every placeholder
# with spaces and recover_special does not remove that padding, so the
# expected result differs from the input.
old_special = r'\\ \ \& \%'
new_special = r' \\ \ \& \% '
intermediate = pl.replace_special(old_special)
# one placeholder per escape in old_special
assert intermediate.count(mathtranslate.config.math_code) == 4
assert pl.recover_special(intermediate) == new_special

# Round trip for accents.  The bare form (\^o) is expected to come back in the
# braced form (\^{o}).
old_accent = r'\^{o} \^o \"{o} \"o'
new_accent = r'\^{o} \^{o} \"{o} \"{o}'
intermediate = pl.replace_accent(old_accent)
# one placeholder per accent in old_accent
assert intermediate.count(mathtranslate.config.math_code) == 4
assert pl.recover_accent(pl.replace_accent(old_accent)) == new_accent

0 comments on commit 105d205

Please sign in to comment.