Skip to content

Commit

Permalink
deploy: a3c8310
Browse files Browse the repository at this point in the history
  • Loading branch information
HYLcool committed Jan 5, 2024
1 parent dbca01d commit dbb7756
Show file tree
Hide file tree
Showing 110 changed files with 2,599 additions and 1,468 deletions.
2 changes: 1 addition & 1 deletion .buildinfo
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: d12e13dcdf1e8a0872e01cb8a7af3cff
config: 067252fbe52d6e8b25f23a199d3d796f
tags: 645f666f9bcd5a90fca523b33c5a78b7
Binary file modified .doctrees/data_juicer.analysis.doctree
Binary file not shown.
Binary file modified .doctrees/data_juicer.config.doctree
Binary file not shown.
Binary file modified .doctrees/data_juicer.core.doctree
Binary file not shown.
Binary file modified .doctrees/data_juicer.doctree
Binary file not shown.
Binary file modified .doctrees/data_juicer.format.doctree
Binary file not shown.
Binary file modified .doctrees/data_juicer.ops.common.doctree
Binary file not shown.
Binary file modified .doctrees/data_juicer.ops.deduplicator.doctree
Binary file not shown.
Binary file modified .doctrees/data_juicer.ops.doctree
Binary file not shown.
Binary file modified .doctrees/data_juicer.ops.filter.doctree
Binary file not shown.
Binary file modified .doctrees/data_juicer.ops.mapper.doctree
Binary file not shown.
Binary file modified .doctrees/data_juicer.ops.selector.doctree
Binary file not shown.
Binary file modified .doctrees/data_juicer.tools.doctree
Binary file not shown.
Binary file modified .doctrees/data_juicer.utils.doctree
Binary file not shown.
Binary file modified .doctrees/environment.pickle
Binary file not shown.
Binary file modified .doctrees/index.doctree
Binary file not shown.
Binary file modified .doctrees/modules.doctree
Binary file not shown.
40 changes: 26 additions & 14 deletions _modules/data_juicer/analysis/column_wise_analysis.html
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
<!DOCTYPE html>
<html class="writer-html5" lang="en" >
<html class="writer-html5" lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>data_juicer.analysis.column_wise_analysis &mdash; data_juicer 0.1.2 documentation</title>
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<title>data_juicer.analysis.column_wise_analysis &mdash; data_juicer 0.1.3 documentation</title>
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=19f00094" />


<!--[if lt IE 9]>
<script src="../../../_static/js/html5shiv.min.js"></script>
<![endif]-->

<script data-url_root="../../../" id="documentation_options" src="../../../_static/documentation_options.js?v=b0099a1c"></script>
<script data-url_root="../../../" id="documentation_options" src="../../../_static/documentation_options.js?v=14711e05"></script>
<script src="../../../_static/doctools.js?v=888ff710"></script>
<script src="../../../_static/sphinx_highlight.js?v=4825356b"></script>
<script src="../../../_static/js/theme.js"></script>
Expand All @@ -30,7 +32,7 @@
data_juicer
</a>
<div class="version">
0.1.2
0.1.3
</div>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
Expand Down Expand Up @@ -74,6 +76,7 @@ <h1>Source code for data_juicer.analysis.column_wise_analysis</h1><div class="hi

<span class="kn">import</span> <span class="nn">matplotlib.pyplot</span> <span class="k">as</span> <span class="nn">plt</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
<span class="kn">from</span> <span class="nn">tqdm</span> <span class="kn">import</span> <span class="n">tqdm</span>

<span class="kn">from</span> <span class="nn">data_juicer.utils.constant</span> <span class="kn">import</span> <span class="n">Fields</span>

Expand Down Expand Up @@ -150,14 +153,15 @@ <h1>Source code for data_juicer.analysis.column_wise_analysis</h1><div class="hi

<span class="bp">self</span><span class="o">.</span><span class="n">save_stats_in_one_file</span> <span class="o">=</span> <span class="n">save_stats_in_one_file</span></div>

<div class="viewcode-block" id="ColumnWiseAnalysis.analyse"><a class="viewcode-back" href="../../../data_juicer.analysis.html#data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis.analyse">[docs]</a> <span class="k">def</span> <span class="nf">analyse</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">show_percentiles</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">show</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<div class="viewcode-block" id="ColumnWiseAnalysis.analyse"><a class="viewcode-back" href="../../../data_juicer.analysis.html#data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis.analyse">[docs]</a> <span class="k">def</span> <span class="nf">analyse</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">show_percentiles</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">show</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">skip_export</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Apply analysis and draw the analysis figure for stats.</span>

<span class="sd"> :param show_percentiles: whether to show the percentile line in</span>
<span class="sd"> each sub-figure. If it&#39;s true, there will be several red</span>
<span class="sd"> lines to indicate the quantiles of the stats distributions</span>
<span class="sd"> :param show: whether to show in a single window after drawing</span>
<span class="sd"> :param skip_export: whether to skip saving the results to disk</span>
<span class="sd"> :return:</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># number of sub-figures for each stat. There are histogram and box plot</span>
Expand All @@ -182,8 +186,11 @@ <h1>Source code for data_juicer.analysis.column_wise_analysis</h1><div class="hi
<span class="n">fig</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">figure</span><span class="p">(</span><span class="n">figsize</span><span class="o">=</span><span class="p">(</span><span class="n">rec_width</span><span class="p">,</span> <span class="n">rec_height</span><span class="p">),</span>
<span class="n">layout</span><span class="o">=</span><span class="s1">&#39;constrained&#39;</span><span class="p">)</span>
<span class="n">subfigs</span> <span class="o">=</span> <span class="n">fig</span><span class="o">.</span><span class="n">subfigures</span><span class="p">(</span><span class="n">rec_row</span><span class="p">,</span> <span class="n">rec_col</span><span class="p">,</span> <span class="n">wspace</span><span class="o">=</span><span class="mf">0.01</span><span class="p">)</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">column_name</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">columns</span><span class="p">):</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">column_name</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">tqdm</span><span class="p">(</span><span class="n">columns</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
<span class="n">desc</span><span class="o">=</span><span class="s1">&#39;Column&#39;</span><span class="p">)):</span>
<span class="n">data</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">stats</span><span class="p">[</span><span class="n">column_name</span><span class="p">]</span>
<span class="c1"># explode data to flatten inner list</span>
<span class="n">data</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">explode</span><span class="p">()</span><span class="o">.</span><span class="n">infer_objects</span><span class="p">()</span>
<span class="n">grid</span> <span class="o">=</span> <span class="n">grid_indexes</span><span class="p">[</span><span class="n">i</span><span class="p">]</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">save_stats_in_one_file</span><span class="p">:</span>
<span class="k">if</span> <span class="n">rec_col</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
Expand All @@ -199,7 +206,8 @@ <h1>Source code for data_juicer.analysis.column_wise_analysis</h1><div class="hi

<span class="c1"># numeric or string via nan. Apply different plot method for them.</span>
<span class="k">if</span> <span class="n">pd</span><span class="o">.</span><span class="n">isna</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">overall_result</span><span class="p">[</span><span class="n">column_name</span><span class="p">]</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;top&#39;</span><span class="p">)):</span>
<span class="c1"># numeric -- draw histogram and box plot for this stat</span>
<span class="c1"># numeric or numeric list -- draw histogram and box plot for</span>
<span class="c1"># this stat</span>
<span class="n">percentiles</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">overall_result</span><span class="p">[</span><span class="n">column_name</span><span class="p">]</span> \
<span class="k">if</span> <span class="n">show_percentiles</span> <span class="k">else</span> <span class="kc">None</span>

Expand All @@ -223,15 +231,18 @@ <h1>Source code for data_juicer.analysis.column_wise_analysis</h1><div class="hi
<span class="sa">f</span><span class="s1">&#39;</span><span class="si">{</span><span class="n">column_name</span><span class="si">}</span><span class="s1">-box.png&#39;</span><span class="p">),</span>
<span class="n">percentiles</span><span class="o">=</span><span class="n">percentiles</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># object (string) -- only draw histogram for this stat</span>
<span class="c1"># object (string) or string list -- only draw histogram for</span>
<span class="c1"># this stat</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">save_stats_in_one_file</span><span class="p">:</span>
<span class="n">axes</span> <span class="o">=</span> <span class="n">subfig</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">axes</span> <span class="o">=</span> <span class="kc">None</span>

<span class="bp">self</span><span class="o">.</span><span class="n">draw_hist</span><span class="p">(</span>
<span class="n">axes</span><span class="p">,</span> <span class="n">data</span><span class="p">,</span>
<span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">output_path</span><span class="p">,</span> <span class="sa">f</span><span class="s1">&#39;</span><span class="si">{</span><span class="n">column_name</span><span class="si">}</span><span class="s1">-hist.png&#39;</span><span class="p">))</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">skip_export</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">draw_hist</span><span class="p">(</span>
<span class="n">axes</span><span class="p">,</span> <span class="n">data</span><span class="p">,</span>
<span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">output_path</span><span class="p">,</span>
<span class="sa">f</span><span class="s1">&#39;</span><span class="si">{</span><span class="n">column_name</span><span class="si">}</span><span class="s1">-hist.png&#39;</span><span class="p">))</span>

<span class="c1"># add a title to the figure of this stat</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">save_stats_in_one_file</span><span class="p">:</span>
Expand All @@ -241,7 +252,8 @@ <h1>Source code for data_juicer.analysis.column_wise_analysis</h1><div class="hi

<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">save_stats_in_one_file</span><span class="p">:</span>
<span class="n">fig</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">gcf</span><span class="p">()</span>
<span class="n">fig</span><span class="o">.</span><span class="n">savefig</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">output_path</span><span class="p">,</span> <span class="s1">&#39;all-stats.png&#39;</span><span class="p">))</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">skip_export</span><span class="p">:</span>
<span class="n">fig</span><span class="o">.</span><span class="n">savefig</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">output_path</span><span class="p">,</span> <span class="s1">&#39;all-stats.png&#39;</span><span class="p">))</span>
<span class="k">if</span> <span class="n">show</span><span class="p">:</span>
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
Expand Down
14 changes: 8 additions & 6 deletions _modules/data_juicer/analysis/diversity_analysis.html
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
<!DOCTYPE html>
<html class="writer-html5" lang="en" >
<html class="writer-html5" lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>data_juicer.analysis.diversity_analysis &mdash; data_juicer 0.1.2 documentation</title>
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
<title>data_juicer.analysis.diversity_analysis &mdash; data_juicer 0.1.3 documentation</title>
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=19f00094" />


<!--[if lt IE 9]>
<script src="../../../_static/js/html5shiv.min.js"></script>
<![endif]-->

<script data-url_root="../../../" id="documentation_options" src="../../../_static/documentation_options.js?v=b0099a1c"></script>
<script data-url_root="../../../" id="documentation_options" src="../../../_static/documentation_options.js?v=14711e05"></script>
<script src="../../../_static/doctools.js?v=888ff710"></script>
<script src="../../../_static/sphinx_highlight.js?v=4825356b"></script>
<script src="../../../_static/js/theme.js"></script>
Expand All @@ -30,7 +32,7 @@
data_juicer
</a>
<div class="version">
0.1.2
0.1.3
</div>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
Expand Down
Loading

0 comments on commit dbb7756

Please sign in to comment.