From d2507f1cbd47126d41b335760066c1ba552ae505 Mon Sep 17 00:00:00 2001 From: "Documenter.jl" Date: Sat, 2 Mar 2024 21:44:43 +0000 Subject: [PATCH] build based on ba1f4d2 --- dev/.documenter-siteinfo.json | 2 +- dev/assets/documenter.js | 923 +++++++++++++++----------- dev/assets/themes/documenter-dark.css | 2 +- dev/examples.html | 2 +- dev/index.html | 2 +- dev/objects.inv | Bin 0 -> 1251 bytes dev/reading.html | 6 +- dev/writing.html | 2 +- 8 files changed, 550 insertions(+), 389 deletions(-) create mode 100644 dev/objects.inv diff --git a/dev/.documenter-siteinfo.json b/dev/.documenter-siteinfo.json index fa11cf10..2f1f6c53 100644 --- a/dev/.documenter-siteinfo.json +++ b/dev/.documenter-siteinfo.json @@ -1 +1 @@ -{"documenter":{"julia_version":"1.10.1","generation_timestamp":"2024-02-28T04:23:59","documenter_version":"1.2.1"}} \ No newline at end of file +{"documenter":{"julia_version":"1.10.1","generation_timestamp":"2024-03-02T21:44:37","documenter_version":"1.3.0"}} \ No newline at end of file diff --git a/dev/assets/documenter.js b/dev/assets/documenter.js index f5311607..c6562b55 100644 --- a/dev/assets/documenter.js +++ b/dev/assets/documenter.js @@ -4,7 +4,6 @@ requirejs.config({ 'highlight-julia': 'https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/languages/julia.min', 'headroom': 'https://cdnjs.cloudflare.com/ajax/libs/headroom/0.12.0/headroom.min', 'jqueryui': 'https://cdnjs.cloudflare.com/ajax/libs/jqueryui/1.13.2/jquery-ui.min', - 'minisearch': 'https://cdn.jsdelivr.net/npm/minisearch@6.1.0/dist/umd/index.min', 'katex-auto-render': 'https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.16.8/contrib/auto-render.min', 'jquery': 'https://cdnjs.cloudflare.com/ajax/libs/jquery/3.7.0/jquery.min', 'headroom-jquery': 'https://cdnjs.cloudflare.com/ajax/libs/headroom/0.12.0/jQuery.headroom.min', @@ -103,9 +102,10 @@ $(document).on("click", ".docstring header", function () { }); }); -$(document).on("click", ".docs-article-toggle-button", function () { +$(document).on("click", ".docs-article-toggle-button", function (event) { let articleToggleTitle = "Expand docstring"; let navArticleToggleTitle = "Expand all docstrings"; + let animationSpeed = event.noToggleAnimation ? 0 : 400; debounce(() => { if (isExpanded) { @@ -116,7 +116,7 @@ $(document).on("click", ".docs-article-toggle-button", function () { isExpanded = false; - $(".docstring section").slideUp(); + $(".docstring section").slideUp(animationSpeed); } else { $(this).removeClass("fa-chevron-down").addClass("fa-chevron-up"); $(".docstring-article-toggle-button") @@ -127,7 +127,7 @@ $(document).on("click", ".docs-article-toggle-button", function () { articleToggleTitle = "Collapse docstring"; navArticleToggleTitle = "Collapse all docstrings"; - $(".docstring section").slideDown(); + $(".docstring section").slideDown(animationSpeed); } $(this).prop("title", navArticleToggleTitle); @@ -224,224 +224,465 @@ $(document).ready(function () { }) //////////////////////////////////////////////////////////////////////////////// -require(['jquery', 'minisearch'], function($, minisearch) { - -// In general, most search related things will have "search" as a prefix. 
-// To get an in-depth about the thought process you can refer: https://hetarth02.hashnode.dev/series/gsoc +require(['jquery'], function($) { -let results = []; -let timer = undefined; +$(document).ready(function () { + let meta = $("div[data-docstringscollapsed]").data(); -let data = documenterSearchIndex["docs"].map((x, key) => { - x["id"] = key; // minisearch requires a unique for each object - return x; + if (meta?.docstringscollapsed) { + $("#documenter-article-toggle-button").trigger({ + type: "click", + noToggleAnimation: true, + }); + } }); -// list below is the lunr 2.1.3 list minus the intersect with names(Base) -// (all, any, get, in, is, only, which) and (do, else, for, let, where, while, with) -// ideally we'd just filter the original list but it's not available as a variable -const stopWords = new Set([ - "a", - "able", - "about", - "across", - "after", - "almost", - "also", - "am", - "among", - "an", - "and", - "are", - "as", - "at", - "be", - "because", - "been", - "but", - "by", - "can", - "cannot", - "could", - "dear", - "did", - "does", - "either", - "ever", - "every", - "from", - "got", - "had", - "has", - "have", - "he", - "her", - "hers", - "him", - "his", - "how", - "however", - "i", - "if", - "into", - "it", - "its", - "just", - "least", - "like", - "likely", - "may", - "me", - "might", - "most", - "must", - "my", - "neither", - "no", - "nor", - "not", - "of", - "off", - "often", - "on", - "or", - "other", - "our", - "own", - "rather", - "said", - "say", - "says", - "she", - "should", - "since", - "so", - "some", - "than", - "that", - "the", - "their", - "them", - "then", - "there", - "these", - "they", - "this", - "tis", - "to", - "too", - "twas", - "us", - "wants", - "was", - "we", - "were", - "what", - "when", - "who", - "whom", - "why", - "will", - "would", - "yet", - "you", - "your", -]); - -let index = new minisearch({ - fields: ["title", "text"], // fields to index for full-text search - storeFields: ["location", "title", "text", "category", "page"], // fields to return with search results - processTerm: (term) => { - let word = stopWords.has(term) ? null : term; - if (word) { - // custom trimmer that doesn't strip @ and !, which are used in julia macro and function names - word = word - .replace(/^[^a-zA-Z0-9@!]+/, "") - .replace(/[^a-zA-Z0-9@!]+$/, ""); - } +}) +//////////////////////////////////////////////////////////////////////////////// +require(['jquery'], function($) { - return word ?? null; - }, - // add . as a separator, because otherwise "title": "Documenter.Anchors.add!", would not find anything if searching for "add!", only for the entire qualification - tokenize: (string) => string.split(/[\s\-\.]+/), - // options which will be applied during the search - searchOptions: { - boost: { title: 100 }, - fuzzy: 2, +/* +To get an in-depth about the thought process you can refer: https://hetarth02.hashnode.dev/series/gsoc + +PSEUDOCODE: + +Searching happens automatically as the user types or adjusts the selected filters. +To preserve responsiveness, as much as possible of the slow parts of the search are done +in a web worker. Searching and result generation are done in the worker, and filtering and +DOM updates are done in the main thread. The filters are in the main thread as they should +be very quick to apply. 
This lets filters be changed without re-searching with minisearch +(which is possible even if filtering is on the worker thread) and also lets filters be +changed _while_ the worker is searching and without message passing (neither of which are +possible if filtering is on the worker thread) + +SEARCH WORKER: + +Import minisearch + +Build index + +On message from main thread + run search + find the first 200 unique results from each category, and compute their divs for display + note that this is necessary and sufficient information for the main thread to find the + first 200 unique results from any given filter set + post results to main thread + +MAIN: + +Launch worker + +Declare nonconstant globals (worker_is_running, last_search_text, unfiltered_results) + +On text update + if worker is not running, launch_search() + +launch_search + set worker_is_running to true, set last_search_text to the search text + post the search query to worker + +on message from worker + if last_search_text is not the same as the text in the search field, + the latest search result is not reflective of the latest search query, so update again + launch_search() + otherwise + set worker_is_running to false + + regardless, display the new search results to the user + save the unfiltered_results as a global + update_search() + +on filter click + adjust the filter selection + update_search() + +update_search + apply search filters by looping through the unfiltered_results and finding the first 200 + unique results that match the filters + + Update the DOM +*/ + +/////// SEARCH WORKER /////// + +function worker_function(documenterSearchIndex, documenterBaseURL, filters) { + importScripts( + "https://cdn.jsdelivr.net/npm/minisearch@6.1.0/dist/umd/index.min.js" + ); + + let data = documenterSearchIndex.map((x, key) => { + x["id"] = key; // minisearch requires a unique for each object + return x; + }); + + // list below is the lunr 2.1.3 list minus the intersect with names(Base) + // (all, any, get, in, is, only, which) and (do, else, for, let, where, while, with) + // ideally we'd just filter the original list but it's not available as a variable + const stopWords = new Set([ + "a", + "able", + "about", + "across", + "after", + "almost", + "also", + "am", + "among", + "an", + "and", + "are", + "as", + "at", + "be", + "because", + "been", + "but", + "by", + "can", + "cannot", + "could", + "dear", + "did", + "does", + "either", + "ever", + "every", + "from", + "got", + "had", + "has", + "have", + "he", + "her", + "hers", + "him", + "his", + "how", + "however", + "i", + "if", + "into", + "it", + "its", + "just", + "least", + "like", + "likely", + "may", + "me", + "might", + "most", + "must", + "my", + "neither", + "no", + "nor", + "not", + "of", + "off", + "often", + "on", + "or", + "other", + "our", + "own", + "rather", + "said", + "say", + "says", + "she", + "should", + "since", + "so", + "some", + "than", + "that", + "the", + "their", + "them", + "then", + "there", + "these", + "they", + "this", + "tis", + "to", + "too", + "twas", + "us", + "wants", + "was", + "we", + "were", + "what", + "when", + "who", + "whom", + "why", + "will", + "would", + "yet", + "you", + "your", + ]); + + let index = new MiniSearch({ + fields: ["title", "text"], // fields to index for full-text search + storeFields: ["location", "title", "text", "category", "page"], // fields to return with results processTerm: (term) => { let word = stopWords.has(term) ? 
null : term; if (word) { + // custom trimmer that doesn't strip @ and !, which are used in julia macro and function names word = word .replace(/^[^a-zA-Z0-9@!]+/, "") .replace(/[^a-zA-Z0-9@!]+$/, ""); + + word = word.toLowerCase(); } return word ?? null; }, + // add . as a separator, because otherwise "title": "Documenter.Anchors.add!", would not + // find anything if searching for "add!", only for the entire qualification tokenize: (string) => string.split(/[\s\-\.]+/), - }, -}); + // options which will be applied during the search + searchOptions: { + prefix: true, + boost: { title: 100 }, + fuzzy: 2, + }, + }); -index.addAll(data); + index.addAll(data); + + /** + * Used to map characters to HTML entities. + * Refer: https://github.com/lodash/lodash/blob/main/src/escape.ts + */ + const htmlEscapes = { + "&": "&", + "<": "<", + ">": ">", + '"': """, + "'": "'", + }; + + /** + * Used to match HTML entities and HTML characters. + * Refer: https://github.com/lodash/lodash/blob/main/src/escape.ts + */ + const reUnescapedHtml = /[&<>"']/g; + const reHasUnescapedHtml = RegExp(reUnescapedHtml.source); + + /** + * Escape function from lodash + * Refer: https://github.com/lodash/lodash/blob/main/src/escape.ts + */ + function escape(string) { + return string && reHasUnescapedHtml.test(string) + ? string.replace(reUnescapedHtml, (chr) => htmlEscapes[chr]) + : string || ""; + } -let filters = [...new Set(data.map((x) => x.category))]; -var modal_filters = make_modal_body_filters(filters); -var filter_results = []; + /** + * Make the result component given a minisearch result data object and the value + * of the search input as queryString. To view the result object structure, refer: + * https://lucaong.github.io/minisearch/modules/_minisearch_.html#searchresult + * + * @param {object} result + * @param {string} querystring + * @returns string + */ + function make_search_result(result, querystring) { + let search_divider = `
`; + let display_link = + result.location.slice(Math.max(0), Math.min(50, result.location.length)) + + (result.location.length > 30 ? "..." : ""); // To cut-off the link because it messes with the overflow of the whole div + + if (result.page !== "") { + display_link += ` (${result.page})`; + } -$(document).on("keyup", ".documenter-search-input", function (event) { - // Adding a debounce to prevent disruptions from super-speed typing! - debounce(() => update_search(filter_results), 300); + let textindex = new RegExp(`${querystring}`, "i").exec(result.text); + let text = + textindex !== null + ? result.text.slice( + Math.max(textindex.index - 100, 0), + Math.min( + textindex.index + querystring.length + 100, + result.text.length + ) + ) + : ""; // cut-off text before and after from the match + + text = text.length ? escape(text) : ""; + + let display_result = text.length + ? "..." + + text.replace( + new RegExp(`${escape(querystring)}`, "i"), // For first occurrence + '$&' + ) + + "..." + : ""; // highlights the match + + let in_code = false; + if (!["page", "section"].includes(result.category.toLowerCase())) { + in_code = true; + } + + // We encode the full url to escape some special characters which can lead to broken links + let result_div = ` + +
+
${escape(result.title)}
+
${result.category}
+
+

+ ${display_result} +

+
+ ${display_link} +
+
+ ${search_divider} + `; + + return result_div; + } + + self.onmessage = function (e) { + let query = e.data; + let results = index.search(query, { + filter: (result) => { + // Only return relevant results + return result.score >= 1; + }, + }); + + // Pre-filter to deduplicate and limit to 200 per category to the extent + // possible without knowing what the filters are. + let filtered_results = []; + let counts = {}; + for (let filter of filters) { + counts[filter] = 0; + } + let present = {}; + + for (let result of results) { + cat = result.category; + cnt = counts[cat]; + if (cnt < 200) { + id = cat + "---" + result.location; + if (present[id]) { + continue; + } + present[id] = true; + filtered_results.push({ + location: result.location, + category: cat, + div: make_search_result(result, query), + }); + } + } + + postMessage(filtered_results); + }; +} + +// `worker = Threads.@spawn worker_function(documenterSearchIndex)`, but in JavaScript! +const filters = [ + ...new Set(documenterSearchIndex["docs"].map((x) => x.category)), +]; +const worker_str = + "(" + + worker_function.toString() + + ")(" + + JSON.stringify(documenterSearchIndex["docs"]) + + "," + + JSON.stringify(documenterBaseURL) + + "," + + JSON.stringify(filters) + + ")"; +const worker_blob = new Blob([worker_str], { type: "text/javascript" }); +const worker = new Worker(URL.createObjectURL(worker_blob)); + +/////// SEARCH MAIN /////// + +// Whether the worker is currently handling a search. This is a boolean +// as the worker only ever handles 1 or 0 searches at a time. +var worker_is_running = false; + +// The last search text that was sent to the worker. This is used to determine +// if the worker should be launched again when it reports back results. +var last_search_text = ""; + +// The results of the last search. This, in combination with the state of the filters +// in the DOM, is used compute the results to display on calls to update_search. +var unfiltered_results = []; + +// Which filter is currently selected +var selected_filter = ""; + +$(document).on("input", ".documenter-search-input", function (event) { + if (!worker_is_running) { + launch_search(); + } }); +function launch_search() { + worker_is_running = true; + last_search_text = $(".documenter-search-input").val(); + worker.postMessage(last_search_text); +} + +worker.onmessage = function (e) { + if (last_search_text !== $(".documenter-search-input").val()) { + launch_search(); + } else { + worker_is_running = false; + } + + unfiltered_results = e.data; + update_search(); +}; + $(document).on("click", ".search-filter", function () { if ($(this).hasClass("search-filter-selected")) { - $(this).removeClass("search-filter-selected"); + selected_filter = ""; } else { - $(this).addClass("search-filter-selected"); + selected_filter = $(this).text().toLowerCase(); } - // Adding a debounce to prevent disruptions from crazy clicking! - debounce(() => get_filters(), 300); + // This updates search results and toggles classes for UI: + update_search(); }); -/** - * A debounce function, takes a function and an optional timeout in milliseconds - * - * @function callback - * @param {number} timeout - */ -function debounce(callback, timeout = 300) { - clearTimeout(timer); - timer = setTimeout(callback, timeout); -} - /** * Make/Update the search component - * - * @param {string[]} selected_filters */ -function update_search(selected_filters = []) { - let initial_search_body = ` -
Type something to get started!
- `; - +function update_search() { let querystring = $(".documenter-search-input").val(); if (querystring.trim()) { - results = index.search(querystring, { - filter: (result) => { - // Filtering results - if (selected_filters.length === 0) { - return result.score >= 1; - } else { - return ( - result.score >= 1 && selected_filters.includes(result.category) - ); - } - }, - }); + if (selected_filter == "") { + results = unfiltered_results; + } else { + results = unfiltered_results.filter((result) => { + return selected_filter == result.category.toLowerCase(); + }); + } let search_result_container = ``; + let modal_filters = make_modal_body_filters(); let search_divider = `
`; if (results.length) { @@ -449,19 +690,23 @@ function update_search(selected_filters = []) { let count = 0; let search_results = ""; - results.forEach(function (result) { - if (result.location) { - // Checking for duplication of results for the same page - if (!links.includes(result.location)) { - search_results += make_search_result(result, querystring); - count++; - } - + for (var i = 0, n = results.length; i < n && count < 200; ++i) { + let result = results[i]; + if (result.location && !links.includes(result.location)) { + search_results += result.div; + count++; links.push(result.location); } - }); + } - let result_count = `
${count} result(s)
`; + if (count == 1) { + count_str = "1 result"; + } else if (count == 200) { + count_str = "200+ results"; + } else { + count_str = count + " results"; + } + let result_count = `
${count_str}
`; search_result_container = `
@@ -490,125 +735,37 @@ function update_search(selected_filters = []) { $(".search-modal-card-body").html(search_result_container); } else { - filter_results = []; - modal_filters = make_modal_body_filters(filters, filter_results); - if (!$(".search-modal-card-body").hasClass("is-justify-content-center")) { $(".search-modal-card-body").addClass("is-justify-content-center"); } - $(".search-modal-card-body").html(initial_search_body); + $(".search-modal-card-body").html(` +
Type something to get started!
+ `); } } /** * Make the modal filter html * - * @param {string[]} filters - * @param {string[]} selected_filters * @returns string */ -function make_modal_body_filters(filters, selected_filters = []) { - let str = ``; - - filters.forEach((val) => { - if (selected_filters.includes(val)) { - str += `${val}`; - } else { - str += `${val}`; - } - }); +function make_modal_body_filters() { + let str = filters + .map((val) => { + if (selected_filter == val.toLowerCase()) { + return `${val}`; + } else { + return `${val}`; + } + }) + .join(""); - let filter_html = ` + return `
Filters: ${str} -
- `; - - return filter_html; -} - -/** - * Make the result component given a minisearch result data object and the value of the search input as queryString. - * To view the result object structure, refer: https://lucaong.github.io/minisearch/modules/_minisearch_.html#searchresult - * - * @param {object} result - * @param {string} querystring - * @returns string - */ -function make_search_result(result, querystring) { - let search_divider = `
`; - let display_link = - result.location.slice(Math.max(0), Math.min(50, result.location.length)) + - (result.location.length > 30 ? "..." : ""); // To cut-off the link because it messes with the overflow of the whole div - - if (result.page !== "") { - display_link += ` (${result.page})`; - } - - let textindex = new RegExp(`\\b${querystring}\\b`, "i").exec(result.text); - let text = - textindex !== null - ? result.text.slice( - Math.max(textindex.index - 100, 0), - Math.min( - textindex.index + querystring.length + 100, - result.text.length - ) - ) - : ""; // cut-off text before and after from the match - - let display_result = text.length - ? "..." + - text.replace( - new RegExp(`\\b${querystring}\\b`, "i"), // For first occurrence - '$&' - ) + - "..." - : ""; // highlights the match - - let in_code = false; - if (!["page", "section"].includes(result.category.toLowerCase())) { - in_code = true; - } - - // We encode the full url to escape some special characters which can lead to broken links - let result_div = ` - -
-
${result.title}
-
${result.category}
-
-

- ${display_result} -

-
- ${display_link} -
-
- ${search_divider} - `; - - return result_div; -} - -/** - * Get selected filters, remake the filter html and lastly update the search modal - */ -function get_filters() { - let ele = $(".search-filters .search-filter-selected").get(); - filter_results = ele.map((x) => $(x).text().toLowerCase()); - modal_filters = make_modal_body_filters(filters, filter_results); - update_search(filter_results); +
`; } }) @@ -635,103 +792,107 @@ $(document).ready(function () { //////////////////////////////////////////////////////////////////////////////// require(['jquery'], function($) { -let search_modal_header = ` - -`; - -let initial_search_body = ` -
Type something to get started!
-`; - -let search_modal_footer = ` - -`; - -$(document.body).append( - ` - +file = CSV.File(IOBuffer(data); pool=(0.5, 2)) diff --git a/dev/index.html b/dev/index.html index 4598a6c1..c6a97237 100644 --- a/dev/index.html +++ b/dev/index.html @@ -1,2 +1,2 @@ -Home · CSV.jl

CSV.jl Documentation

GitHub Repo: https://github.com/JuliaData/CSV.jl

Welcome to CSV.jl! A pure-Julia package for handling delimited text data, be it comma-delimited (csv), tab-delimited (tsv), or otherwise.

Installation

You can install CSV by typing the following in the Julia REPL:

] add CSV 

followed by

using CSV

to load the package.

Overview

To start out, let's discuss the high-level functionality provided by the package, which hopefully will help direct you to more specific documentation for your use-case:

  • CSV.File: the most commonly used function for ingesting delimited data; will read an entire data input or vector of data inputs, detecting number of columns and rows, along with the type of data for each column. Returns a CSV.File object, which is like a lightweight table/DataFrame. Assuming file is a variable of a CSV.File object, individual columns can be accessed like file.col1, file[:col1], or file["col"]. You can see parsed column names via file.names. A CSV.File can also be iterated, where a CSV.Row is produced on each iteration, which allows access to each value in the row via row.col1, row[:col1], or row[1]. You can also index a CSV.File directly, like file[1] to return the entire CSV.Row at the provided index/row number. Multiple threads will be used while parsing the input data if the input is large enough, and full return column buffers to hold the parsed data will be allocated. CSV.File satisfies the Tables.jl "source" interface, and so can be passed to valid sink functions like DataFrame, SQLite.load!, Arrow.write, etc. Supports a number of keyword arguments to control parsing, column type, and other file metadata options.
  • CSV.read: a convenience function identical to CSV.File, but used when a CSV.File will be passed directly to a sink function, like a DataFrame. In some cases, sinks may make copies of incoming data for their own safety; by calling CSV.read(file, DataFrame), no copies of the parsed CSV.File will be made, and the DataFrame will take direct ownership of the CSV.File's columns, which is more efficient than doing CSV.File(file) |> DataFrame which will result in an extra copy of each column being made. Keyword arguments are identical to CSV.File. Any valid Tables.jl sink function/table type can be passed as the 2nd argument. Like CSV.File, a vector of data inputs can be passed as the 1st argument, which will result in a single "long" table of all the inputs vertically concatenated. Each input must have identical schemas (column names and types).
  • CSV.Rows: an alternative approach for consuming delimited data, where the input is only consumed one row at a time, which allows "streaming" the data with a lower memory footprint than CSV.File. Supports many of the same options as CSV.File, except column type handling is a little different. By default, every column type will be essentially Union{Missing, String}, i.e. no automatic type detection is done, but column types can be provided manually. Multithreading is not used while parsing. After constructing a CSV.Rows object, rows can be "streamed" by iterating, where each iteration produces a CSV.Row2 object, which operates similarly to CSV.File's CSV.Row type, where individual row values can be accessed via row.col1, row[:col1], or row[1]. If each row is processed individually, additional memory can be saved by passing reusebuffer=true, which means a single buffer will be allocated to hold the values of only the currently iterated row. CSV.Rows also supports the Tables.jl interface and can likewise be passed to valid sink functions.
  • CSV.Chunks: similar to CSV.File, but allows passing a ntasks::Integer keyword argument which will cause the input file to be "chunked" up into ntasks number of chunks. After constructing a CSV.Chunks object, each iteration of the object will return a CSV.File of the next parsed chunk. Useful for processing extremely large files in "chunks". Because each iterated element is a valid Tables.jl "source", CSV.Chunks satisfies the Tables.partitions interface, so sinks that can process input partitions can operate by passing CSV.Chunks as the "source".
  • CSV.write: A valid Tables.jl "sink" function for writing any valid input table out in a delimited text format. Supports many options for controlling the output like delimiter, quote characters, etc. Writes data to an internal buffer, which is flushed out when full; the buffer size is configurable. Also supports writing out partitioned inputs as separate output files, one file per input partition. To write out a DataFrame, for example, it's simply CSV.write("data.csv", df), or to write out a matrix, it's using Tables; CSV.write("data.csv", Tables.table(mat))
  • CSV.RowWriter: An alternative way to produce csv output; takes any valid Tables.jl input, and on each iteration, produces a single csv-formatted string from the current row of the input table.

That's quite a bit! Let's boil down a TL;DR:

  • Just want to read a delimited file or collection of files and do basic stuff with data? Use CSV.File(file) or CSV.read(file, DataFrame)
  • Don't need the data as a whole or want to stream through a large file row-by-row? Use CSV.Rows.
  • Want to process a large file in "batches"/chunks? Use CSV.Chunks.
  • Need to produce a csv? Use CSV.write.
  • Want to iterate an input table and produce a single csv string per row? CSV.RowWriter.

For the rest of the manual, we're going to have two big sections, Reading and Writing where we'll walk through the various options to CSV.File/CSV.read/CSV.Rows/CSV.Chunks and CSV.write/CSV.RowWriter.

+Home · CSV.jl

CSV.jl Documentation

GitHub Repo: https://github.com/JuliaData/CSV.jl

Welcome to CSV.jl! A pure-Julia package for handling delimited text data, be it comma-delimited (csv), tab-delimited (tsv), or otherwise.

Installation

You can install CSV by typing the following in the Julia REPL:

] add CSV 

followed by

using CSV

to load the package.

Overview

To start out, let's discuss the high-level functionality provided by the package, which hopefully will help direct you to more specific documentation for your use-case:

  • CSV.File: the most commonly used function for ingesting delimited data; will read an entire data input or vector of data inputs, detecting number of columns and rows, along with the type of data for each column. Returns a CSV.File object, which is like a lightweight table/DataFrame. Assuming file is a variable of a CSV.File object, individual columns can be accessed like file.col1, file[:col1], or file["col"]. You can see parsed column names via file.names. A CSV.File can also be iterated, where a CSV.Row is produced on each iteration, which allows access to each value in the row via row.col1, row[:col1], or row[1]. You can also index a CSV.File directly, like file[1] to return the entire CSV.Row at the provided index/row number. Multiple threads will be used while parsing the input data if the input is large enough, and full return column buffers to hold the parsed data will be allocated. CSV.File satisfies the Tables.jl "source" interface, and so can be passed to valid sink functions like DataFrame, SQLite.load!, Arrow.write, etc. Supports a number of keyword arguments to control parsing, column type, and other file metadata options.
  • CSV.read: a convenience function identical to CSV.File, but used when a CSV.File will be passed directly to a sink function, like a DataFrame. In some cases, sinks may make copies of incoming data for their own safety; by calling CSV.read(file, DataFrame), no copies of the parsed CSV.File will be made, and the DataFrame will take direct ownership of the CSV.File's columns, which is more efficient than doing CSV.File(file) |> DataFrame which will result in an extra copy of each column being made. Keyword arguments are identical to CSV.File. Any valid Tables.jl sink function/table type can be passed as the 2nd argument. Like CSV.File, a vector of data inputs can be passed as the 1st argument, which will result in a single "long" table of all the inputs vertically concatenated. Each input must have identical schemas (column names and types).
  • CSV.Rows: an alternative approach for consuming delimited data, where the input is only consumed one row at a time, which allows "streaming" the data with a lower memory footprint than CSV.File. Supports many of the same options as CSV.File, except column type handling is a little different. By default, every column type will be essentially Union{Missing, String}, i.e. no automatic type detection is done, but column types can be provided manually. Multithreading is not used while parsing. After constructing a CSV.Rows object, rows can be "streamed" by iterating, where each iteration produces a CSV.Row2 object, which operates similarly to CSV.File's CSV.Row type, where individual row values can be accessed via row.col1, row[:col1], or row[1]. If each row is processed individually, additional memory can be saved by passing reusebuffer=true, which means a single buffer will be allocated to hold the values of only the currently iterated row. CSV.Rows also supports the Tables.jl interface and can likewise be passed to valid sink functions.
  • CSV.Chunks: similar to CSV.File, but allows passing a ntasks::Integer keyword argument which will cause the input file to be "chunked" up into ntasks number of chunks. After constructing a CSV.Chunks object, each iteration of the object will return a CSV.File of the next parsed chunk. Useful for processing extremely large files in "chunks". Because each iterated element is a valid Tables.jl "source", CSV.Chunks satisfies the Tables.partitions interface, so sinks that can process input partitions can operate by passing CSV.Chunks as the "source".
  • CSV.write: A valid Tables.jl "sink" function for writing any valid input table out in a delimited text format. Supports many options for controlling the output like delimiter, quote characters, etc. Writes data to an internal buffer, which is flushed out when full; the buffer size is configurable. Also supports writing out partitioned inputs as separate output files, one file per input partition. To write out a DataFrame, for example, it's simply CSV.write("data.csv", df), or to write out a matrix, it's using Tables; CSV.write("data.csv", Tables.table(mat))
  • CSV.RowWriter: An alternative way to produce csv output; takes any valid Tables.jl input, and on each iteration, produces a single csv-formatted string from the current row of the input table.

That's quite a bit! Let's boil down a TL;DR:

  • Just want to read a delimited file or collection of files and do basic stuff with data? Use CSV.File(file) or CSV.read(file, DataFrame)
  • Don't need the data as a whole or want to stream through a large file row-by-row? Use CSV.Rows.
  • Want to process a large file in "batches"/chunks? Use CSV.Chunks.
  • Need to produce a csv? Use CSV.write.
  • Want to iterate an input table and produce a single csv string per row? CSV.RowWriter.

For the rest of the manual, we're going to have two big sections, Reading and Writing where we'll walk through the various options to CSV.File/CSV.read/CSV.Rows/CSV.Chunks and CSV.write/CSV.RowWriter.
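In the meantime, here's a minimal sketch tying the TL;DR together (the file names data.csv and data_out.csv and the column layout are hypothetical; any delimited input works the same way):

using CSV, DataFrames

# read a whole file into a DataFrame (the DataFrame takes ownership of the parsed columns)
df = CSV.read("data.csv", DataFrame)

# stream a large file row-by-row with a small memory footprint
for row in CSV.Rows("data.csv"; reusebuffer=true)
    println(row[1])   # access values by index, name, or Symbol
end

# write any Tables.jl-compatible table back out as csv
CSV.write("data_out.csv", df)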

diff --git a/dev/objects.inv b/dev/objects.inv new file mode 100644 index 0000000000000000000000000000000000000000..a87b2f3f3ac492f2cfef280eca4bd06f8055bc18 GIT binary patch literal 1251 [base85-encoded binary data omitted] literal 0 HcmV?d00001 diff --git a/dev/reading.html b/dev/reading.html index c14d1729..fddfa084 100644 --- a/dev/reading.html +++ b/dev/reading.html @@ -18,7 +18,7 @@ │ String1 String1 String1 ─────┼─────────────────────────── 1 │ a b c - 2 │ 1 2 3

Arguments

File layout options:

  • header=1: how column names should be determined; if given as an Integer, indicates the row to parse for column names; as an AbstractVector{<:Integer}, indicates a set of rows to be concatenated together as column names; Vector{Symbol} or Vector{String} give column names explicitly (should match # of columns in dataset); if a dataset doesn't have column names, either provide them as a Vector, or set header=0 or header=false and column names will be auto-generated (Column1, Column2, etc.). Note that if a row number header and comment or ignoreemptyrows are provided, the header row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the header row will actually be the next non-commented row.
  • normalizenames::Bool=false: whether column names should be "normalized" into valid Julia identifier symbols; useful when using the tbl.col1 getproperty syntax or iterating rows and accessing column values of a row via getproperty (e.g. row.col1)
  • skipto::Integer: specifies the row where the data starts in the csv file; by default, the next row after the header row(s) is used. If header=0, then the 1st row is assumed to be the start of data; providing a skipto argument does not affect the header argument. Note that if a row number skipto and comment or ignoreemptyrows are provided, the data row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the data row will actually be the next non-commented row.
  • footerskip::Integer: number of rows at the end of a file to skip parsing. Do note that commented rows (see the comment keyword argument) do not count towards the row number provided for footerskip, they are completely ignored by the parser
  • transpose::Bool: read a csv file "transposed", i.e. each column is parsed as a row
  • comment::String: string that will cause rows that begin with it to be skipped while parsing. Note that if a row number header or skipto and comment are provided, the header/data row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the header/data row will actually be the next non-commented row.
  • ignoreemptyrows::Bool=true: whether empty rows in a file should be ignored (if false, each column will be assigned missing for that empty row)
  • select: an AbstractVector of Integer, Symbol, String, or Bool, or a "selector" function of the form (i, name) -> keep::Bool; only columns in the collection or for which the selector function returns true will be parsed and accessible in the resulting CSV.File. Invalid values in select are ignored.
  • drop: inverse of select; an AbstractVector of Integer, Symbol, String, or Bool, or a "drop" function of the form (i, name) -> drop::Bool; columns in the collection or for which the drop function returns true will be ignored in the resulting CSV.File. Invalid values in drop are ignored.
  • limit: an Integer to indicate a limited number of rows to parse in a csv file; use in combination with skipto to read a specific, contiguous chunk within a file; note for large files when multiple threads are used for parsing, the limit argument may not result in an exact # of rows parsed; use ntasks=1 to ensure an exact limit if necessary
  • buffer_in_memory: a Bool, default false, which controls whether a Cmd, IO, or gzipped source will be read/decompressed in memory vs. using a temporary file.
  • ntasks::Integer=Threads.nthreads(): [not applicable to CSV.Rows] for multithreaded parsed files, this controls the number of tasks spawned to read a file in concurrent chunks; defaults to the # of threads Julia was started with (i.e. JULIA_NUM_THREADS environment variable or julia -t N); setting ntasks=1 will avoid any calls to Threads.@spawn and just read the file serially on the main thread; a single thread will also be used for smaller files by default (< 5_000 cells)
  • rows_to_check::Integer=30: [not applicable to CSV.Rows] a multithreaded parsed file will be split up into ntasks # of equal chunks; rows_to_check controls the # of rows that are checked to ensure parsing correctly found valid rows; for certain files with very large quoted text fields, rows_to_check may need to be higher (10, 30, etc.) to ensure parsing correctly finds these rows
  • source: [only applicable for vector of inputs to CSV.File] a Symbol, String, or Pair of Symbol or String to Vector. As a single Symbol or String, provides the column name that will be added to the parsed columns; the values of the column will be the input "name" (usually the file name) of the input from which each value was parsed. As a Pair, the 2nd part of the pair should be a Vector of values matching the # of inputs, where each value will be used instead of the input name for that input's values in the auto-added column.

Parsing options:

  • missingstring: either a nothing, String, or Vector{String} to use as sentinel values that will be parsed as missing; if nothing is passed, no sentinel/missing values will be parsed; by default, missingstring="", which means only an empty field (two consecutive delimiters) is considered missing
  • delim=',': a Char or String that indicates how columns are delimited in a file; if no argument is provided, parsing will try to detect the most consistent delimiter on the first 10 rows of the file
  • ignorerepeated::Bool=false: whether repeated (consecutive/sequential) delimiters should be ignored while parsing; useful for fixed-width files with delimiter padding between cells
  • quoted::Bool=true: whether parsing should check for quotechar at the start/end of cells
  • quotechar='"', openquotechar, closequotechar: a Char (or different start and end characters) that indicate a quoted field which may contain textual delimiters or newline characters
  • escapechar='"': the Char used to escape quote characters in a quoted field
  • dateformat::Union{String, Dates.DateFormat, Nothing, AbstractDict}: a date format string to indicate how Date/DateTime columns are formatted for the entire file; if given as an AbstractDict, date format strings to indicate how the Date/DateTime columns corresponding to the keys are formatted. The Dict can map column index Int, or name Symbol or String to the format string for that column.
  • decimal='.': a Char indicating how decimals are separated in floats, i.e. 3.14 uses '.', or 3,14 uses a comma ','
  • groupmark=nothing: optionally specify a single-byte character denoting the number grouping mark; this allows parsing of numbers that have, e.g., thousands separators (1,000.00).
  • truestrings, falsestrings: Vector{String}s that indicate how true or false values are represented; by default "true", "True", "TRUE", "T", "1" are used to detect true and "false", "False", "FALSE", "F", "0" are used to detect false; note that columns with only 1 and 0 values will default to Int64 column type unless explicitly requested to be Bool via types keyword argument
  • stripwhitespace=false: if true, leading and trailing whitespace are stripped from string values, including column names

Column Type Options:

  • types: a single Type, AbstractVector or AbstractDict of types, or a function of the form (i, name) -> Union{T, Nothing} to be used for column types; if a single Type is provided, all columns will be parsed with that single type; an AbstractDict can map column index Integer, or name Symbol or String to type for a column, i.e. Dict(1=>Float64) will set the first column as a Float64, Dict(:column1=>Float64) will set the column named column1 to Float64 and, Dict("column1"=>Float64) will set the column1 to Float64; if a Vector is provided, it must match the # of columns provided or detected in header. If a function is provided, it takes a column index and name as arguments, and should return the desired column type for the column, or nothing to signal the column's type should be detected while parsing.
  • typemap::IdDict{Type, Type}: a mapping of a type that should be replaced in every instance with another type, i.e. Dict(Float64=>String) would change every detected Float64 column to be parsed as String; only "standard" types are allowed to be mapped to another type, i.e. Int64, Float64, Date, DateTime, Time, and Bool. If a column of one of those types is "detected", it will be mapped to the specified type.
  • pool::Union{Bool, Real, AbstractVector, AbstractDict, Function, Tuple{Float64, Int}}=(0.2, 500): [not supported by CSV.Rows] controls whether columns will be built as PooledArray; if true, all columns detected as String will be pooled; alternatively, the proportion of unique values below which String columns should be pooled (meaning that if the # of unique strings in a column is under 25%, pool=0.25, it will be pooled). If provided as a Tuple{Float64, Int} like (0.2, 500), it represents the percent cardinality threshold as the 1st tuple element (0.2), and an upper limit for the # of unique values (500), under which the column will be pooled; this is the default (pool=(0.2, 500)). If an AbstractVector, each element should be Bool, Real, or Tuple{Float64, Int} and the # of elements should match the # of columns in the dataset; if an AbstractDict, a Bool, Real, or Tuple{Float64, Int} value can be provided for individual columns where the dict key is given as column index Integer, or column name as Symbol or String. If a function is provided, it should take a column index and name as 2 arguments, and return a Bool, Real, Tuple{Float64, Int}, or nothing for each column.
  • downcast::Bool=false: controls whether columns detected as Int64 will be "downcast" to the smallest possible integer type like Int8, Int16, Int32, etc.
  • stringtype=InlineStrings.InlineString: controls how detected string columns will ultimately be returned; default is InlineString, which stores string data in a fixed-size primitive type that helps avoid excessive heap memory usage; if a column has values longer than 32 bytes, it will default to String. If String is passed, all string columns will just be normal String values. If PosLenString is passed, string columns will be returned as PosLenStringVector, which is a special "lazy" AbstractVector that acts as a "view" into the original file data. This can lead to the most efficient parsing times, but note that the "view" nature of PosLenStringVector makes it read-only, so operations like push!, append!, or setindex! are not supported. It also keeps a reference to the entire input dataset source, so trying to modify or delete the underlying file, for example, may fail
  • strict::Bool=false: whether invalid values should throw a parsing error or be replaced with missing
  • silencewarnings::Bool=false: if strict=false, whether invalid value warnings should be silenced
  • maxwarnings::Int=100: if more than maxwarnings number of warnings are printed while parsing, further warnings will be silenced by default; for multithreaded parsing, each parsing task will print up to maxwarnings
  • debug::Bool=false: passing true will result in many informational prints while a dataset is parsed; can be useful when reporting issues or figuring out what is going on internally while a dataset is parsed
  • validate::Bool=true: whether or not to validate that columns specified in the types, dateformat and pool keywords are actually found in the data. If false no validation is done, meaning no error will be thrown if types/dateformat/pool specify settings for columns not actually found in the data.

Iteration options:

  • reusebuffer=false: [only supported by CSV.Rows] while iterating, whether a single row buffer should be allocated and reused on each iteration; only use this if each row will be iterated over once and not re-used (e.g. it's not safe to use this option when doing collect(CSV.Rows(file)) because only the currently iterated row is "valid")
source
CSV.FileType
CSV.File(input; kwargs...) => CSV.File

Read a UTF-8 CSV input and return a CSV.File object, which is like a lightweight table/dataframe, allowing dot-access to columns and iterating rows. Satisfies the Tables.jl interface, so can be passed to any valid sink, yet to avoid unnecessary copies of data, use CSV.read(input, sink; kwargs...) instead if the CSV.File intermediate object isn't needed.

The input argument can be one of:

  • filename given as a string or FilePaths.jl type
  • a Vector{UInt8} or SubArray{UInt8, 1, Vector{UInt8}} byte buffer
  • a CodeUnits object, which wraps a String, like codeunits(str)
  • the contents of a csv-formatted string can also be passed by wrapping it in an IOBuffer, like IOBuffer(str)
  • a Cmd or other IO
  • a gzipped file (or gzipped data in any of the above), which will automatically be decompressed for parsing
  • a Vector of any of the above, which will parse and vertically concatenate each source, returning a single, "long" CSV.File

To read a csv file from a url, use the Downloads.jl stdlib or HTTP.jl package, where the resulting downloaded tempfile or HTTP.Response body can be passed like:

using Downloads, CSV
+   2 │ 1        2        3

Arguments

File layout options:

  • header=1: how column names should be determined; if given as an Integer, indicates the row to parse for column names; as an AbstractVector{<:Integer}, indicates a set of rows to be concatenated together as column names; Vector{Symbol} or Vector{String} give column names explicitly (should match # of columns in dataset); if a dataset doesn't have column names, either provide them as a Vector, or set header=0 or header=false and column names will be auto-generated (Column1, Column2, etc.). Note that if a row number header and comment or ignoreemptyrows are provided, the header row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the header row will actually be the next non-commented row.
  • normalizenames::Bool=false: whether column names should be "normalized" into valid Julia identifier symbols; useful when using the tbl.col1 getproperty syntax or iterating rows and accessing column values of a row via getproperty (e.g. row.col1)
  • skipto::Integer: specifies the row where the data starts in the csv file; by default, the next row after the header row(s) is used. If header=0, then the 1st row is assumed to be the start of data; providing a skipto argument does not affect the header argument. Note that if a row number skipto and comment or ignoreemptyrows are provided, the data row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the data row will actually be the next non-commented row.
  • footerskip::Integer: number of rows at the end of a file to skip parsing. Do note that commented rows (see the comment keyword argument) do not count towards the row number provided for footerskip, they are completely ignored by the parser
  • transpose::Bool: read a csv file "transposed", i.e. each column is parsed as a row
  • comment::String: string that will cause rows that begin with it to be skipped while parsing. Note that if a row number header or skipto and comment are provided, the header/data row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the header/data row will actually be the next non-commented row.
  • ignoreemptyrows::Bool=true: whether empty rows in a file should be ignored (if false, each column will be assigned missing for that empty row)
  • select: an AbstractVector of Integer, Symbol, String, or Bool, or a "selector" function of the form (i, name) -> keep::Bool; only columns in the collection or for which the selector function returns true will be parsed and accessible in the resulting CSV.File. Invalid values in select are ignored.
  • drop: inverse of select; an AbstractVector of Integer, Symbol, String, or Bool, or a "drop" function of the form (i, name) -> drop::Bool; columns in the collection or for which the drop function returns true will be ignored in the resulting CSV.File. Invalid values in drop are ignored.
  • limit: an Integer to indicate a limited number of rows to parse in a csv file; use in combination with skipto to read a specific, contiguous chunk within a file; note for large files when multiple threads are used for parsing, the limit argument may not result in an exact # of rows parsed; use ntasks=1 to ensure an exact limit if necessary
  • buffer_in_memory: a Bool, default false, which controls whether a Cmd, IO, or gzipped source will be read/decompressed in memory vs. using a temporary file.
  • ntasks::Integer=Threads.nthreads(): [not applicable to CSV.Rows] for multithreaded parsed files, this controls the number of tasks spawned to read a file in concurrent chunks; defaults to the # of threads Julia was started with (i.e. JULIA_NUM_THREADS environment variable or julia -t N); setting ntasks=1 will avoid any calls to Threads.@spawn and just read the file serially on the main thread; a single thread will also be used for smaller files by default (< 5_000 cells)
  • rows_to_check::Integer=30: [not applicable to CSV.Rows] a multithreaded parsed file will be split up into ntasks # of equal chunks; rows_to_check controls the # of rows that are checked to ensure parsing correctly found valid rows; for certain files with very large quoted text fields, rows_to_check may need to be higher (10, 30, etc.) to ensure parsing correctly finds these rows
  • source: [only applicable for vector of inputs to CSV.File] a Symbol, String, or Pair of Symbol or String to Vector. As a single Symbol or String, provides the column name that will be added to the parsed columns; the values of the column will be the input "name" (usually the file name) of the input from which each value was parsed. As a Pair, the 2nd part of the pair should be a Vector of values matching the # of inputs, where each value will be used instead of the input name for that input's values in the auto-added column. (A short sketch combining a few of these file layout options follows this list.)
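As a hedged sketch of how a few of these file layout options combine (the file name survey.csv and its column names are made up for illustration):

using CSV

# column names are on row 2, data starts on row 4, and only two columns are kept
file = CSV.File("survey.csv";
    header=2,
    skipto=4,
    select=[:id, :score],
    ignoreemptyrows=true)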

Parsing options:

  • missingstring: either a nothing, String, or Vector{String} to use as sentinel values that will be parsed as missing; if nothing is passed, no sentinel/missing values will be parsed; by default, missingstring="", which means only an empty field (two consecutive delimiters) is considered missing
  • delim=',': a Char or String that indicates how columns are delimited in a file; if no argument is provided, parsing will try to detect the most consistent delimiter on the first 10 rows of the file
  • ignorerepeated::Bool=false: whether repeated (consecutive/sequential) delimiters should be ignored while parsing; useful for fixed-width files with delimiter padding between cells
  • quoted::Bool=true: whether parsing should check for quotechar at the start/end of cells
  • quotechar='"', openquotechar, closequotechar: a Char (or different start and end characters) that indicate a quoted field which may contain textual delimiters or newline characters
  • escapechar='"': the Char used to escape quote characters in a quoted field
  • dateformat::Union{String, Dates.DateFormat, Nothing, AbstractDict}: a date format string to indicate how Date/DateTime columns are formatted for the entire file; if given as an AbstractDict, date format strings to indicate how the Date/DateTime columns corresponding to the keys are formatted. The Dict can map column index Int, or name Symbol or String to the format string for that column.
  • decimal='.': a Char indicating how decimals are separated in floats, i.e. 3.14 uses '.', or 3,14 uses a comma ','
  • groupmark=nothing: optionally specify a single-byte character denoting the number grouping mark; this allows parsing of numbers that have, e.g., thousands separators (1,000.00).
  • truestrings, falsestrings: Vector{String}s that indicate how true or false values are represented; by default "true", "True", "TRUE", "T", "1" are used to detect true and "false", "False", "FALSE", "F", "0" are used to detect false; note that columns with only 1 and 0 values will default to Int64 column type unless explicitly requested to be Bool via types keyword argument
  • stripwhitespace=false: if true, leading and trailing whitespace are stripped from string values, including column names
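For instance, a semicolon-delimited file that uses a comma as the decimal separator and day.month.year dates might be read by combining a few of the parsing options above (a sketch only; the file name and formats are hypothetical):

using CSV

file = CSV.File("measurements.csv";
    delim=';',
    decimal=',',                      # parse 3,14 as 3.14
    missingstring=["NA", "n/a"],      # treat these sentinels as missing
    dateformat="dd.mm.yyyy")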

Column Type Options:

  • types: a single Type, AbstractVector or AbstractDict of types, or a function of the form (i, name) -> Union{T, Nothing} to be used for column types; if a single Type is provided, all columns will be parsed with that single type; an AbstractDict can map column index Integer, or name Symbol or String to type for a column, i.e. Dict(1=>Float64) will set the first column as a Float64, Dict(:column1=>Float64) will set the column named column1 to Float64 and, Dict("column1"=>Float64) will set the column1 to Float64; if a Vector is provided, it must match the # of columns provided or detected in header. If a function is provided, it takes a column index and name as arguments, and should return the desired column type for the column, or nothing to signal the column's type should be detected while parsing.
  • typemap::IdDict{Type, Type}: a mapping of a type that should be replaced in every instance with another type, i.e. Dict(Float64=>String) would change every detected Float64 column to be parsed as String; only "standard" types are allowed to be mapped to another type, i.e. Int64, Float64, Date, DateTime, Time, and Bool. If a column of one of those types is "detected", it will be mapped to the specified type.
  • pool::Union{Bool, Real, AbstractVector, AbstractDict, Function, Tuple{Float64, Int}}=(0.2, 500): [not supported by CSV.Rows] controls whether columns will be built as PooledArray; if true, all columns detected as String will be pooled; alternatively, the proportion of unique values below which String columns should be pooled (meaning that if the # of unique strings in a column is under 25%, pool=0.25, it will be pooled). If provided as a Tuple{Float64, Int} like (0.2, 500), it represents the percent cardinality threshold as the 1st tuple element (0.2), and an upper limit for the # of unique values (500), under which the column will be pooled; this is the default (pool=(0.2, 500)). If an AbstractVector, each element should be Bool, Real, or Tuple{Float64, Int} and the # of elements should match the # of columns in the dataset; if an AbstractDict, a Bool, Real, or Tuple{Float64, Int} value can be provided for individual columns where the dict key is given as column index Integer, or column name as Symbol or String. If a function is provided, it should take a column index and name as 2 arguments, and return a Bool, Real, Tuple{Float64, Int}, or nothing for each column.
  • downcast::Bool=false: controls whether columns detected as Int64 will be "downcast" to the smallest possible integer type like Int8, Int16, Int32, etc.
  • stringtype=InlineStrings.InlineString: controls how detected string columns will ultimately be returned; default is InlineString, which stores string data in a fixed-size primitive type that helps avoid excessive heap memory usage; if a column has values longer than 32 bytes, it will default to String. If String is passed, all string columns will just be normal String values. If PosLenString is passed, string columns will be returned as PosLenStringVector, which is a special "lazy" AbstractVector that acts as a "view" into the original file data. This can lead to the most efficient parsing times, but note that the "view" nature of PosLenStringVector makes it read-only, so operations like push!, append!, or setindex! are not supported. It also keeps a reference to the entire input dataset source, so trying to modify or delete the underlying file, for example, may fail
  • strict::Bool=false: whether invalid values should throw a parsing error or be replaced with missing
  • silencewarnings::Bool=false: if strict=false, whether invalid value warnings should be silenced
  • maxwarnings::Int=100: if more than maxwarnings warnings are printed while parsing, further warnings will be silenced by default; for multithreaded parsing, each parsing task will print up to maxwarnings
  • debug::Bool=false: passing true will result in many informational prints while a dataset is parsed; can be useful when reporting issues or figuring out what is going on internally while a dataset is parsed
  • validate::Bool=true: whether or not to validate that columns specified in the types, dateformat and pool keywords are actually found in the data. If false no validation is done, meaning no error will be thrown if types/dateformat/pool specify settings for columns not actually found in the data.
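
A small sketch of a few of the column-type keywords above (the file name and column names are hypothetical):

using CSV

f = CSV.File("scores.csv";
    types = Dict(:id => Int32, :score => Float64),  # override types for two columns by name
    downcast = true,                                # shrink other detected Int64 columns
    stringtype = String,                            # return plain String columns
    pool = 0.1)                                     # pool String columns with < 10% unique values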

Iteration options:

  • reusebuffer=false: [only supported by CSV.Rows] while iterating, whether a single row buffer should be allocated and reused on each iteration; only use this if each row is accessed once during iteration and never kept around (e.g. it's not safe to use this option when doing collect(CSV.Rows(file)), because only the currently iterated row is "valid")
source
CSV.File (Type)
CSV.File(input; kwargs...) => CSV.File

Read a UTF-8 CSV input and return a CSV.File object, which is like a lightweight table/dataframe, allowing dot-access to columns and iterating rows. Satisfies the Tables.jl interface, so can be passed to any valid sink, yet to avoid unnecessary copies of data, use CSV.read(input, sink; kwargs...) instead if the CSV.File intermediate object isn't needed.
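
For instance, assuming DataFrames.jl and a hypothetical data.csv, the two approaches look like:

using CSV, DataFrames

f = CSV.File("data.csv")           # lightweight table; columns via f.col1, rows via iteration
df = DataFrame(f)                  # materialize as a DataFrame

# equivalent, but skips keeping the intermediate CSV.File around
df = CSV.read("data.csv", DataFrame)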

The input argument can be one of:

  • filename given as a string or FilePaths.jl type
  • a Vector{UInt8} or SubArray{UInt8, 1, Vector{UInt8}} byte buffer
  • a CodeUnits object, which wraps a String, like codeunits(str)
  • a csv-formatted String, which can be passed wrapped in an IOBuffer, like IOBuffer(str)
  • a Cmd or other IO
  • a gzipped file (or gzipped data in any of the above), which will automatically be decompressed for parsing
  • a Vector of any of the above, in which case each source will be parsed and vertically concatenated, returning a single, "long" CSV.File (see the sketch after this list)
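
A few of these input forms, sketched with hypothetical file names:

using CSV

f1 = CSV.File("data.csv")                      # plain file path
f2 = CSV.File(IOBuffer("a,b\n1,2\n3,4\n"))     # csv-formatted string wrapped in an IOBuffer

# vector of sources, vertically concatenated; `source` records which file each row came from
f3 = CSV.File(["jan.csv", "feb.csv"]; source = :file)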

To read a csv file from a url, use the Downloads.jl stdlib or HTTP.jl package, where the resulting downloaded tempfile or HTTP.Response body can be passed like:

using Downloads, CSV
f = CSV.File(Downloads.download(url))

# or

using HTTP, CSV
f = CSV.File(HTTP.get(url).body)

using SQLite, CSV
# load a csv file directly into an sqlite database table
db = SQLite.DB()
tbl = CSV.File(file) |> SQLite.load!(db, "sqlite_table")

Arguments

File layout options:

  • header=1: how column names should be determined; if given as an Integer, indicates the row to parse for column names; as an AbstractVector{<:Integer}, indicates a set of rows to be concatenated together as column names; Vector{Symbol} or Vector{String} give column names explicitly (should match # of columns in dataset); if a dataset doesn't have column names, either provide them as a Vector, or set header=0 or header=false and column names will be auto-generated (Column1, Column2, etc.). Note that if a row number header and comment or ignoreemptyrows are provided, the header row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the header row will actually be the next non-commented row.
  • normalizenames::Bool=false: whether column names should be "normalized" into valid Julia identifier symbols; useful when using the tbl.col1 getproperty syntax or iterating rows and accessing column values of a row via getproperty (e.g. row.col1)
  • skipto::Integer: specifies the row where the data starts in the csv file; by default, the next row after the header row(s) is used. If header=0, then the 1st row is assumed to be the start of data; providing a skipto argument does not affect the header argument. Note that if a row number skipto and comment or ignoreemptyrows are provided, the data row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the data row will actually be the next non-commented row.
  • footerskip::Integer: number of rows at the end of a file to skip parsing. Do note that commented rows (see the comment keyword argument) do not count towards the row number provided for footerskip, they are completely ignored by the parser
  • transpose::Bool: read a csv file "transposed", i.e. each column is parsed as a row
  • comment::String: string that will cause rows that begin with it to be skipped while parsing. Note that if a row number header or skipto and comment are provided, the header/data row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the header/data row will actually be the next non-commented row.
  • ignoreemptyrows::Bool=true: whether empty rows in a file should be ignored (if false, each column will be assigned missing for that empty row)
  • select: an AbstractVector of Integer, Symbol, String, or Bool, or a "selector" function of the form (i, name) -> keep::Bool; only columns in the collection or for which the selector function returns true will be parsed and accessible in the resulting CSV.File. Invalid values in select are ignored.
  • drop: inverse of select; an AbstractVector of Integer, Symbol, String, or Bool, or a "drop" function of the form (i, name) -> drop::Bool; columns in the collection or for which the drop function returns true will be ignored in the resulting CSV.File. Invalid values in drop are ignored (see the sketch after this list for a select example).
  • limit: an Integer to indicate a limited number of rows to parse in a csv file; use in combination with skipto to read a specific, contiguous chunk within a file; note for large files when multiple threads are used for parsing, the limit argument may not result in an exact # of rows parsed; use ntasks=1 to ensure an exact limit if necessary
  • buffer_in_memory: a Bool, default false, which controls whether a Cmd, IO, or gzipped source will be read/decompressed in memory vs. using a temporary file.
  • ntasks::Integer=Threads.nthreads(): [not applicable to CSV.Rows] for multithreaded parsed files, this controls the number of tasks spawned to read a file in concurrent chunks; defaults to the # of threads Julia was started with (i.e. JULIA_NUM_THREADS environment variable or julia -t N); setting ntasks=1 will avoid any calls to Threads.@spawn and just read the file serially on the main thread; a single thread will also be used for smaller files by default (< 5_000 cells)
  • rows_to_check::Integer=30: [not applicable to CSV.Rows] a multithreaded parsed file will be split up into ntasks # of equal chunks; rows_to_check controls the # of rows that are checked to ensure parsing correctly found valid rows; for certain files with very large quoted text fields, rows_to_check may need to be set higher to ensure parsing correctly finds these rows
  • source: [only applicable for vector of inputs to CSV.File] a Symbol, String, or Pair of Symbol or String to Vector. As a single Symbol or String, provides the name of a column that will be added to the parsed columns; the values of the column will be the input "name" (usually file name) of the input from which each value was parsed. As a Pair, the 2nd part of the pair should be a Vector of values matching the length of the # of inputs, where each value will be used instead of the input name for that input's values in the auto-added column.
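
A sketch of how several layout keywords combine (the file name, row numbers, and column-name prefix are invented):

using CSV

f = CSV.File("report.csv";
    header = 3,                                            # column names live on row 3
    skipto = 5,                                            # data starts on row 5
    footerskip = 1,                                        # ignore a trailing summary row
    select = (i, name) -> startswith(String(name), "x"),   # keep only columns whose names start with "x"
    limit = 10_000)                                        # parse at most 10_000 rows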

Parsing options:

  • missingstring: a nothing, String, or Vector{String} of sentinel values that will be parsed as missing; if nothing is passed, no sentinel/missing values will be parsed; by default, missingstring="", which means only an empty field (two consecutive delimiters) is considered missing (see the sketch after this list)
  • delim=',': a Char or String that indicates how columns are delimited in a file; if no argument is provided, parsing will try to detect the most consistent delimiter on the first 10 rows of the file
  • ignorerepeated::Bool=false: whether repeated (consecutive/sequential) delimiters should be ignored while parsing; useful for fixed-width files with delimiter padding between cells
  • quoted::Bool=true: whether parsing should check for quotechar at the start/end of cells
  • quotechar='"', openquotechar, closequotechar: a Char (or different start and end characters) that indicates a quoted field which may contain textual delimiters or newline characters
  • escapechar='"': the Char used to escape quote characters in a quoted field
  • dateformat::Union{String, Dates.DateFormat, Nothing, AbstractDict}: a date format string to indicate how Date/DateTime columns are formatted for the entire file; if given as an AbstractDict, date format strings to indicate how the Date/DateTime columns corresponding to the keys are formatted. The Dict can map column index Int, or name Symbol or String to the format string for that column.
  • decimal='.': a Char indicating how decimals are separated in floats, i.e. 3.14 uses '.', or 3,14 uses a comma ','
  • groupmark=nothing: optionally specify a single-byte character denoting the number grouping mark; this allows parsing of numbers that have, e.g., thousand separators (1,000.00).
  • truestrings, falsestrings: Vector{String}s that indicate how true or false values are represented; by default "true", "True", "TRUE", "T", "1" are used to detect true and "false", "False", "FALSE", "F", "0" are used to detect false; note that columns with only 1 and 0 values will default to Int64 column type unless explicitly requested to be Bool via types keyword argument
  • stripwhitespace=false: if true, leading and trailing whitespace are stripped from string values, including column names
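
For example, a whitespace-padded file that marks missing cells with NA might be read like this (a sketch; the file name is hypothetical):

using CSV

f = CSV.File("padded.txt";
    delim = ' ',             # single space as the delimiter
    ignorerepeated = true,   # collapse runs of padding spaces between cells
    missingstring = "NA")    # parse NA cells as missing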

Column Type Options:

  • types: a single Type, AbstractVector or AbstractDict of types, or a function of the form (i, name) -> Union{T, Nothing} to be used for column types; if a single Type is provided, all columns will be parsed with that single type; an AbstractDict can map column index Integer, or name Symbol or String to type for a column, i.e. Dict(1=>Float64) will set the first column as a Float64, Dict(:column1=>Float64) will set the column named column1 to Float64 and, Dict("column1"=>Float64) will set the column1 to Float64; if a Vector is provided, it must match the # of columns provided or detected in header. If a function is provided, it takes a column index and name as arguments, and should return the desired column type for the column, or nothing to signal the column's type should be detected while parsing.
  • typemap::IdDict{Type, Type}: a mapping of a type that should be replaced in every instance with another type, e.g. IdDict{Type, Type}(Float64 => String) would change every detected Float64 column to be parsed as String; only "standard" types are allowed to be mapped to another type, i.e. Int64, Float64, Date, DateTime, Time, and Bool. If a column of one of those types is "detected", it will be mapped to the specified type (see the sketch after this list).
  • pool::Union{Bool, Real, AbstractVector, AbstractDict, Function, Tuple{Float64, Int}}=(0.2, 500): [not supported by CSV.Rows] controls whether columns will be built as PooledArray; if true, all columns detected as String will be pooled; alternatively, a Real gives the proportion of unique values below which String columns should be pooled (e.g. with pool=0.25, a column is pooled if its # of unique strings is under 25% of its length). If provided as a Tuple{Float64, Int} like (0.2, 500), the 1st element (0.2) is the percent cardinality threshold and the 2nd (500) is an upper limit on the # of unique values, under which the column will be pooled; this is the default (pool=(0.2, 500)). If an AbstractVector, each element should be Bool, Real, or Tuple{Float64, Int} and the # of elements should match the # of columns in the dataset; if an AbstractDict, a Bool, Real, or Tuple{Float64, Int} value can be provided for individual columns where the dict key is given as column index Integer, or column name as Symbol or String. If a function is provided, it should take a column index and name as 2 arguments, and return a Bool, Real, Tuple{Float64, Int}, or nothing for each column.
  • downcast::Bool=false: controls whether columns detected as Int64 will be "downcast" to the smallest possible integer type like Int8, Int16, Int32, etc.
  • stringtype=InlineStrings.InlineString: controls how detected string columns will ultimately be returned; default is InlineString, which stores string data in a fixed-size primitive type that helps avoid excessive heap memory usage; if a column has values longer than 32 bytes, it will default to String. If String is passed, all string columns will just be normal String values. If PosLenString is passed, string columns will be returned as PosLenStringVector, which is a special "lazy" AbstractVector that acts as a "view" into the original file data. This can lead to the most efficient parsing times, but note that the "view" nature of PosLenStringVector makes it read-only, so operations like push!, append!, or setindex! are not supported. It also keeps a reference to the entire input dataset source, so trying to modify or delete the underlying file, for example, may fail
  • strict::Bool=false: whether invalid values should throw a parsing error or be replaced with missing
  • silencewarnings::Bool=false: if strict=false, whether invalid value warnings should be silenced
  • maxwarnings::Int=100: if more than maxwarnings warnings are printed while parsing, further warnings will be silenced by default; for multithreaded parsing, each parsing task will print up to maxwarnings
  • debug::Bool=false: passing true will result in many informational prints while a dataset is parsed; can be useful when reporting issues or figuring out what is going on internally while a dataset is parsed
  • validate::Bool=true: whether or not to validate that columns specified in the types, dateformat and pool keywords are actually found in the data. If false no validation is done, meaning no error will be thrown if types/dateformat/pool specify settings for columns not actually found in the data.
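
A sketch combining a few of these keywords (the file name is hypothetical):

using CSV

f = CSV.File("measurements.csv";
    typemap = IdDict{Type, Type}(Int64 => Float64),  # parse any detected Int64 column as Float64
    strict = false,                                  # invalid values become missing...
    silencewarnings = true,                          # ...without printing a warning per value
    maxwarnings = 10)                                # cap other warnings at 10 per parsing task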

Iteration options:

  • reusebuffer=false: [only supported by CSV.Rows] while iterating, whether a single row buffer should be allocated and reused on each iteration; only use this if each row is accessed once during iteration and never kept around (e.g. it's not safe to use this option when doing collect(CSV.Rows(file)), because only the currently iterated row is "valid")
source
CSV.Chunks (Type)
CSV.Chunks(source; ntasks::Integer=Threads.nthreads(), kwargs...) => CSV.Chunks

Returns a file "chunk" iterator. Accepts all the same inputs and keyword arguments as CSV.File, see those docs for explanations of each keyword argument.

The ntasks keyword argument specifies how many chunks a file should be split up into, defaulting to the # of threads available to Julia (i.e. JULIA_NUM_THREADS environment variable) or 8 if Julia is run single-threaded.

Each iteration of CSV.Chunks produces the next chunk of a file as a CSV.File. While initial file metadata detection is done only once (to determine # of columns, column names, etc), each iteration does independent type inference on columns. This is significant as different chunks may end up with different column types than previous chunks as new values are encountered in the file. Note that, as with CSV.File, types may be passed manually via the type or types keyword arguments.

This functionality is new and thus considered experimental; please open an issue if you run into any problems/bugs.
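
For instance, row counts can be accumulated chunk by chunk without materializing the whole file at once (a sketch; the file name is hypothetical):

using CSV

# each iteration yields one chunk as a CSV.File
nrows = sum(length(chunk) for chunk in CSV.Chunks("large.csv"; ntasks = 8))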

Arguments

File layout options:

  • header=1: how column names should be determined; if given as an Integer, indicates the row to parse for column names; as an AbstractVector{<:Integer}, indicates a set of rows to be concatenated together as column names; Vector{Symbol} or Vector{String} give column names explicitly (should match # of columns in dataset); if a dataset doesn't have column names, either provide them as a Vector, or set header=0 or header=false and column names will be auto-generated (Column1, Column2, etc.). Note that if a row number header and comment or ignoreemptyrows are provided, the header row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the header row will actually be the next non-commented row.
  • normalizenames::Bool=false: whether column names should be "normalized" into valid Julia identifier symbols; useful when using the tbl.col1 getproperty syntax or iterating rows and accessing column values of a row via getproperty (e.g. row.col1)
  • skipto::Integer: specifies the row where the data starts in the csv file; by default, the next row after the header row(s) is used. If header=0, then the 1st row is assumed to be the start of data; providing a skipto argument does not affect the header argument. Note that if a row number skipto and comment or ignoreemptyrows are provided, the data row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the data row will actually be the next non-commented row.
  • footerskip::Integer: number of rows at the end of a file to skip parsing. Do note that commented rows (see the comment keyword argument) do not count towards the row number provided for footerskip, they are completely ignored by the parser
  • transpose::Bool: read a csv file "transposed", i.e. each column is parsed as a row
  • comment::String: string that will cause rows that begin with it to be skipped while parsing. Note that if a row number header or skipto and comment are provided, the header/data row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the header/data row will actually be the next non-commented row.
  • ignoreemptyrows::Bool=true: whether empty rows in a file should be ignored (if false, each column will be assigned missing for that empty row)
  • select: an AbstractVector of Integer, Symbol, String, or Bool, or a "selector" function of the form (i, name) -> keep::Bool; only columns in the collection or for which the selector function returns true will be parsed and accessible in the resulting CSV.File. Invalid values in select are ignored.
  • drop: inverse of select; an AbstractVector of Integer, Symbol, String, or Bool, or a "drop" function of the form (i, name) -> drop::Bool; columns in the collection or for which the drop function returns true will be ignored in the resulting CSV.File. Invalid values in drop are ignored.
  • limit: an Integer to indicate a limited number of rows to parse in a csv file; use in combination with skipto to read a specific, contiguous chunk within a file; note for large files when multiple threads are used for parsing, the limit argument may not result in an exact # of rows parsed; use ntasks=1 to ensure an exact limit if necessary
  • buffer_in_memory: a Bool, default false, which controls whether a Cmd, IO, or gzipped source will be read/decompressed in memory vs. using a temporary file.
  • ntasks::Integer=Threads.nthreads(): [not applicable to CSV.Rows] for multithreaded parsed files, this controls the number of tasks spawned to read a file in concurrent chunks; defaults to the # of threads Julia was started with (i.e. JULIA_NUM_THREADS environment variable or julia -t N); setting ntasks=1 will avoid any calls to Threads.@spawn and just read the file serially on the main thread; a single thread will also be used for smaller files by default (< 5_000 cells)
  • rows_to_check::Integer=30: [not applicable to CSV.Rows] a multithreaded parsed file will be split up into ntasks # of equal chunks; rows_to_check controls the # of rows that are checked to ensure parsing correctly found valid rows; for certain files with very large quoted text fields, rows_to_check may need to be set higher to ensure parsing correctly finds these rows
  • source: [only applicable for vector of inputs to CSV.File] a Symbol, String, or Pair of Symbol or String to Vector. As a single Symbol or String, provides the name of a column that will be added to the parsed columns; the values of the column will be the input "name" (usually file name) of the input from which each value was parsed. As a Pair, the 2nd part of the pair should be a Vector of values matching the length of the # of inputs, where each value will be used instead of the input name for that input's values in the auto-added column.

Parsing options:

  • missingstring: a nothing, String, or Vector{String} of sentinel values that will be parsed as missing; if nothing is passed, no sentinel/missing values will be parsed; by default, missingstring="", which means only an empty field (two consecutive delimiters) is considered missing
  • delim=',': a Char or String that indicates how columns are delimited in a file; if no argument is provided, parsing will try to detect the most consistent delimiter on the first 10 rows of the file
  • ignorerepeated::Bool=false: whether repeated (consecutive/sequential) delimiters should be ignored while parsing; useful for fixed-width files with delimiter padding between cells
  • quoted::Bool=true: whether parsing should check for quotechar at the start/end of cells
  • quotechar='"', openquotechar, closequotechar: a Char (or different start and end characters) that indicates a quoted field which may contain textual delimiters or newline characters (see the sketch after this list)
  • escapechar='"': the Char used to escape quote characters in a quoted field
  • dateformat::Union{String, Dates.DateFormat, Nothing, AbstractDict}: a date format string to indicate how Date/DateTime columns are formatted for the entire file; if given as an AbstractDict, date format strings to indicate how the Date/DateTime columns corresponding to the keys are formatted. The Dict can map column index Int, or name Symbol or String to the format string for that column.
  • decimal='.': a Char indicating how decimals are separated in floats, i.e. 3.14 uses '.', or 3,14 uses a comma ','
  • groupmark=nothing: optionally specify a single-byte character denoting the number grouping mark; this allows parsing of numbers that have, e.g., thousand separators (1,000.00).
  • truestrings, falsestrings: Vector{String}s that indicate how true or false values are represented; by default "true", "True", "TRUE", "T", "1" are used to detect true and "false", "False", "FALSE", "F", "0" are used to detect false; note that columns with only 1 and 0 values will default to Int64 column type unless explicitly requested to be Bool via types keyword argument
  • stripwhitespace=false: if true, leading and trailing whitespace are stripped from string values, including column names
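
A sketch of non-default quoting: fields wrapped in angle brackets that may contain the delimiter (the data is invented):

using CSV

data = IOBuffer("name,city\n<Doe, Jane>,Oslo\n")
f = CSV.File(data;
    openquotechar = '<',    # fields may be wrapped in <...>
    closequotechar = '>')   # the comma inside <Doe, Jane> is then not treated as a delimiter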

Column Type Options:

  • types: a single Type, AbstractVector or AbstractDict of types, or a function of the form (i, name) -> Union{T, Nothing} to be used for column types; if a single Type is provided, all columns will be parsed with that single type; an AbstractDict can map column index Integer, or name Symbol or String to type for a column, i.e. Dict(1=>Float64) will set the first column as a Float64, Dict(:column1=>Float64) will set the column named column1 to Float64 and, Dict("column1"=>Float64) will set the column1 to Float64; if a Vector is provided, it must match the # of columns provided or detected in header. If a function is provided, it takes a column index and name as arguments, and should return the desired column type for the column, or nothing to signal the column's type should be detected while parsing.
  • typemap::IdDict{Type, Type}: a mapping of a type that should be replaced in every instance with another type, e.g. IdDict{Type, Type}(Float64 => String) would change every detected Float64 column to be parsed as String; only "standard" types are allowed to be mapped to another type, i.e. Int64, Float64, Date, DateTime, Time, and Bool. If a column of one of those types is "detected", it will be mapped to the specified type.
  • pool::Union{Bool, Real, AbstractVector, AbstractDict, Function, Tuple{Float64, Int}}=(0.2, 500): [not supported by CSV.Rows] controls whether columns will be built as PooledArray; if true, all columns detected as String will be pooled; alternatively, a Real gives the proportion of unique values below which String columns should be pooled (e.g. with pool=0.25, a column is pooled if its # of unique strings is under 25% of its length). If provided as a Tuple{Float64, Int} like (0.2, 500), the 1st element (0.2) is the percent cardinality threshold and the 2nd (500) is an upper limit on the # of unique values, under which the column will be pooled; this is the default (pool=(0.2, 500)). If an AbstractVector, each element should be Bool, Real, or Tuple{Float64, Int} and the # of elements should match the # of columns in the dataset; if an AbstractDict, a Bool, Real, or Tuple{Float64, Int} value can be provided for individual columns where the dict key is given as column index Integer, or column name as Symbol or String. If a function is provided, it should take a column index and name as 2 arguments, and return a Bool, Real, Tuple{Float64, Int}, or nothing for each column.
  • downcast::Bool=false: controls whether columns detected as Int64 will be "downcast" to the smallest possible integer type like Int8, Int16, Int32, etc.
  • stringtype=InlineStrings.InlineString: controls how detected string columns will ultimately be returned; default is InlineString, which stores string data in a fixed-size primitive type that helps avoid excessive heap memory usage; if a column has values longer than 32 bytes, it will default to String. If String is passed, all string columns will just be normal String values. If PosLenString is passed, string columns will be returned as PosLenStringVector, which is a special "lazy" AbstractVector that acts as a "view" into the original file data. This can lead to the most efficient parsing times, but note that the "view" nature of PosLenStringVector makes it read-only, so operations like push!, append!, or setindex! are not supported. It also keeps a reference to the entire input dataset source, so trying to modify or delete the underlying file, for example, may fail
  • strict::Bool=false: whether invalid values should throw a parsing error or be replaced with missing
  • silencewarnings::Bool=false: if strict=false, whether invalid value warnings should be silenced
  • maxwarnings::Int=100: if more than maxwarnings warnings are printed while parsing, further warnings will be silenced by default; for multithreaded parsing, each parsing task will print up to maxwarnings
  • debug::Bool=false: passing true will result in many informational prints while a dataset is parsed; can be useful when reporting issues or figuring out what is going on internally while a dataset is parsed
  • validate::Bool=true: whether or not to validate that columns specified in the types, dateformat and pool keywords are actually found in the data. If false no validation is done, meaning no error will be thrown if types/dateformat/pool specify settings for columns not actually found in the data.

Iteration options:

  • reusebuffer=false: [only supported by CSV.Rows] while iterating, whether a single row buffer should be allocated and reused on each iteration; only use this if each row is accessed once during iteration and never kept around (e.g. it's not safe to use this option when doing collect(CSV.Rows(file)), because only the currently iterated row is "valid")
source
CSV.Rows (Type)
CSV.Rows(source; kwargs...) => CSV.Rows

Read a csv input returning a CSV.Rows object.

The input argument can be one of:

  • filename given as a string or FilePaths.jl type
  • a Vector{UInt8} or SubArray{UInt8, 1, Vector{UInt8}} byte buffer
  • a CodeUnits object, which wraps a String, like codeunits(str)
  • a csv-formatted String, which can be passed wrapped in an IOBuffer, like IOBuffer(str)
  • a Cmd or other IO
  • a gzipped file (or gzipped data in any of the above), which will automatically be decompressed for parsing

To read a csv file from a url, use the HTTP.jl package, where the HTTP.Response body can be passed like:

f = CSV.Rows(HTTP.get(url).body)

For other IO or Cmd inputs, you can pass them like: f = CSV.Rows(read(obj)).

While similar to CSV.File, CSV.Rows provides a slightly different interface, with the following tradeoffs:

  • Very minimal memory footprint; while iterating, only the current row values are buffered
  • Only provides row access via iteration; to access columns, one can stream the rows into a table type
  • Performs no type inference; each column/cell is essentially treated as Union{String, Missing}, users can utilize the performant Parsers.parse(T, str) to convert values to a more specific type if needed, or pass types upon construction using the type or types keyword arguments

Opens the file and uses passed arguments to detect the number of columns, but not column types (column types default to String unless otherwise manually provided). The returned CSV.Rows object supports the Tables.jl interface and can iterate rows. Each row object supports propertynames, getproperty, and getindex to access individual row values. Note that duplicate column names will be detected and adjusted to ensure uniqueness (duplicate column name a will become a_1). For example, one could iterate over a csv file with column names a, b, and c by doing:

for row in CSV.Rows(file)
    println("a=$(row.a), b=$(row.b), c=$(row.c)")
end
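
Building on the loop above, a sketch that converts values on the fly with Parsers.parse while reusing a single row buffer (assumes column b has no missing values):

using CSV, Parsers

function sumb(file)
    total = 0.0
    for row in CSV.Rows(file; reusebuffer = true)   # one buffer reused for every row
        total += Parsers.parse(Float64, row.b)      # cells arrive as strings by default
    end
    return total
end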

Arguments

File layout options:

  • header=1: how column names should be determined; if given as an Integer, indicates the row to parse for column names; as an AbstractVector{<:Integer}, indicates a set of rows to be concatenated together as column names; Vector{Symbol} or Vector{String} give column names explicitly (should match # of columns in dataset); if a dataset doesn't have column names, either provide them as a Vector, or set header=0 or header=false and column names will be auto-generated (Column1, Column2, etc.). Note that if a row number header and comment or ignoreemptyrows are provided, the header row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the header row will actually be the next non-commented row.
  • normalizenames::Bool=false: whether column names should be "normalized" into valid Julia identifier symbols; useful when using the tbl.col1 getproperty syntax or iterating rows and accessing column values of a row via getproperty (e.g. row.col1)
  • skipto::Integer: specifies the row where the data starts in the csv file; by default, the next row after the header row(s) is used. If header=0, then the 1st row is assumed to be the start of data; providing a skipto argument does not affect the header argument. Note that if a row number skipto and comment or ignoreemptyrows are provided, the data row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the data row will actually be the next non-commented row.
  • footerskip::Integer: number of rows at the end of a file to skip parsing. Do note that commented rows (see the comment keyword argument) do not count towards the row number provided for footerskip, they are completely ignored by the parser
  • transpose::Bool: read a csv file "transposed", i.e. each column is parsed as a row
  • comment::String: string that will cause rows that begin with it to be skipped while parsing. Note that if a row number header or skipto and comment are provided, the header/data row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the header/data row will actually be the next non-commented row.
  • ignoreemptyrows::Bool=true: whether empty rows in a file should be ignored (if false, each column will be assigned missing for that empty row)
  • select: an AbstractVector of Integer, Symbol, String, or Bool, or a "selector" function of the form (i, name) -> keep::Bool; only columns in the collection or for which the selector function returns true will be parsed and accessible in the resulting CSV.File. Invalid values in select are ignored.
  • drop: inverse of select; an AbstractVector of Integer, Symbol, String, or Bool, or a "drop" function of the form (i, name) -> drop::Bool; columns in the collection or for which the drop function returns true will ignored in the resulting CSV.File. Invalid values in drop are ignored.
  • limit: an Integer to indicate a limited number of rows to parse in a csv file; use in combination with skipto to read a specific, contiguous chunk within a file; note for large files when multiple threads are used for parsing, the limit argument may not result in an exact # of rows parsed; use ntasks=1 to ensure an exact limit if necessary
  • buffer_in_memory: a Bool, default false, which controls whether a Cmd, IO, or gzipped source will be read/decompressed in memory vs. using a temporary file.
  • ntasks::Integer=Threads.nthreads(): [not applicable to CSV.Rows] for multithreaded parsed files, this controls the number of tasks spawned to read a file in concurrent chunks; defaults to the # of threads Julia was started with (i.e. JULIA_NUM_THREADS environment variable or julia -t N); setting ntasks=1 will avoid any calls to Threads.@spawn and just read the file serially on the main thread; a single thread will also be used for smaller files by default (< 5_000 cells)
  • rows_to_check::Integer=30: [not applicable to CSV.Rows] a multithreaded parsed file will be split up into ntasks # of equal chunks; rows_to_check controls the # of rows are checked to ensure parsing correctly found valid rows; for certain files with very large quoted text fields, lines_to_check may need to be higher (10, 30, etc.) to ensure parsing correctly finds these rows
  • source: [only applicable for vector of inputs to CSV.File] a Symbol, String, or Pair of Symbol or String to Vector. As a single Symbol or String, provides the column name that will be added to the parsed columns, the values of the column will be the input "name" (usually file name) of the input from whence the value was parsed. As a Pair, the 2nd part of the pair should be a Vector of values matching the length of the # of inputs, where each value will be used instead of the input name for that inputs values in the auto-added column.

Parsing options:

  • missingstring: either a nothing, String, or Vector{String} to use as sentinel values that will be parsed as missing; if nothing is passed, no sentinel/missing values will be parsed; by default, missingstring="", which means only an empty field (two consecutive delimiters) is considered missing
  • delim=',': a Char or String that indicates how columns are delimited in a file; if no argument is provided, parsing will try to detect the most consistent delimiter on the first 10 rows of the file
  • ignorerepeated::Bool=false: whether repeated (consecutive/sequential) delimiters should be ignored while parsing; useful for fixed-width files with delimiter padding between cells
  • quoted::Bool=true: whether parsing should check for quotechar at the start/end of cells
  • quotechar='"', openquotechar, closequotechar: a Char (or different start and end characters) that indicate a quoted field which may contain textual delimiters or newline characters
  • escapechar='"': the Char used to escape quote characters in a quoted field
  • dateformat::Union{String, Dates.DateFormat, Nothing, AbstractDict}: a date format string to indicate how Date/DateTime columns are formatted for the entire file; if given as an AbstractDict, date format strings to indicate how the Date/DateTime columns corresponding to the keys are formatted. The Dict can map column index Int, or name Symbol or String to the format string for that column.
  • decimal='.': a Char indicating how decimals are separated in floats, i.e. 3.14 uses '.', or 3,14 uses a comma ','
  • groupmark=nothing: optionally specify a single-byte character denoting the number grouping mark, this allows parsing of numbers that have, e.g., thousand separators (1,000.00).
  • truestrings, falsestrings: Vector{String}s that indicate how true or false values are represented; by default "true", "True", "TRUE", "T", "1" are used to detect true and "false", "False", "FALSE", "F", "0" are used to detect false; note that columns with only 1 and 0 values will default to Int64 column type unless explicitly requested to be Bool via types keyword argument
  • stripwhitespace=false: if true, leading and trailing whitespace are stripped from string values, including column names

Column Type Options:

  • types: a single Type, AbstractVector or AbstractDict of types, or a function of the form (i, name) -> Union{T, Nothing} to be used for column types; if a single Type is provided, all columns will be parsed with that single type; an AbstractDict can map column index Integer, or name Symbol or String to type for a column, i.e. Dict(1=>Float64) will set the first column as a Float64, Dict(:column1=>Float64) will set the column named column1 to Float64 and, Dict("column1"=>Float64) will set the column1 to Float64; if a Vector is provided, it must match the # of columns provided or detected in header. If a function is provided, it takes a column index and name as arguments, and should return the desired column type for the column, or nothing to signal the column's type should be detected while parsing.
  • typemap::IdDict{Type, Type}: a mapping of a type that should be replaced in every instance with another type, i.e. Dict(Float64=>String) would change every detected Float64 column to be parsed as String; only "standard" types are allowed to be mapped to another type, i.e. Int64, Float64, Date, DateTime, Time, and Bool. If a column of one of those types is "detected", it will be mapped to the specified type.
  • pool::Union{Bool, Real, AbstractVector, AbstractDict, Function, Tuple{Float64, Int}}=(0.2, 500): [not supported by CSV.Rows] controls whether columns will be built as PooledArray; if true, all columns detected as String will be pooled; alternatively, the proportion of unique values below which String columns should be pooled (meaning that if the # of unique strings in a column is under 25%, pool=0.25, it will be pooled). If provided as a Tuple{Float64, Int} like (0.2, 500), it represents the percent cardinality threshold as the 1st tuple element (0.2), and an upper limit for the # of unique values (500), under which the column will be pooled; this is the default (pool=(0.2, 500)). If an AbstractVector, each element should be Bool, Real, or Tuple{Float64, Int} and the # of elements should match the # of columns in the dataset; if an AbstractDict, a Bool, Real, or Tuple{Float64, Int} value can be provided for individual columns where the dict key is given as column index Integer, or column name as Symbol or String. If a function is provided, it should take a column index and name as 2 arguments, and return a Bool, Real, Tuple{Float64, Int}, or nothing for each column.
  • downcast::Bool=false: controls whether columns detected as Int64 will be "downcast" to the smallest possible integer type like Int8, Int16, Int32, etc.
  • stringtype=InlineStrings.InlineString: controls how detected string columns will ultimately be returned; default is InlineString, which stores string data in a fixed-size primitive type that helps avoid excessive heap memory usage; if a column has values longer than 32 bytes, it will default to String. If String is passed, all string columns will just be normal String values. If PosLenString is passed, string columns will be returned as PosLenStringVector, which is a special "lazy" AbstractVector that acts as a "view" into the original file data. This can lead to the most efficient parsing times, but note that the "view" nature of PosLenStringVector makes it read-only, so operations like push!, append!, or setindex! are not supported. It also keeps a reference to the entire input dataset source, so trying to modify or delete the underlying file, for example, may fail
  • strict::Bool=false: whether invalid values should throw a parsing error or be replaced with missing
  • silencewarnings::Bool=false: if strict=false, whether invalid value warnings should be silenced
  • maxwarnings::Int=100: if more than maxwarnings number of warnings are printed while parsing, further warnings will be silenced by default; for multithreaded parsing, each parsing task will print up to maxwarnings
  • debug::Bool=false: passing true will result in many informational prints while a dataset is parsed; can be useful when reporting issues or figuring out what is going on internally while a dataset is parsed
  • validate::Bool=true: whether or not to validate that columns specified in the types, dateformat and pool keywords are actually found in the data. If false no validation is done, meaning no error will be thrown if types/dateformat/pool specify settings for columns not actually found in the data.

Iteration options:

  • reusebuffer=false: [only supported by CSV.Rows] while iterating, whether a single row buffer should be allocated and reused on each iteration; only use if each row will be iterated once and not re-used (e.g. it's not safe to use this option if doing collect(CSV.Rows(file)) because only current iterated row is "valid")
source
CSV.ChunksType
CSV.Chunks(source; ntasks::Integer=Threads.nthreads(), kwargs...) => CSV.Chunks

Returns a file "chunk" iterator. Accepts all the same inputs and keyword arguments as CSV.File, see those docs for explanations of each keyword argument.

The ntasks keyword argument specifies how many chunks a file should be split up into, defaulting to the # of threads available to Julia (i.e. JULIA_NUM_THREADS environment variable) or 8 if Julia is run single-threaded.

Each iteration of CSV.Chunks produces the next chunk of a file as a CSV.File. While initial file metadata detection is done only once (to determine # of columns, column names, etc), each iteration does independent type inference on columns. This is significant as different chunks may end up with different column types than previous chunks as new values are encountered in the file. Note that, as with CSV.File, types may be passed manually via the type or types keyword arguments.

This functionality is new and thus considered experimental; please open an issue if you run into any problems/bugs.

Arguments

File layout options:

  • header=1: how column names should be determined; if given as an Integer, indicates the row to parse for column names; as an AbstractVector{<:Integer}, indicates a set of rows to be concatenated together as column names; Vector{Symbol} or Vector{String} give column names explicitly (should match # of columns in dataset); if a dataset doesn't have column names, either provide them as a Vector, or set header=0 or header=false and column names will be auto-generated (Column1, Column2, etc.). Note that if a row number header and comment or ignoreemptyrows are provided, the header row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the header row will actually be the next non-commented row.
  • normalizenames::Bool=false: whether column names should be "normalized" into valid Julia identifier symbols; useful when using the tbl.col1 getproperty syntax or iterating rows and accessing column values of a row via getproperty (e.g. row.col1)
  • skipto::Integer: specifies the row where the data starts in the csv file; by default, the next row after the header row(s) is used. If header=0, then the 1st row is assumed to be the start of data; providing a skipto argument does not affect the header argument. Note that if a row number skipto and comment or ignoreemptyrows are provided, the data row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the data row will actually be the next non-commented row.
  • footerskip::Integer: number of rows at the end of a file to skip parsing. Do note that commented rows (see the comment keyword argument) do not count towards the row number provided for footerskip, they are completely ignored by the parser
  • transpose::Bool: read a csv file "transposed", i.e. each column is parsed as a row
  • comment::String: string that will cause rows that begin with it to be skipped while parsing. Note that if a row number header or skipto and comment are provided, the header/data row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the header/data row will actually be the next non-commented row.
  • ignoreemptyrows::Bool=true: whether empty rows in a file should be ignored (if false, each column will be assigned missing for that empty row)
  • select: an AbstractVector of Integer, Symbol, String, or Bool, or a "selector" function of the form (i, name) -> keep::Bool; only columns in the collection or for which the selector function returns true will be parsed and accessible in the resulting CSV.File. Invalid values in select are ignored.
  • drop: inverse of select; an AbstractVector of Integer, Symbol, String, or Bool, or a "drop" function of the form (i, name) -> drop::Bool; columns in the collection or for which the drop function returns true will ignored in the resulting CSV.File. Invalid values in drop are ignored.
  • limit: an Integer to indicate a limited number of rows to parse in a csv file; use in combination with skipto to read a specific, contiguous chunk within a file; note for large files when multiple threads are used for parsing, the limit argument may not result in an exact # of rows parsed; use ntasks=1 to ensure an exact limit if necessary
  • buffer_in_memory: a Bool, default false, which controls whether a Cmd, IO, or gzipped source will be read/decompressed in memory vs. using a temporary file.
  • ntasks::Integer=Threads.nthreads(): [not applicable to CSV.Rows] for multithreaded parsed files, this controls the number of tasks spawned to read a file in concurrent chunks; defaults to the # of threads Julia was started with (i.e. JULIA_NUM_THREADS environment variable or julia -t N); setting ntasks=1 will avoid any calls to Threads.@spawn and just read the file serially on the main thread; a single thread will also be used for smaller files by default (< 5_000 cells)
  • rows_to_check::Integer=30: [not applicable to CSV.Rows] a multithreaded parsed file will be split up into ntasks # of equal chunks; rows_to_check controls the # of rows are checked to ensure parsing correctly found valid rows; for certain files with very large quoted text fields, lines_to_check may need to be higher (10, 30, etc.) to ensure parsing correctly finds these rows
  • source: [only applicable for vector of inputs to CSV.File] a Symbol, String, or Pair of Symbol or String to Vector. As a single Symbol or String, provides the column name that will be added to the parsed columns, the values of the column will be the input "name" (usually file name) of the input from whence the value was parsed. As a Pair, the 2nd part of the pair should be a Vector of values matching the length of the # of inputs, where each value will be used instead of the input name for that inputs values in the auto-added column.

Parsing options:

  • missingstring: either a nothing, String, or Vector{String} to use as sentinel values that will be parsed as missing; if nothing is passed, no sentinel/missing values will be parsed; by default, missingstring="", which means only an empty field (two consecutive delimiters) is considered missing
  • delim=',': a Char or String that indicates how columns are delimited in a file; if no argument is provided, parsing will try to detect the most consistent delimiter on the first 10 rows of the file
  • ignorerepeated::Bool=false: whether repeated (consecutive/sequential) delimiters should be ignored while parsing; useful for fixed-width files with delimiter padding between cells
  • quoted::Bool=true: whether parsing should check for quotechar at the start/end of cells
  • quotechar='"', openquotechar, closequotechar: a Char (or different start and end characters) that indicate a quoted field which may contain textual delimiters or newline characters
  • escapechar='"': the Char used to escape quote characters in a quoted field
  • dateformat::Union{String, Dates.DateFormat, Nothing, AbstractDict}: a date format string to indicate how Date/DateTime columns are formatted for the entire file; if given as an AbstractDict, date format strings to indicate how the Date/DateTime columns corresponding to the keys are formatted. The Dict can map column index Int, or name Symbol or String to the format string for that column.
  • decimal='.': a Char indicating how decimals are separated in floats, i.e. 3.14 uses '.', or 3,14 uses a comma ','
  • groupmark=nothing: optionally specify a single-byte character denoting the number grouping mark, this allows parsing of numbers that have, e.g., thousand separators (1,000.00).
  • truestrings, falsestrings: Vector{String}s that indicate how true or false values are represented; by default "true", "True", "TRUE", "T", "1" are used to detect true and "false", "False", "FALSE", "F", "0" are used to detect false; note that columns with only 1 and 0 values will default to Int64 column type unless explicitly requested to be Bool via types keyword argument
  • stripwhitespace=false: if true, leading and trailing whitespace are stripped from string values, including column names

Column Type Options:

  • types: a single Type, AbstractVector or AbstractDict of types, or a function of the form (i, name) -> Union{T, Nothing} to be used for column types; if a single Type is provided, all columns will be parsed with that single type; an AbstractDict can map column index Integer, or name Symbol or String to type for a column, i.e. Dict(1=>Float64) will set the first column as a Float64, Dict(:column1=>Float64) will set the column named column1 to Float64 and, Dict("column1"=>Float64) will set the column1 to Float64; if a Vector is provided, it must match the # of columns provided or detected in header. If a function is provided, it takes a column index and name as arguments, and should return the desired column type for the column, or nothing to signal the column's type should be detected while parsing.
  • typemap::IdDict{Type, Type}: a mapping of a type that should be replaced in every instance with another type, i.e. Dict(Float64=>String) would change every detected Float64 column to be parsed as String; only "standard" types are allowed to be mapped to another type, i.e. Int64, Float64, Date, DateTime, Time, and Bool. If a column of one of those types is "detected", it will be mapped to the specified type.
  • pool::Union{Bool, Real, AbstractVector, AbstractDict, Function, Tuple{Float64, Int}}=(0.2, 500): [not supported by CSV.Rows] controls whether columns will be built as PooledArray; if true, all columns detected as String will be pooled; alternatively, the proportion of unique values below which String columns should be pooled (meaning that if the # of unique strings in a column is under 25%, pool=0.25, it will be pooled). If provided as a Tuple{Float64, Int} like (0.2, 500), it represents the percent cardinality threshold as the 1st tuple element (0.2), and an upper limit for the # of unique values (500), under which the column will be pooled; this is the default (pool=(0.2, 500)). If an AbstractVector, each element should be Bool, Real, or Tuple{Float64, Int} and the # of elements should match the # of columns in the dataset; if an AbstractDict, a Bool, Real, or Tuple{Float64, Int} value can be provided for individual columns where the dict key is given as column index Integer, or column name as Symbol or String. If a function is provided, it should take a column index and name as 2 arguments, and return a Bool, Real, Tuple{Float64, Int}, or nothing for each column.
  • downcast::Bool=false: controls whether columns detected as Int64 will be "downcast" to the smallest possible integer type like Int8, Int16, Int32, etc.
  • stringtype=InlineStrings.InlineString: controls how detected string columns will ultimately be returned; default is InlineString, which stores string data in a fixed-size primitive type that helps avoid excessive heap memory usage; if a column has values longer than 32 bytes, it will default to String. If String is passed, all string columns will just be normal String values. If PosLenString is passed, string columns will be returned as PosLenStringVector, which is a special "lazy" AbstractVector that acts as a "view" into the original file data. This can lead to the most efficient parsing times, but note that the "view" nature of PosLenStringVector makes it read-only, so operations like push!, append!, or setindex! are not supported. It also keeps a reference to the entire input dataset source, so trying to modify or delete the underlying file, for example, may fail
  • strict::Bool=false: whether invalid values should throw a parsing error or be replaced with missing
  • silencewarnings::Bool=false: if strict=false, whether invalid value warnings should be silenced
  • maxwarnings::Int=100: if more than maxwarnings number of warnings are printed while parsing, further warnings will be silenced by default; for multithreaded parsing, each parsing task will print up to maxwarnings
  • debug::Bool=false: passing true will result in many informational prints while a dataset is parsed; can be useful when reporting issues or figuring out what is going on internally while a dataset is parsed
  • validate::Bool=true: whether or not to validate that columns specified in the types, dateformat and pool keywords are actually found in the data. If false no validation is done, meaning no error will be thrown if types/dateformat/pool specify settings for columns not actually found in the data.

Iteration options:

  • reusebuffer=false: [only supported by CSV.Rows] while iterating, whether a single row buffer should be allocated and reused on each iteration; only use if each row will be iterated once and not re-used (e.g. it's not safe to use this option if doing collect(CSV.Rows(file)) because only current iterated row is "valid")
source
CSV.RowsType
CSV.Rows(source; kwargs...) => CSV.Rows

Read a csv input returning a CSV.Rows object.

The input argument can be one of:

  • filename given as a string or FilePaths.jl type
  • a Vector{UInt8} or SubArray{UInt8, 1, Vector{UInt8}} byte buffer
  • a CodeUnits object, which wraps a String, like codeunits(str)
  • a csv-formatted string can also be passed like IOBuffer(str)
  • a Cmd or other IO
  • a gzipped file (or gzipped data in any of the above), which will automatically be decompressed for parsing

To read a csv file from a url, use the HTTP.jl package, where the HTTP.Response body can be passed like:

f = CSV.Rows(HTTP.get(url).body)

For other IO or Cmd inputs, you can pass them like: f = CSV.Rows(read(obj)).

While similar to CSV.File, CSV.Rows provides a slightly different interface, the tradeoffs including:

  • Very minimal memory footprint; while iterating, only the current row values are buffered
  • Only provides row access via iteration; to access columns, one can stream the rows into a table type
  • Performs no type inference; each column/cell is essentially treated as Union{String, Missing}, users can utilize the performant Parsers.parse(T, str) to convert values to a more specific type if needed, or pass types upon construction using the type or types keyword arguments

Opens the file and uses passed arguments to detect the number of columns, but not column types (column types default to String unless otherwise manually provided). The returned CSV.Rows object supports the Tables.jl interface and can iterate rows. Each row object supports propertynames, getproperty, and getindex to access individual row values. Note that duplicate column names will be detected and adjusted to ensure uniqueness (duplicate column name a will become a_1). For example, one could iterate over a csv file with column names a, b, and c by doing:

for row in CSV.Rows(file)
    println("a=$(row.a), b=$(row.b), c=$(row.c)")
end
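
Since no type inference is performed, each accessed value comes back as a string (or missing); a minimal sketch of converting values on the fly with Parsers.parse, assuming hypothetical inline data whose columns hold integers:

using CSV, Parsers

str = "a,b,c\n1,2,3\n4,5,6\n"

# each row.a is returned as a string; Parsers.parse converts it to the requested type
total = sum(Parsers.parse(Int, row.a) for row in CSV.Rows(IOBuffer(str)))  # 1 + 4 = 5

# alternatively, request concrete element types up front via the types keyword
rows = CSV.Rows(IOBuffer(str); types = Int)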

Arguments

File layout options:

  • header=1: how column names should be determined; if given as an Integer, indicates the row to parse for column names; as an AbstractVector{<:Integer}, indicates a set of rows to be concatenated together as column names; Vector{Symbol} or Vector{String} give column names explicitly (should match # of columns in dataset); if a dataset doesn't have column names, either provide them as a Vector, or set header=0 or header=false and column names will be auto-generated (Column1, Column2, etc.). Note that if a row number header and comment or ignoreemptyrows are provided, the header row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the header row will actually be the next non-commented row.
  • normalizenames::Bool=false: whether column names should be "normalized" into valid Julia identifier symbols; useful when using the tbl.col1 getproperty syntax or iterating rows and accessing column values of a row via getproperty (e.g. row.col1)
  • skipto::Integer: specifies the row where the data starts in the csv file; by default, the next row after the header row(s) is used. If header=0, then the 1st row is assumed to be the start of data; providing a skipto argument does not affect the header argument. Note that if a row number skipto and comment or ignoreemptyrows are provided, the data row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the data row will actually be the next non-commented row.
  • footerskip::Integer: number of rows at the end of a file to skip parsing. Do note that commented rows (see the comment keyword argument) do not count towards the row number provided for footerskip, they are completely ignored by the parser
  • transpose::Bool: read a csv file "transposed", i.e. each column is parsed as a row
  • comment::String: string that will cause rows that begin with it to be skipped while parsing. Note that if a row number header or skipto and comment are provided, the header/data row will be the first non-commented/non-empty row after the row number, meaning if the provided row number is a commented row, the header/data row will actually be the next non-commented row.
  • ignoreemptyrows::Bool=true: whether empty rows in a file should be ignored (if false, each column will be assigned missing for that empty row)
  • select: an AbstractVector of Integer, Symbol, String, or Bool, or a "selector" function of the form (i, name) -> keep::Bool; only columns in the collection or for which the selector function returns true will be parsed and accessible in the resulting CSV.File. Invalid values in select are ignored (see the sketch after this list).
  • drop: inverse of select; an AbstractVector of Integer, Symbol, String, or Bool, or a "drop" function of the form (i, name) -> drop::Bool; columns in the collection or for which the drop function returns true will be ignored in the resulting CSV.File. Invalid values in drop are ignored.
  • limit: an Integer to indicate a limited number of rows to parse in a csv file; use in combination with skipto to read a specific, contiguous chunk within a file; note for large files when multiple threads are used for parsing, the limit argument may not result in an exact # of rows parsed; use ntasks=1 to ensure an exact limit if necessary
  • buffer_in_memory: a Bool, default false, which controls whether a Cmd, IO, or gzipped source will be read/decompressed in memory vs. using a temporary file.
  • ntasks::Integer=Threads.nthreads(): [not applicable to CSV.Rows] for multithreaded parsed files, this controls the number of tasks spawned to read a file in concurrent chunks; defaults to the # of threads Julia was started with (i.e. JULIA_NUM_THREADS environment variable or julia -t N); setting ntasks=1 will avoid any calls to Threads.@spawn and just read the file serially on the main thread; a single thread will also be used for smaller files by default (< 5_000 cells)
  • rows_to_check::Integer=30: [not applicable to CSV.Rows] a multithreaded parsed file will be split up into ntasks # of equal chunks; rows_to_check controls the # of rows that are checked to ensure parsing correctly found valid rows; for certain files with very large quoted text fields, rows_to_check may need to be higher (10, 30, etc.) to ensure parsing correctly finds these rows
  • source: [only applicable for vector of inputs to CSV.File] a Symbol, String, or Pair of Symbol or String to Vector. As a single Symbol or String, provides the column name that will be added to the parsed columns; the values of the column will be the input "name" (usually file name) of the input from whence the value was parsed. As a Pair, the 2nd part of the pair should be a Vector of values matching the # of inputs, where each value will be used instead of the input name for that input's values in the auto-added column.
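
As referenced above, a small sketch of a few of these layout keywords working together (the inline data is hypothetical):

using CSV

data = """
# a leading comment line
x,y,z
1,2,3
4,5,6
"""

rows = CSV.Rows(IOBuffer(data);
    comment = "#",        # rows beginning with "#" are skipped
    header = 1,           # column names come from the first non-commented row
    select = [:x, :z])    # only these columns are parsed and accessible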

Parsing options:

  • missingstring: either a nothing, String, or Vector{String} to use as sentinel values that will be parsed as missing; if nothing is passed, no sentinel/missing values will be parsed; by default, missingstring="", which means only an empty field (two consecutive delimiters) is considered missing (see the sketch after this list)
  • delim=',': a Char or String that indicates how columns are delimited in a file; if no argument is provided, parsing will try to detect the most consistent delimiter on the first 10 rows of the file
  • ignorerepeated::Bool=false: whether repeated (consecutive/sequential) delimiters should be ignored while parsing; useful for fixed-width files with delimiter padding between cells
  • quoted::Bool=true: whether parsing should check for quotechar at the start/end of cells
  • quotechar='"', openquotechar, closequotechar: a Char (or different start and end characters) that indicate a quoted field which may contain textual delimiters or newline characters
  • escapechar='"': the Char used to escape quote characters in a quoted field
  • dateformat::Union{String, Dates.DateFormat, Nothing, AbstractDict}: a date format string to indicate how Date/DateTime columns are formatted for the entire file; if given as an AbstractDict, date format strings to indicate how the Date/DateTime columns corresponding to the keys are formatted. The Dict can map column index Int, or name Symbol or String to the format string for that column.
  • decimal='.': a Char indicating how decimals are separated in floats, i.e. 3.14 uses '.', or 3,14 uses a comma ','
  • groupmark=nothing: optionally specify a single-byte character denoting the number grouping mark; this allows parsing of numbers that have, e.g., thousand separators (1,000.00).
  • truestrings, falsestrings: Vector{String}s that indicate how true or false values are represented; by default "true", "True", "TRUE", "T", "1" are used to detect true and "false", "False", "FALSE", "F", "0" are used to detect false; note that columns with only 1 and 0 values will default to Int64 column type unless explicitly requested to be Bool via types keyword argument
  • stripwhitespace=false: if true, leading and trailing whitespace are stripped from string values, including column names
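
As referenced above, a short sketch showing several parsing keywords at once (the inline data is hypothetical; CSV.File accepts the same parsing keywords):

using CSV

data = "name;when;score\nalice;2021-01-01;3,5\n;2021-01-02;NA\n"

f = CSV.File(IOBuffer(data);
    delim = ';',                  # semicolon-delimited file
    decimal = ',',                # "3,5" parses as 3.5
    dateformat = "yyyy-mm-dd",    # format used for the when column
    missingstring = ["", "NA"])   # empty fields and "NA" both become missing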

Column Type Options:

  • types: a single Type, AbstractVector or AbstractDict of types, or a function of the form (i, name) -> Union{T, Nothing} to be used for column types; if a single Type is provided, all columns will be parsed with that single type; an AbstractDict can map column index Integer, or name Symbol or String to type for a column, i.e. Dict(1=>Float64) will set the first column as a Float64, Dict(:column1=>Float64) will set the column named column1 to Float64, and Dict("column1"=>Float64) will likewise set column1 to Float64; if a Vector is provided, it must match the # of columns provided or detected in header. If a function is provided, it takes a column index and name as arguments, and should return the desired column type for the column, or nothing to signal the column's type should be detected while parsing (see the sketch after this list).
  • typemap::IdDict{Type, Type}: a mapping of a type that should be replaced in every instance with another type, i.e. IdDict(Float64=>String) would change every detected Float64 column to be parsed as String; only "standard" types are allowed to be mapped to another type, i.e. Int64, Float64, Date, DateTime, Time, and Bool. If a column of one of those types is "detected", it will be mapped to the specified type.
  • pool::Union{Bool, Real, AbstractVector, AbstractDict, Function, Tuple{Float64, Int}}=(0.2, 500): [not supported by CSV.Rows] controls whether columns will be built as PooledArray; if true, all columns detected as String will be pooled; alternatively, the proportion of unique values below which String columns should be pooled (meaning that with pool=0.25, a column will be pooled if under 25% of its values are unique). If provided as a Tuple{Float64, Int} like (0.2, 500), it represents the percent cardinality threshold as the 1st tuple element (0.2), and an upper limit for the # of unique values (500), under which the column will be pooled; this is the default (pool=(0.2, 500)). If an AbstractVector, each element should be Bool, Real, or Tuple{Float64, Int} and the # of elements should match the # of columns in the dataset; if an AbstractDict, a Bool, Real, or Tuple{Float64, Int} value can be provided for individual columns where the dict key is given as column index Integer, or column name as Symbol or String. If a function is provided, it should take a column index and name as 2 arguments, and return a Bool, Real, Tuple{Float64, Int}, or nothing for each column.
  • downcast::Bool=false: controls whether columns detected as Int64 will be "downcast" to the smallest possible integer type like Int8, Int16, Int32, etc.
  • stringtype=InlineStrings.InlineString: controls how detected string columns will ultimately be returned; default is InlineString, which stores string data in a fixed-size primitive type that helps avoid excessive heap memory usage; if a column has values longer than 32 bytes, it will default to String. If String is passed, all string columns will just be normal String values. If PosLenString is passed, string columns will be returned as PosLenStringVector, which is a special "lazy" AbstractVector that acts as a "view" into the original file data. This can lead to the most efficient parsing times, but note that the "view" nature of PosLenStringVector makes it read-only, so operations like push!, append!, or setindex! are not supported. It also keeps a reference to the entire input dataset source, so trying to modify or delete the underlying file, for example, may fail
  • strict::Bool=false: whether invalid values should throw a parsing error or be replaced with missing
  • silencewarnings::Bool=false: if strict=false, whether invalid value warnings should be silenced
  • maxwarnings::Int=100: if more than maxwarnings number of warnings are printed while parsing, further warnings will be silenced by default; for multithreaded parsing, each parsing task will print up to maxwarnings
  • debug::Bool=false: passing true will result in many informational prints while a dataset is parsed; can be useful when reporting issues or figuring out what is going on internally while a dataset is parsed
  • validate::Bool=true: whether or not to validate that columns specified in the types, dateformat and pool keywords are actually found in the data. If false no validation is done, meaning no error will be thrown if types/dateformat/pool specify settings for columns not actually found in the data.
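
As referenced above, a sketch of the column-type keywords (hypothetical inline data; shown with CSV.File since pool is not supported by CSV.Rows):

using CSV

data = "id,price,city\n1,9.99,Boston\n2,4.50,Boston\n3,12.00,Chicago\n"

f = CSV.File(IOBuffer(data);
    types = Dict(:id => Int32),                       # force the id column to Int32
    typemap = IdDict{Type, Type}(Float64 => String),  # any column detected as Float64 is parsed as String instead
    pool = Dict(:city => true))                       # pool only the city column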

Iteration options:

  • reusebuffer=false: [only supported by CSV.Rows] while iterating, whether a single row buffer should be allocated and reused on each iteration; only use if each row will be iterated once and not re-used (e.g. it's not safe to use this option if doing collect(CSV.Rows(file)) because only current iterated row is "valid")
source

Utilities

CSV.detect (Function)
CSV.detect(str::String)

Use the same logic used by CSV.File to detect column types, to parse a value from a plain string. This can be useful in conjunction with the CSV.Rows type, which returns each cell of a file as a String. The order of types attempted is: Int, Float64, Date, DateTime, Bool, and if all fail, the input String is returned. No errors are thrown. For advanced usage, you can pass your own Parsers.Options type as a keyword argument option=ops for sentinel value detection.
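
For example, based on the detection order just described:

using CSV

CSV.detect("101")         # 101 (Int64)
CSV.detect("3.14")        # 3.14 (Float64)
CSV.detect("2024-03-02")  # Date("2024-03-02")
CSV.detect("true")        # true (Bool)
CSV.detect("xyz")         # "xyz" (no other type matched, so the String is returned)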

source

Common terms

Standard types

The types that are detected by default when column types are not otherwise provided by the user. They include: Int64, Float64, Date, DateTime, Time, Bool, and String.

Newlines

For all parsing functionality, newlines are detected/parsed automatically, regardless of whether they're present in the data as a single newline character ('\n'), a single return character ('\r'), or a full CRLF sequence ("\r\n").

Cardinality

Refers to the ratio of unique values to total number of values in a column. Columns with "low cardinality" have a low % of unique values, or put another way, there are only a few unique values for the entire column of data where unique values are repeated many times. Columns with "high cardinality" have a high % of unique values relative to total number of values. Think of these as "id-like" columns where each or almost each value is a unique identifier with no (or few) repeated values.
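
As a concrete illustration of the ratio, using a hypothetical column of city names:

city = ["Boston", "Boston", "Chicago", "Boston", "Chicago"]
length(unique(city)) / length(city)   # 2 unique values / 5 total = 0.4 cardinality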

diff --git a/dev/writing.html b/dev/writing.html
index 235111de..612161de 100644
--- a/dev/writing.html
+++ b/dev/writing.html
@@ -9,4 +9,4 @@
# write a matrix to an in-memory IOBuffer
io = IOBuffer()
mat = rand(10, 10)
CSV.write(io, Tables.table(mat))

source
CSV.RowWriter (Type)
CSV.RowWriter(table; kwargs...)

Creates an iterator that produces csv-formatted strings for each row in the input table.

Supported keyword arguments include:

  • bufsize::Int=2^22: The length of the buffer to use when writing each csv-formatted row; default 4MB; if a row is larger than the bufsize an error is thrown
  • delim::Union{Char, String}=',': a character or string to print out as the file's delimiter
  • quotechar::Char='"': ASCII character to use for quoting text fields that may contain delimiters or newlines
  • openquotechar::Char: instead of quotechar, use openquotechar and closequotechar to support different starting and ending quote characters
  • escapechar::Char='"': ASCII character used to escape quote characters in a text field
  • missingstring::String="": string to print for missing values
  • dateformat=Dates.default_format(T): the date format string to use for printing out Date & DateTime columns
  • header: pass a list of column names (Symbols or Strings) to use instead of the column names of the input table
  • newline='\n': character or string to use to separate rows (lines in the csv file)
  • quotestrings=false: whether to force all strings to be quoted or not
  • decimal='.': character to use as the decimal point when writing floating point numbers
  • transform=(col,val)->val: a function that is applied to every cell, e.g. we can transform all nothing values to missing using (col, val) -> something(val, missing)
  • bom=false: whether to write a UTF-8 BOM header (0xEF 0xBB 0xBF) or not
source
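
A brief usage sketch (the table is a hypothetical NamedTuple of vectors, which satisfies the Tables.jl interface; the expected output assumes the header row is produced first, followed by one csv-formatted string per data row):

using CSV

tbl = (a = [1, 2], b = ["x", "y"])

for row in CSV.RowWriter(tbl; delim = ';')
    print(row)    # each produced string already ends with the newline character
end

# a;b
# 1;x
# 2;y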