From 8d700e816991fa79cc1270381f402a2080c326f4 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 27 Apr 2024 18:18:34 -0400 Subject: [PATCH] csvclean: Use standard output and standard error, and use exit code 1 if errors #781 #195 --- CHANGELOG.rst | 7 ++ csvkit/cleanup.py | 11 +-- csvkit/cli.py | 9 +- csvkit/utilities/csvclean.py | 72 +++----------- docs/scripts/csvclean.rst | 16 ++-- tests/test_utilities/test_csvclean.py | 133 ++++++++++++-------------- 6 files changed, 103 insertions(+), 145 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index eebf5bbd6..0cf6ace88 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,10 @@ +2.0.0 - Unreleased +------------------ + +**BACKWARDS-INCOMPATIBLE CHANGES** + +* :doc:`/scripts/csvclean` now writes its output to standard output and its errors to standard error, instead of to ``basename_out.csv`` and ``basename_err.csv`` files. Consequently, it no longer supports a :code:`--dry-run` flag to output summary information like ``No errors.``, ``42 errors logged to basename_err.csv`` or ``42 rows were joined/reduced to 24 rows after eliminating expected internal line breaks.``. + 1.5.0 - March 28, 2024 ---------------------- diff --git a/csvkit/cleanup.py b/csvkit/cleanup.py index aa8359c3d..818d268bd 100644 --- a/csvkit/cleanup.py +++ b/csvkit/cleanup.py @@ -5,8 +5,10 @@ def join_rows(rows, joiner=' '): """ - Given a series of rows, return them as a single row where the inner edge cells are merged. By default joins with a - single space character, but you can specify new-line, empty string, or anything else with the 'joiner' kwarg. + Given a series of rows, return them as a single row where the inner edge cells are merged. + + :param joiner: + The separator between cells, a single space by default. """ rows = list(rows) fixed_row = rows[0][:] @@ -33,8 +35,6 @@ def __init__(self, reader): except StopIteration: self.column_names = [] self.errors = [] - self.rows_joined = 0 - self.joins = 0 def checked_rows(self): """ @@ -69,9 +69,6 @@ def checked_rows(self): break if len(fixed_row) == length: - self.rows_joined += len(joinable_row_errors) - self.joins += 1 - yield fixed_row for fixed in joinable_row_errors: diff --git a/csvkit/cli.py b/csvkit/cli.py index f8c3ba432..6dabc6bfe 100644 --- a/csvkit/cli.py +++ b/csvkit/cli.py @@ -68,19 +68,26 @@ class CSVKitUtility: epilog = '' override_flags = '' - def __init__(self, args=None, output_file=None): + def __init__(self, args=None, output_file=None, error_file=None): """ Perform argument processing and other setup for a CSVKitUtility. """ self._init_common_parser() self.add_arguments() self.args = self.argparser.parse_args(args) + # Output file is only set during testing. if output_file is None: self.output_file = sys.stdout else: self.output_file = output_file + # Error file is only set during testing. + if error_file is None: + self.error_file = sys.stderr + else: + self.error_file = error_file + self.reader_kwargs = self._extract_csv_reader_kwargs() self.writer_kwargs = self._extract_csv_writer_kwargs() diff --git a/csvkit/utilities/csvclean.py b/csvkit/utilities/csvclean.py index 2dc082550..2b92bfd77 100644 --- a/csvkit/utilities/csvclean.py +++ b/csvkit/utilities/csvclean.py @@ -1,7 +1,6 @@ #!/usr/bin/env python import sys -from os.path import splitext import agate @@ -14,9 +13,7 @@ class CSVClean(CSVKitUtility): override_flags = ['L', 'blanks', 'date-format', 'datetime-format'] def add_arguments(self): - self.argparser.add_argument( - '-n', '--dry-run', dest='dryrun', action='store_true', - help='Do not create output files. Information about what would have been done will be printed to STDERR.') + pass def main(self): if self.additional_input_expected(): @@ -24,65 +21,20 @@ def main(self): reader = agate.csv.reader(self.skip_lines(), **self.reader_kwargs) - if self.args.dryrun: - checker = RowChecker(reader) + checker = RowChecker(reader) - for _row in checker.checked_rows(): - pass + output_writer = agate.csv.writer(self.output_file, **self.writer_kwargs) + output_writer.writerow(checker.column_names) + for row in checker.checked_rows(): + output_writer.writerow(row) - if checker.errors: - for e in checker.errors: - self.output_file.write('Line %i: %s\n' % (e.line_number, e.msg)) - else: - self.output_file.write('No errors.\n') + if checker.errors: + error_writer = agate.csv.writer(self.error_file, **self.writer_kwargs) + error_writer.writerow(['line_number', 'msg'] + checker.column_names) + for error in checker.errors: + error_writer.writerow([error.line_number, error.msg] + error.row) - if checker.joins: - self.output_file.write('%i rows would have been joined/reduced to %i rows after eliminating expected ' - 'internal line breaks.\n' % (checker.rows_joined, checker.joins)) - else: - if self.input_file == sys.stdin: - base = 'stdin' # "_out.csv" is invalid on Windows - else: - base = splitext(self.input_file.name)[0] - - with open(f'{base}_out.csv', 'w') as f: - clean_writer = agate.csv.writer(f, **self.writer_kwargs) - - checker = RowChecker(reader) - clean_writer.writerow(checker.column_names) - - for row in checker.checked_rows(): - clean_writer.writerow(row) - - if checker.errors: - error_filename = f'{base}_err.csv' - - with open(error_filename, 'w') as f: - error_writer = agate.csv.writer(f, **self.writer_kwargs) - - error_header = ['line_number', 'msg'] - error_header.extend(checker.column_names) - error_writer.writerow(error_header) - - error_count = len(checker.errors) - - for e in checker.errors: - error_writer.writerow(self._format_error_row(e)) - - self.output_file.write('%i error%s logged to %s\n' % ( - error_count, '' if error_count == 1 else 's', error_filename)) - else: - self.output_file.write('No errors.\n') - - if checker.joins: - self.output_file.write('%i rows were joined/reduced to %i rows after eliminating expected internal ' - 'line breaks.\n' % (checker.rows_joined, checker.joins)) - - def _format_error_row(self, error): - row = [error.line_number, error.msg] - row.extend(error.row) - - return row + sys.exit(1) def launch_new_instance(): diff --git a/docs/scripts/csvclean.rst b/docs/scripts/csvclean.rst index 0e3e16bcd..f94d6a210 100644 --- a/docs/scripts/csvclean.rst +++ b/docs/scripts/csvclean.rst @@ -18,13 +18,13 @@ Note that every csvkit tool does the following: * changes the quote character to a double-quotation mark, if the character is set with the `--quotechar` (`-q`) option * changes the character encoding to UTF-8, if the input encoding is set with the `--encoding` (`-e`) option -Outputs [basename]_out.csv and [basename]_err.csv, the former containing all valid rows and the latter containing all error rows along with line numbers and descriptions: +All valid rows are written to standard output, and all error rows along with line numbers and descriptions are written to standard error. If there are error rows, the exit code will be 1:: .. code-block:: none usage: csvclean [-h] [-d DELIMITER] [-t] [-q QUOTECHAR] [-u {0,1,2,3}] [-b] [-p ESCAPECHAR] [-z FIELD_SIZE_LIMIT] [-e ENCODING] [-S] [-H] - [-K SKIP_LINES] [-v] [-l] [--zero] [-V] [-n] + [-K SKIP_LINES] [-v] [-l] [--zero] [-V] [FILE] Fix common errors in a CSV file. @@ -35,8 +35,6 @@ Outputs [basename]_out.csv and [basename]_err.csv, the former containing all val optional arguments: -h, --help show this help message and exit - -n, --dry-run Do not create output files. Information about what - would have been done will be printed to STDERR. See also: :doc:`../common_arguments`. @@ -47,9 +45,13 @@ Test a file with known bad rows: .. code-block:: console - $ csvclean -n examples/bad.csv - Line 1: Expected 3 columns, found 4 columns - Line 2: Expected 3 columns, found 2 columns + $ csvclean examples/bad.csv 2> errors.csv + column_a,column_b,column_c + 0,mixed types.... uh oh,17 + $ cat errors.csv + line_number,msg,column_a,column_b,column_c + 1,"Expected 3 columns, found 4 columns",1,27,,I'm too long! + 2,"Expected 3 columns, found 2 columns",,I'm too short! To change the line ending from line feed (LF or ``\n``) to carriage return and line feed (CRLF or ``\r\n``) use: diff --git a/tests/test_utilities/test_csvclean.py b/tests/test_utilities/test_csvclean.py index 1d284c942..754f75ab5 100644 --- a/tests/test_utilities/test_csvclean.py +++ b/tests/test_utilities/test_csvclean.py @@ -3,6 +3,8 @@ import sys from unittest.mock import patch +import agate + from csvkit.utilities.csvclean import CSVClean, launch_new_instance from tests.utils import CSVKitTestCase, EmptyFileTests @@ -15,98 +17,89 @@ def tearDown(self): if os.path.isfile(output_file): os.remove(output_file) - def assertCleaned(self, basename, output_lines, error_lines, additional_args=[]): - args = [f'examples/{basename}.csv'] + additional_args + def assertCleaned(self, args, output_rows, error_rows=[]): output_file = io.StringIO() + error_file = io.StringIO() - utility = CSVClean(args, output_file) - utility.run() + utility = CSVClean(args, output_file, error_file) - output_file.close() + if error_rows: + with self.assertRaises(SystemExit) as e: + utility.run() + + self.assertEqual(e.exception.code, 1) + else: + utility.run() + + output_file.seek(0) + error_file.seek(0) - output_file = f'examples/{basename}_out.csv' - error_file = f'examples/{basename}_err.csv' - - self.assertEqual(os.path.exists(output_file), bool(output_lines)) - self.assertEqual(os.path.exists(error_file), bool(error_lines)) - - try: - if output_lines: - with open(output_file) as f: - for line in output_lines: - self.assertEqual(next(f), line) - self.assertRaises(StopIteration, next, f) - if error_lines: - with open(error_file) as f: - for line in error_lines: - self.assertEqual(next(f), line) - self.assertRaises(StopIteration, next, f) - finally: - if output_lines: - os.remove(output_file) - if error_lines: - os.remove(error_file) + if output_rows: + reader = agate.csv.reader(output_file) + for row in output_rows: + self.assertEqual(next(reader), row) + self.assertRaises(StopIteration, next, reader) + if error_rows: + reader = agate.csv.reader(error_file) + for row in error_rows: + self.assertEqual(next(reader), row) + self.assertRaises(StopIteration, next, reader) + + output_file.close() + error_file.close() def test_launch_new_instance(self): - with patch.object(sys, 'argv', [self.Utility.__name__.lower(), 'examples/bad.csv']): + with patch.object(sys, 'argv', [self.Utility.__name__.lower(), 'examples/dummy.csv']): launch_new_instance() def test_skip_lines(self): - self.assertCleaned('bad_skip_lines', [ - 'column_a,column_b,column_c\n', - '0,mixed types.... uh oh,17\n', + self.assertCleaned(['--skip-lines', '3', 'examples/bad_skip_lines.csv'], [ + ['column_a', 'column_b', 'column_c'], + ['0', 'mixed types.... uh oh', '17'], ], [ - 'line_number,msg,column_a,column_b,column_c\n', - '1,"Expected 3 columns, found 4 columns",1,27,,I\'m too long!\n', - '2,"Expected 3 columns, found 2 columns",,I\'m too short!\n', - ], ['--skip-lines', '3']) + ['line_number', 'msg', 'column_a', 'column_b', 'column_c'], + ['1', 'Expected 3 columns, found 4 columns', '1', '27', '', "I'm too long!"], + ['2', 'Expected 3 columns, found 2 columns', '', "I'm too short!"], + ]) def test_simple(self): - self.assertCleaned('bad', [ - 'column_a,column_b,column_c\n', - '0,mixed types.... uh oh,17\n', + self.assertCleaned(['examples/bad.csv'], [ + ['column_a', 'column_b', 'column_c'], + ['0', 'mixed types.... uh oh', '17'], ], [ - 'line_number,msg,column_a,column_b,column_c\n', - '1,"Expected 3 columns, found 4 columns",1,27,,I\'m too long!\n', - '2,"Expected 3 columns, found 2 columns",,I\'m too short!\n', + ['line_number', 'msg', 'column_a', 'column_b', 'column_c'], + ['1', 'Expected 3 columns, found 4 columns', '1', '27', '', "I'm too long!"], + ['2', 'Expected 3 columns, found 2 columns', '', "I'm too short!"], ]) def test_no_header_row(self): - self.assertCleaned('no_header_row', [ - '1,2,3\n', + self.assertCleaned(['examples/no_header_row.csv'], [ + ['1', '2', '3'], ], []) def test_removes_optional_quote_characters(self): - self.assertCleaned('optional_quote_characters', [ - 'a,b,c\n', - '1,2,3\n', - ], []) + self.assertCleaned(['examples/optional_quote_characters.csv'], [ + ['a', 'b', 'c'], + ['1', '2', '3'], + ]) def test_changes_line_endings(self): - self.assertCleaned('mac_newlines', [ - 'a,b,c\n', - '1,2,3\n', - '"Once upon\n', - 'a time",5,6\n', - ], []) + self.assertCleaned(['examples/mac_newlines.csv'], [ + ['a', 'b', 'c'], + ['1', '2', '3'], + ['Once upon\na time', '5', '6'], + ]) def test_changes_character_encoding(self): - self.assertCleaned('test_latin1', [ - 'a,b,c\n', - '1,2,3\n', - '4,5,©\n', - ], [], ['-e', 'latin1']) + self.assertCleaned(['-e', 'latin1', 'examples/test_latin1.csv'], [ + ['a', 'b', 'c'], + ['1', '2', '3'], + ['4', '5', u'©'], + ]) def test_removes_bom(self): - self.assertCleaned('test_utf8_bom', [ - 'foo,bar,baz\n', - '1,2,3\n', - '4,5,ʤ\n', - ], [], []) - - def test_dry_run(self): - output = self.get_output_as_io(['-n', 'examples/bad.csv']) - self.assertFalse(os.path.exists('examples/bad_err.csv')) - self.assertFalse(os.path.exists('examples/bad_out.csv')) - self.assertEqual(next(output)[:6], 'Line 1') - self.assertEqual(next(output)[:6], 'Line 2') + self.assertCleaned(['examples/test_utf8_bom.csv'], [ + ['foo', 'bar', 'baz'], + ['1', '2', '3'], + ['4', '5', 'ʤ'], + ])