Source code for spoonbill.cli

"""cli.py - Command line interface related routines"""

import logging
import pathlib

import click
import click_logging

from spoonbill import FileAnalyzer, FileFlattener
from spoonbill.common import COMBINED_TABLES, ROOT_TABLES, TABLE_THRESHOLD
from spoonbill.flatten import FlattenOptions
from spoonbill.i18n import LOCALE, _
from spoonbill.utils import read_lines

# Logger named "spoonbill"; it is handed to click_logging here and again to the
# verbosity option attached to the ``cli`` command.
LOGGER = logging.getLogger("spoonbill")
click_logging.basic_config(LOGGER)


# Translatable progress-bar labels; ``{}`` is filled with the (styled) number
# of objects handled so far while analyzing / flattening.
ANALYZED_LABEL = _("  Processed {} objects")
FLATTENED_LABEL = _("  Flattened {} objects")
# Passed as ``context_settings`` to ``click.command`` so that both ``-h`` and
# ``--help`` are registered as help option names.
CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"]}


class CommaSeparated(click.ParamType):
    """Click option type to convert comma separated string into list"""

    name = "comma"

    def convert(self, value, param, ctx):
        """Convert ``value`` into a list of strings.

        :param value: raw option value; a comma separated string, or a value
            that has already been converted (list/tuple)
        :param param: click parameter being processed (unused)
        :param ctx: click context (unused)
        :return: ``[]`` for any falsy value, otherwise the list of items
        """
        if not value:
            return []
        # Click requires ``convert`` to accept values that are already of the
        # target type (e.g. a list default or a re-processed value), so pass
        # sequences through instead of failing on a missing ``.split``.
        if isinstance(value, (list, tuple)):
            return list(value)
        return value.split(",")
def read_option_file(option, option_file):
    """Pick the effective option value, preferring the file variant.

    :param option: value given directly on the command line
    :param option_file: path to a file holding the values; falsy when not set
    :return: the result of ``read_lines(option_file)`` when ``option_file`` is
        set, otherwise ``option`` unchanged
    """
    if not option_file:
        return option
    return read_lines(option_file)
def get_selected_tables(base, selection):
    """Return the subset of ``base`` whose names appear in ``selection``.

    :param base: mapping of table name to table
    :param selection: iterable of requested table names
    :return: new dict with the selected entries, in the order of ``base``
    :raises click.BadParameter: if a requested name is not a key of ``base``
    """
    # Validate first: the first unknown name aborts the whole selection.
    for table_name in selection:
        if table_name in base:
            continue
        raise click.BadParameter(_("Wrong selection, table '{}' does not exist").format(table_name))

    selected = {}
    for table_name, table in base.items():
        if table_name in selection:
            selected[table_name] = table
    return selected
@click.command(context_settings=CONTEXT_SETTINGS, help=_("CLI tool to flatten OCDS files"))
@click.option(
    "--schema",
    help=_(
        "A JSON schema file URI. The URI can be a file path or an HTTP link. Spoonbill uses the schema to analyze the "
        "provided JSON file. Defaults to the OCDS 1.1.5 release schema (requires internet connection)"
    ),
    type=str,
)
@click.option(
    "--selection",
    type=CommaSeparated(),
    help=_(
        "A comma-separated list of initial tables to write. The available tables to select are: "
        "parties, planning, tenders, awards, contracts"
    ),
)
@click.option(
    "--threshold",
    help=_("The maximum number of elements in an array before it is split into a table"),
    type=int,
    default=TABLE_THRESHOLD,
    show_default=True,
)
@click.option(
    "--state-file",
    help=_("A file path URI to a previously generated state file. If not provided, a new state file is generated"),
    type=click.Path(exists=True),
)
@click.option(
    "--xlsx",
    help=_(
        "A file path to store the resulting xlsx file. Default to result.xlsx. "
        "Set to '' to disable the xlsx file generation"
    ),
    type=click.Path(),
    default="result.xlsx",
)
@click.option(
    "--csv",
    help=_("An existing directory path. If set also generates CSV files in the given directory. Disabled by default"),
    type=click.Path(),
    required=False,
)
@click.option(
    "--combine",
    help=_(
        "A comma-separated list of tables. Combines same OCDS object types from different locations "
        "(tender, awards, etc) into a single table. The available tables are: documents, milestones, and amendments"
    ),
    type=CommaSeparated(),
)
@click.option(
    "--exclude",
    help=_("A comma-separated list of tables to exclude from export. Disabled by default"),
    type=CommaSeparated(),
    default="",
)
@click.option(
    "--unnest",
    help=_(
        "A comma-separated list of column names to copy from child tables into their parent table. Disabled by default"
    ),
    type=CommaSeparated(),
    default="",
)
@click.option(
    "--unnest-file",
    help=_("A file path directory. Same as --unnest, but read column names from a file with one column per line"),
    type=click.Path(exists=True),
    required=False,
)
@click.option(
    "--only",
    help=_(
        "A comma-separated list of a subset of columns to output instead of all, in JSON path format, "
        "e.g. /parties/name. Defaults to all the available columns"
    ),
    type=CommaSeparated(),
    default="",
)
@click.option(
    "--only-file",
    help=_("A file path directory. Same as --only, but read the columns names from a file with one column per line"),
    type=click.Path(exists=True),
    required=False,
)
@click.option(
    "--repeat",
    help=_(
        "A comma-separated list of columns to repeat from a parent table into its child tables, in JSON path format,"
        "e.g. /parties/name. Disabled by default"
    ),
    type=CommaSeparated(),
    default="",
)
@click.option(
    "--repeat-file",
    help=_("A file path directory. Same as --repeat, but read the columns names from a file with one column per line"),
    type=click.Path(exists=True),
    required=False,
)
@click.option(
    "--count",
    help=_("For each array field, add a count column to its parent table. Disabled by default"),
    is_flag=True,
    default=False,
)
@click.option(
    "--human",
    help=_("Change the tables headings to human-readable format, using the schema's title properties"),
    is_flag=True,
)
@click.option(
    "--language",
    help=_("Use with --human, the language to use for the human-readable headings"),
    default=LOCALE,
    show_default=True,
    type=click.Choice(["en", "es"]),
)
@click_logging.simple_verbosity_option(LOGGER)
@click.argument("filename", type=click.Path(exists=True))
def cli(
    filename,
    schema,
    selection,
    threshold,
    state_file,
    xlsx,
    csv,
    combine,
    exclude,
    unnest,
    unnest_file,
    only,
    only_file,
    repeat,
    repeat_file,
    count,
    human,
    language,
):
    """Spoonbill cli entry point

    Steps, in order:

    1. validate the ``--csv`` / ``--xlsx`` output locations and the table
       names given through ``--selection`` / ``--combine``;
    2. reject ``--unnest``/``--repeat``/``--only`` given together with their
       ``*-file`` counterpart;
    3. build a ``FileAnalyzer`` either from ``--state-file`` or by analyzing
       ``filename`` (the fresh analysis is then dumped to ``<filename>.state``);
    4. assemble per-table flatten options and run ``FileFlattener`` over the
       input file, reporting progress on the console.

    The meaning of each parameter is given by the help text of the matching
    option above.

    :raises click.BadParameter: an output location does not exist or an
        unknown table name was selected
    :raises click.UsageError: an option and its ``*-file`` variant are both set
    """
    if csv:
        csv = pathlib.Path(csv).resolve()
        if not csv.exists():
            raise click.BadParameter(_("Desired location {} does not exists").format(csv))
    if xlsx:
        xlsx = pathlib.Path(xlsx).resolve()
        if not xlsx.parent.exists():
            raise click.BadParameter(_("Desired location {} does not exists").format(xlsx.parent))

    path = pathlib.Path(filename)
    workdir = path.parent
    filename = path.name

    # Fall back to every known table when the user did not narrow the choice.
    selection = selection or ROOT_TABLES.keys()
    combine = combine or COMBINED_TABLES.keys()
    root_tables = get_selected_tables(ROOT_TABLES, selection)
    combined_tables = get_selected_tables(COMBINED_TABLES, combine)

    # Usage errors are checked before the analysis pass so the user is not
    # made to wait for a full file scan only to be told the options conflict.
    if unnest and unnest_file:
        raise click.UsageError(_("Conflicting options: unnest and unnest-file"))
    if repeat and repeat_file:
        raise click.UsageError(_("Conflicting options: repeat and repeat-file"))
    if only and only_file:
        raise click.UsageError(_("Conflicting options: only and only-file"))

    if state_file:
        click.secho(_("Restoring from provided state file"), bold=True)
        analyzer = FileAnalyzer(workdir, state_file=state_file)
    else:
        click.secho(_("State file not supplied, going to analyze input file first"), bold=True)
        analyzer = FileAnalyzer(
            workdir,
            schema=schema,
            root_tables=root_tables,
            combined_tables=combined_tables,
            language=language,
            table_threshold=threshold,
        )
        click.echo(_("Analyze options:"))
        for name, option in ("threshold", str(threshold)), ("language", language):
            click.echo(_(" - {:30} => {}").format(name, click.style(option, fg="cyan")))
        click.echo(_("Processing file: {}").format(click.style(str(path), fg="cyan")))
        total = path.stat().st_size
        progress = 0
        # ``number`` starts at -1 so the summary below reports 0 (instead of
        # failing on an unbound name) when the analyzer yields nothing.
        number = -1
        # Progress bar not showing with small files
        # https://github.com/pallets/click/pull/1296/files
        with click.progressbar(width=0, show_percent=True, show_pos=True, length=total) as bar:
            for read, number in analyzer.analyze_file(filename, with_preview=False):
                bar.label = ANALYZED_LABEL.format(click.style(str(number), fg="cyan"))
                # ``read`` is cumulative; the bar expects the increment.
                bar.update(read - progress)
                progress = read
        click.secho(
            _("Done processing. Analyzed objects: {}").format(click.style(str(number + 1), fg="red")), fg="green"
        )
        # ``filename`` is always the plain file name at this point.
        state_file = pathlib.Path(f"{filename}.state")
        state_file_path = workdir / state_file
        click.echo(_("Dumping analyzed data to '{}'").format(click.style(str(state_file_path.absolute()), fg="cyan")))
        analyzer.dump_to_file(state_file)

    click.echo(_("Flattening file: {}").format(click.style(str(path), fg="cyan")))

    if exclude:
        click.echo(_("Ignoring tables (excluded by user): {}").format(click.style(",".join(exclude), fg="red")))

    options = {"selection": {}, "count": count, "exclude": exclude}
    unnest = read_option_file(unnest, unnest_file)
    repeat = read_option_file(repeat, repeat_file)
    only = read_option_file(only, only_file)

    for name in list(selection) + list(combine):
        table = analyzer.spec[name]
        if table.total_rows == 0:
            click.echo(_("Ignoring empty table {}").format(click.style(name, fg="red")))
            continue

        table_options = {
            "split": table.splitted,
            "pretty_headers": human,
        }
        options["selection"][name] = table_options

        # Column-level options are only applied to non-combined tables.
        if not table.is_combined:
            unnest_in_table = [col for col in unnest if col in table.combined_columns]
            if unnest_in_table:
                click.echo(
                    _("Unnesting columns {} for table {}").format(
                        click.style(",".join(unnest_in_table), fg="cyan"), click.style(name, fg="cyan")
                    )
                )

            only_in_table = [col for col in only if col in table]
            if only_in_table:
                click.echo(
                    _("Using only columns {} for table {}").format(
                        click.style(",".join(only_in_table), fg="cyan"), click.style(name, fg="cyan")
                    )
                )

            repeat_in_table = [col for col in repeat if col in table]
            if repeat_in_table:
                click.echo(
                    _("Repeating columns {} in all child table of {}").format(
                        click.style(",".join(repeat_in_table), fg="cyan"), click.style(name, fg="cyan")
                    )
                )

            table_options["only"] = only_in_table
            table_options["repeat"] = repeat_in_table
            table_options["unnest"] = unnest_in_table

    options = FlattenOptions(**options)
    flattener = FileFlattener(
        workdir,
        options,
        analyzer,
        csv=csv,
        xlsx=xlsx,
        language=language,
    )

    click.echo(
        _("Going to export tables: {}").format(click.style(",".join(flattener.flattener.tables.keys()), fg="magenta"))
    )

    click.echo(_("Processed tables:"))
    for table_name, table in flattener.flattener.tables.items():
        msg = _(" - {:30} => {} rows") if table.is_root else _(" ---- {:27} => {} rows")
        message = msg.format(table_name, click.style(str(table.total_rows), fg="cyan"))
        click.echo(message)

    click.echo(_("Flattening input file"))
    # A dedicated counter keeps the ``count`` flag intact and makes the
    # summary report 0 when nothing was flattened.
    flattened = -1
    with click.progressbar(
        flattener.flatten_file(filename),
        length=analyzer.spec.total_items + 1,
        width=0,
        show_percent=True,
        show_pos=True,
    ) as bar:
        for flattened in bar:
            bar.label = FLATTENED_LABEL.format(click.style(str(flattened + 1), fg="cyan"))

    click.secho(
        _("Done flattening. Flattened objects: {}").format(click.style(str(flattened + 1), fg="red")), fg="green"
    )