diff --git a/django/website/chn_spreadsheet/importer.py b/django/website/chn_spreadsheet/importer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b829c8b9753c0c56658e5886d4dfd97e7010758
--- /dev/null
+++ b/django/website/chn_spreadsheet/importer.py
@@ -0,0 +1,202 @@
+import dateutil.parser
+from decimal import Decimal
+
+from django.utils.translation import ugettext as _
+from openpyxl import load_workbook
+
+from .models import SheetProfile
+
+
+class SheetImportException(Exception):
+    '''Raised with a translated message when a sheet cannot be imported.'''
+    pass
+
+
+def get_profile(label):
+    '''Return the profile of the SheetProfile labelled `label`.
+
+    Raises SheetImportException if no such SheetProfile exists.'''
+    try:
+        sheet_profile = SheetProfile.objects.get(label=label)
+    except SheetProfile.DoesNotExist:
+        error_msg = _('Misconfigured service. Source "%s" does not exist') % label
+        raise SheetImportException(error_msg)
+    return sheet_profile.profile
+
+
+def get_columns_map(col_list):
+    '''Map each column name to its {'type': ..., 'field': ...} spec.
+
+    This function assumes that column names are unique for spreadsheet.
+    If they are not, then you already have a problem.'''
+
+    # Python 2.7 (should be faster than a loop)
+    cols = {
+        column['name']: {
+            'type': column['type'],
+            'field': column['field']
+        } for column in col_list
+    }
+    return cols
+
+
+def get_rows_iterator(spreadsheet, file_format):
+    '''Return the rows of the first sheet of `spreadsheet`.
+
+    Only the 'excel' format is supported. Raises SheetImportException for
+    any other format, or when the workbook cannot be loaded.'''
+    if file_format == 'excel':
+        try:
+            wb = load_workbook(spreadsheet, read_only=True)
+            ws = wb[wb.sheetnames[0]]
+        except Exception:
+            # Catch Exception rather than everything, so KeyboardInterrupt
+            # and SystemExit are not reported as a bad file format.
+            error_msg = _('Expected excel file. Received file in an unrecognized format.')
+            raise SheetImportException(error_msg)
+        rows = ws.rows
+    else:
+        error_msg = _('Unsupported file format: %s') % file_format
+        raise SheetImportException(error_msg)
+    return rows
+
+
+def order_columns(profile_columns, first_row=None):
+    '''Return the column specs in the order cells appear in a row.
+
+    With a header row (`first_row`) its labels define the order, and an
+    unknown label raises SheetImportException. Otherwise the order of
+    `profile_columns` is used.'''
+    columns = []
+    if first_row:
+        col_map = get_columns_map(profile_columns)
+        for label in first_row:
+            try:
+                columns.append(col_map[label])
+            except (KeyError, TypeError):
+                # KeyError: label not in the profile; TypeError: unhashable
+                error_msg = _('Unknown column: %s') % label
+                raise SheetImportException(error_msg)
+    else:
+        columns = [d.copy() for d in profile_columns]
+        for col in columns:
+            del col['name']  # Unify with first row version
+
+    return columns
+
+
+def get_fields_and_types(columns):
+    '''Split column specs into parallel lists of fields and types.'''
+    fields = [col['field'] for col in columns]
+    types = [col['type'] for col in columns]
+    return fields, types
+
+
+def parse_date(value):
+    '''Return `value` as a date.
+
+    Strings are parsed day-first; any other value must already provide a
+    date() method (e.g. a datetime).'''
+    if isinstance(value, basestring):
+        date_time = dateutil.parser.parse(value, dayfirst=True)
+    else:
+        date_time = value
+
+    return date_time.date()
+
+
+def convert_value(value, type, row_number):
+    '''Convert `value` to the column `type`.
+
+    `row_number` only appears in error messages. Raises
+    SheetImportException for an unknown type or an unconvertible value.'''
+    converters = {
+        'text': lambda x: x,
+        'date': parse_date,
+        'integer': int,
+        'number': Decimal
+    }
+    if type not in converters:
+        raise SheetImportException(
+            _(u"Unknown data type '%s' on row %d ") % (type, row_number))
+    try:
+        return converters[type](value)
+    except Exception:
+        raise SheetImportException(
+            _(u"Can not process value '%s' of type '%s' on row %d ") %
+            (value, type, row_number))
+
+
+def normalize_row(raw_row):
+    '''Return the cell values of `raw_row` as a list.'''
+    # Unify difference between CSV and openpyxl cells
+    return [getattr(v, 'value', v) for v in raw_row]
+
+
+def process_rows(rows, profile_columns, skip_header=False):
+    '''Return one dict (field -> converted value) per data row of `rows`.'''
+    # If there is no header (skip_header=False), then use profile's order of
+    # columns, otherwise use header line to check mapping and define order
+    first_row = normalize_row(rows.next()) if skip_header else None
+    columns = order_columns(profile_columns, first_row)
+    # columns = [{'field': "...", 'type': "..."}, ...]
+
+    objects = []
+    for i, row in enumerate(rows, 2 if first_row else 1):
+        objects.append(process_row(row, columns, i))
+    return objects
+
+
+def process_row(row, columns, row_no):
+    '''Convert one row into a dict mapping field names to values.
+
+    `row_no` is passed on to the converters for their error messages.'''
+    values = normalize_row(row)
+    converters = map(
+        lambda value, col_spec: CellConverter(value, col_spec, row_no),
+        values, columns)
+    object_dict = {}
+    for converter in converters:
+        converter.add_to(object_dict)
+    return object_dict
+
+
+class CellConverter(object):
+    '''Converts a single cell value according to its column spec.'''
+
+    def __init__(self, value, col_spec, row_number=0):
+        self.value = value
+        self.type = col_spec['type']
+        self.field = col_spec['field']
+        # Only used in the error messages of convert_value
+        self.row_number = row_number
+
+    def add_to(self, object_dict):
+        '''Add the converted value to `object_dict` under this column's
+        field and return the dict; 'ignore' columns are skipped.'''
+        if self.type != 'ignore':
+            object_dict[self.field] = convert_value(
+                self.value, self.type, self.row_number)
+        return object_dict
+
+
+def save_rows(objects, data_type):
+    '''Return the number of `objects`; nothing is stored yet.'''
+    for obj in objects:
+        # TODO: presumably each obj should be saved as `data_type` here
+        pass
+    return len(objects)
+
+
+def store_spreadsheet(label, fobject):
+    '''Import spreadsheet `fobject` with the profile labelled `label`.
+
+    Returns the number of rows processed.'''
+    profile = get_profile(label)
+
+    file_format = profile.get('format')
+    skip_header = profile.get('skip_header', False)
+
+    rows = get_rows_iterator(fobject, file_format)
+    items = process_rows(rows, profile['columns'], skip_header)
+    return save_rows(items, 'message')
diff --git a/django/website/chn_spreadsheet/tests.py b/django/website/chn_spreadsheet/tests.py
index b70cde7204a125ac773a7336064b223c94c688d2..09fb521b65e319c9ba02dcc75b08a1e01c5fa141 100644
--- a/django/website/chn_spreadsheet/tests.py
+++ b/django/website/chn_spreadsheet/tests.py
@@ -176,10 +176,6 @@ def test_normalize_row_differences():
     assert result == [5, 'London', '1.1.2015']
 
 
-def test_normalize_row_works_with_none():
-    assert normalize_row(None) is None
-
-
 def __test_process_rows_without_or_with_header(with_header):
     def _rows_generator():
         rows = [
diff --git a/django/website/chn_spreadsheet/utils.py b/django/website/chn_spreadsheet/utils.py
index 32869d53a4e3e82accb62bd14c8237e5a279ec8b..39b707bd9df263089c1511722693cd07683e49bb 100644
--- a/django/website/chn_spreadsheet/utils.py
+++ b/django/website/chn_spreadsheet/utils.py
@@ -144,28 +144,21 @@ def convert_row(orig_values, types, row_number):
 
 
 def normalize_row(raw_row):
     # Unify difference between CSV and openpyxl cells
-    if raw_row:
-        row = []
-        for val in raw_row:
-            value = val.value if hasattr(val, "value") else val
-            row.append(value)
-        return row
-    return None
+    return [getattr(v, 'value', v) for v in raw_row]
 
 
 def process_rows(rows, profile_columns, skip_header=False):
     # If there is no header (skip_header=False), then use profile's order of
     # columns, otherwise use header line to check mapping and define order
-    first_row = rows.next() if skip_header else None
-    columns = order_columns(profile_columns, normalize_row(first_row))
+    first_row = normalize_row(rows.next()) if skip_header else None
+    columns = order_columns(profile_columns, first_row)
     fields, types = get_fields_and_types(columns)
 
     objects = []
-    for i, raw_row in enumerate(rows):
+    for i, raw_row in enumerate(rows, 2 if first_row else 1):
         row = normalize_row(raw_row)
-        row_num = i + 2 if first_row else i + 1
-        values = convert_row(row, types, row_num)
+        values = convert_row(row, types, i)
         obj = dict(zip(fields, values))
         objects.append(obj)
     return objects