"""Import uploaded spreadsheets and store their rows as messages.

``Importer`` reads a spreadsheet according to a source profile and hands
each converted row to ``create_message``; ``CellConverter`` turns a single
cell into a typed value according to its column specification.
"""
import datetime
from decimal import Decimal
import sys

import dateutil.parser
import pytz
from django.utils import six
from django.utils.timezone import is_naive
from django.utils.translation import ugettext as _
from openpyxl import load_workbook

from transport.data_layer_transport import create_message

from .models import SheetProfile


class SheetImportException(Exception):
    """Error raised when a spreadsheet cannot be imported."""
    pass


class Importer(object):
    """Reads a spreadsheet row by row and stores the rows as messages."""

    def get_profile(self, label):
        """Return the import profile for the source called ``label``.

        Raises SheetImportException when no SheetProfile with that label
        exists.  The stored profile itself is not used yet (see TODO): a
        hard-coded Geopoll profile is returned for every known label.
        """
        try:
            # Only the existence of the profile is checked for now.
            SheetProfile.objects.get(label=label)
        except SheetProfile.DoesNotExist:
            error_msg = _(
                'Misconfigured service. Source "%s" does not exist') % label
            raise SheetImportException(error_msg)

        # TODO: Revert to using database, i.e. return the ``profile``
        # attribute of the SheetProfile fetched above.

        return {
            "label": "geopoll",
            "name": "Geopoll",
            "format": "excel",
            "type": "message",
            "columns": [
                {
                    "name": "Province",
                    "type": "ignore",
                    "field": "ignore"
                },
                {
                    "name": "CreatedDate",
                    "type": "date",
                    "field": "timestamp",
                    "date_format": "%m/%d/%y"
                },
                {
                    "name": "AgeGroup",
                    "type": "ignore",
                    "field": "ignore"
                },
                {
                    "name": "QuestIO",
                    "type": "text",
                    "field": "body"
                }
            ],
            "skip_header": 1
        }

    def get_columns_map(self, col_list):
        '''Map each column name to its spec (type, field, date_format).

        This function assumes that column names are unique for spreadsheet.
        If they are not, then you already have a problem.'''
        columns_map = {}

        for column in col_list:
            col_dict = {
                'type': column['type'],
                'field': column['field']}

            # date_format is optional and only copied when present
            if 'date_format' in column:
                col_dict['date_format'] = column['date_format']

            columns_map[column['name']] = col_dict

        return columns_map

    def get_rows_iterator(self, spreadsheet, file_format):
        """Return the rows of the first worksheet of ``spreadsheet``.

        Only the 'excel' format is supported.  Raises SheetImportException
        for any other format, or when the workbook cannot be loaded.
        """
        if file_format != 'excel':
            error_msg = _('Unsupported file format: %s') % file_format
            raise SheetImportException(error_msg)

        try:
            wb = load_workbook(spreadsheet, read_only=True)
            ws = wb[wb.sheetnames[0]]
        except Exception:
            # Any load failure is reported as a bad upload; unlike a bare
            # except this lets KeyboardInterrupt/SystemExit through.
            error_msg = _('Expected excel file. Received file in an unrecognized format.')
            raise SheetImportException(error_msg)

        return ws.rows

    def order_columns(self, profile_columns, first_row=None):
        """Return column specs (without 'name') in spreadsheet order.

        With a header row the order follows the header labels and an
        unknown label raises SheetImportException; without one the order
        of ``profile_columns`` is used.
        """
        if first_row:
            col_map = self.get_columns_map(profile_columns)
            columns = []
            for label in first_row:
                try:
                    columns.append(col_map[label])
                except KeyError:
                    error_msg = _('Unknown column: %s') % label
                    raise SheetImportException(error_msg)
            return columns

        # Work on copies so the caller's profile is left untouched
        columns = [d.copy() for d in profile_columns]
        for col in columns:
            del col['name']  # Unify with first row version

        return columns

    def get_fields_and_types(self, columns):
        """Return two parallel lists: the fields and the types of columns."""
        fields = [col['field'] for col in columns]
        types = [col['type'] for col in columns]

        return fields, types

    def normalize_row(self, raw_row):
        """Return the plain values of a row of cells or of raw values."""
        # Unify difference between CSV and openpyxl cells
        return [getattr(v, 'value', v) for v in raw_row]

    def process_rows(self, rows, profile_columns, skip_header=False):
        """Convert every row into a dict of field name -> converted value.

        If there is no header (skip_header=False), then use profile's order
        of columns, otherwise use header line to check mapping and define
        order.  A SheetImportException raised for a row is re-raised with
        the row number appended to its message.
        """
        rows = iter(rows)  # accept any iterable, not only generators

        first_row = None
        if skip_header:
            try:
                header = next(rows)
            except StopIteration:
                # Without this an empty sheet leaks a bare StopIteration
                raise SheetImportException(_('Spreadsheet is empty'))
            first_row = self.normalize_row(header)

        columns = self.order_columns(profile_columns, first_row)
        # columns = [{'field': "...", 'type': "..."}, ...]

        objects = []
        for i, row in enumerate(rows, 2 if first_row else 1):
            try:
                objects.append(self.process_row(row, columns))
            except SheetImportException as e:
                # Same exception type and original traceback, with the row
                # number appended (equivalent of the three-argument raise).
                six.reraise(
                    type(e),
                    type(e)(e.message + 'in row %d ' % i),
                    sys.exc_info()[2])

        return objects

    def process_row(self, row, columns):
        """Convert one row into a dict, pairing values with column specs."""
        values = self.normalize_row(row)

        object_dict = {}
        for value, col_spec in zip(values, columns):
            CellConverter(value, col_spec).add_to(object_dict)

        return object_dict

    def save_rows(self, objects, data_type):
        """Pass each object to create_message and return how many were sent.

        ``data_type`` is currently not used.
        """
        for obj in objects:
            create_message(obj)

        return len(objects)

    def store_spreadsheet(self, label, fobject):
        """Import the spreadsheet ``fobject`` using the profile ``label``.

        Returns the number of saved rows; raises SheetImportException when
        the profile is unknown or the spreadsheet cannot be processed.
        """
        profile = self.get_profile(label)

        file_format = profile.get('format')
        skip_header = profile.get('skip_header', False)

        rows = self.get_rows_iterator(fobject, file_format)
        items = self.process_rows(rows, profile['columns'], skip_header)

        return self.save_rows(items, 'message')


class CellConverter(object):
    """Converts one cell value according to its column specification."""

    def __init__(self, value, col_spec):
        """Remember the raw value and the type/field/date_format of its
        column; 'date_format' is optional in ``col_spec``."""
        self.value = value
        self.type = col_spec['type']
        self.field = col_spec['field']
        self.date_format = col_spec.get('date_format', None)

    def add_to(self, object_dict):
        """Store the converted value under the column's field name, unless
        the column type is 'ignore'.  Returns ``object_dict``."""
        if self.type != 'ignore':
            object_dict[self.field] = self.convert_value()
        return object_dict

    def convert_value(self):
        """Return the value converted to the column type.

        Raises SheetImportException for an unknown type, and wraps any
        error raised by the conversion itself in a SheetImportException
        that keeps the original traceback.
        """
        converters = {
            'date': lambda x: self.convert_date(),
            'text': lambda x: x,
            'integer': int,
            'number': Decimal
        }
        if self.type not in converters:
            raise SheetImportException(
                _(u"Unknown data type '%s' ") % (self.type))
        try:
            return converters[self.type](self.value)
        except Exception as e:
            message = _("%s\nCan not process value '%s' of type '%s' ") % (e.message, self.value, self.type)
            # Equivalent of ``raise SheetImportException(message), None, tb``
            six.reraise(
                SheetImportException,
                SheetImportException(message),
                sys.exc_info()[2])

    def convert_date(self):
        """Return the value as a timezone-aware datetime.

        Strings are parsed first; other values are used as they are.
        Naive datetimes are localized to UTC.
        """
        if isinstance(self.value, basestring):
            date_time = self.parse_date()
        else:
            date_time = self.value

        if is_naive(date_time):
            date_time = pytz.utc.localize(date_time)

        return date_time

    def parse_date(self):
        """Parse the string value with the column's date format.

        Falls back to dateutil's parser when the value does not match the
        format.  Raises SheetImportException when no format is configured.
        """
        if self.date_format is None:
            raise SheetImportException(
                _(u"Date format not specified for '%s' ") %
                (self.field))

        try:
            date_time = datetime.datetime.strptime(self.value,
                                                   self.date_format)
        except (ValueError, TypeError):
            # Value does not match the configured format
            date_time = dateutil.parser.parse(self.value)

        return date_time
Source "unknownlabel" does not exist') -def test_get_columns_map(): +def test_get_columns_map(importer): expected_result = { 'Province': { 'type': 'location', @@ -64,24 +68,28 @@ def test_get_columns_map(): 'field': 'message.content' }, } - result = get_columns_map(COLUMN_LIST) + + result = importer.get_columns_map(COLUMN_LIST) + assert result == expected_result -def test_get_rows_iterator_raises_on_non_excel_files(): +def test_get_rows_iterator_raises_on_non_excel_files(importer): + with pytest.raises(SheetImportException) as excinfo: - get_rows_iterator('not_a_file', 'excel') + importer.get_rows_iterator('not_a_file', 'excel') assert excinfo.value.message == _('Expected excel file. Received file in an unrecognized format.') with pytest.raises(SheetImportException) as excinfo: - get_rows_iterator(None, 'pdf') + importer.get_rows_iterator(None, 'pdf') assert excinfo.value.message == _('Unsupported file format: pdf') -def test_get_rows_iterator_works_on_excel_files(): +def test_get_rows_iterator_works_on_excel_files(importer): + file_path = path.join(TEST_DIR, 'sample_excel.xlsx') f = open(file_path, 'rb') - rows = list(get_rows_iterator(f, 'excel')) + rows = list(importer.get_rows_iterator(f, 'excel')) # 2x2 spreadsheet assert len(rows) == 2 @@ -96,22 +104,23 @@ def _make_columns_row(column_list): return row -def test_order_columns_with_no_first_row_returns_original_order(): +def test_order_columns_with_no_first_row_returns_original_order(importer): expected = _make_columns_row(COLUMN_LIST) - ordered = order_columns(COLUMN_LIST) + ordered = importer.order_columns(COLUMN_LIST) assert ordered == expected -def test_order_columns_with_first_row_return_first_row_order(): +def test_order_columns_with_first_row_return_first_row_order(importer): cleaned = _make_columns_row(COLUMN_LIST) first_row = ['Message', 'Province'] - ordered = order_columns(COLUMN_LIST, first_row) + + ordered = importer.order_columns(COLUMN_LIST, first_row) assert ordered == [cleaned[1], cleaned[0]] 
-def test_get_fields_and_types(): - fields, types = get_fields_and_types(COLUMN_LIST) +def test_get_fields_and_types(importer): + fields, types = importer.get_fields_and_types(COLUMN_LIST) expected_types = ['location', 'text'] expected_fields = ['message.location', 'message.content'] @@ -119,30 +128,39 @@ def test_get_fields_and_types(): assert types == expected_types -def test_successful_runs_of_parse_date(): +def test_successful_runs_of_parse_date(importer): dates = ( - '05/01/2015', - '5.1.2015', - '5/1/15', - '05-01-2015', - datetime.datetime(2015, 1, 5, 0, 0) + ('05/01/2015', '%d/%m/%Y'), + ('5.1.2015', '%d.%m.%Y'), + ('5/1/15', '%d/%m/%y'), + ('05-01-2015', '%d-%m-%Y'), + (datetime.datetime(2015, 1, 5, 0, 0), None) ) - expected = datetime.date(2015, 1, 5) - for date in dates: - assert parse_date(date) == expected + expected = pytz.utc.localize(datetime.datetime(2015, 1, 5)) + for date, date_format in dates: + converter = CellConverter(date, + {'type': 'date', + 'field': '', + 'date_format': date_format}) + + assert converter.convert_value() == expected -def test_exception_raised_on_faulty_dates(): +def test_exception_raised_on_faulty_dates(importer): bad_date = '05x01-2015' - with pytest.raises(ValueError): - parse_date(bad_date) + with pytest.raises(SheetImportException): + converter = CellConverter(bad_date, + {'type': 'date', + 'field': '', + 'date_format': '%m-%d-%Y'}) + converter.convert_value() -def test_process_row(): +def test_process_row(importer): row = ['Short message', '5', '10.4', '1.5.2015', 'Something else'] number = decimal.Decimal('10.4') - date = datetime.date(2015, 5, 1) + date = pytz.utc.localize(datetime.datetime(2015, 5, 1)) columns = [ { @@ -163,7 +181,8 @@ def test_process_row(): { 'name': 'CreatedDate', 'field': 'created', - 'type': 'date' + 'type': 'date', + 'date_format': '%d.%m.%Y' }, { 'name': 'Province', @@ -172,7 +191,7 @@ def test_process_row(): } ] - converted = process_row(row, columns) + converted = importer.process_row(row, 
columns) assert converted == { 'message': 'Short message', 'age': 5, @@ -181,35 +200,54 @@ def test_process_row(): } -def test_convert_value_raises_on_unknown_type(): +def test_convert_value_raises_on_unknown_type(importer): value = 'Short message' type = 'location' + converter = CellConverter(value, {'type': type, 'field': ''}) with pytest.raises(SheetImportException) as excinfo: - convert_value(value, type) + converter.convert_value() assert excinfo.value.message == _(u"Unknown data type 'location' ") -def test_convert_value_raises_on_malformed_value(): +def test_convert_value_raises_on_malformed_value(importer): value = 'not_integer' type = 'integer' + converter = CellConverter(value, {'type': type, 'field': ''}) + with pytest.raises(SheetImportException) as excinfo: - convert_value(value, type) - assert excinfo.value.message == _(u"Can not process value 'not_integer' of type 'integer' ") + converter.convert_value() + + messages = excinfo.value.message.split('\n') + assert _(u"Can not process value 'not_integer' of type 'integer' ") in messages + +def test_convert_value_raises_on_date_without_format(importer): + value = '1.5.2015' -def test_normalize_row_differences(): + converter = CellConverter(value, { + 'type': 'date', + 'field': 'created'}) + + with pytest.raises(SheetImportException) as excinfo: + converter.convert_value() + + messages = excinfo.value.message.split('\n') + assert _(u"Date format not specified for 'created' ") in messages + + +def test_normalize_row_differences(importer): class Cell(object): def __init__(self, value): self.value = value row = [5, 'London', Cell('1.1.2015')] - result = normalize_row(row) + result = importer.normalize_row(row) assert result == [5, 'London', '1.1.2015'] -def __test_process_rows_without_or_with_header(with_header): +def __test_process_rows_without_or_with_header(importer, with_header): def _rows_generator(): rows = [ ('Province', 'Message'), @@ -225,7 +263,7 @@ def 
__test_process_rows_without_or_with_header(with_header): columns[0]['type'] = 'text' rows = _rows_generator() - objects = process_rows(rows, columns, with_header) + objects = importer.process_rows(rows, columns, with_header) expected_objects = [ { 'message.location': 'London', @@ -240,15 +278,15 @@ def __test_process_rows_without_or_with_header(with_header): assert objects == expected_objects -def test_process_rows_without_header(): - __test_process_rows_without_or_with_header(False) +def test_process_rows_without_header(importer): + __test_process_rows_without_or_with_header(importer, False) -def test_process_rows_with_header(): - __test_process_rows_without_or_with_header(True) +def test_process_rows_with_header(importer): + __test_process_rows_without_or_with_header(importer, True) -def test_process_rows_displays_line_number_on_error(): +def test_process_rows_displays_line_number_on_error(importer): def _rows_generator(): rows = [ ('Province', 'Message'), @@ -265,22 +303,26 @@ def test_process_rows_displays_line_number_on_error(): with_header = True with pytest.raises(SheetImportException) as excinfo: - process_rows(rows, columns, with_header) + importer.process_rows(rows, columns, with_header) assert excinfo.value.message == _(u"Unknown data type 'location' in row 2 ") assert len(excinfo.traceback) > 2, "Was expecting traceback of more than 2 lines" @pytest.mark.django_db -def test_items_imported(): +def test_items_imported(importer): items = Message.objects.all() assert len(items) == 0 file_path = path.join(TEST_DIR, 'sample_geopoll.xlsx') f = open(file_path, 'rb') - num_saved = store_spreadsheet('geopoll', f) + num_saved = importer.store_spreadsheet('geopoll', f) assert num_saved > 0 items = Message.objects.all() assert len(items) > 0 + + assert items[0].body == "What is the cuse of ebola?" 
+ assert items[0].timestamp == pytz.utc.localize( + datetime.datetime(2015, 5, 1)) diff --git a/django/website/chn_spreadsheet/utils.py b/django/website/chn_spreadsheet/utils.py deleted file mode 100644 index 42545a0bc1594eb5d6e164b8feb2c7b49891aee4..0000000000000000000000000000000000000000 --- a/django/website/chn_spreadsheet/utils.py +++ /dev/null @@ -1,198 +0,0 @@ -import dateutil.parser -from decimal import Decimal -import sys - -from django.utils.translation import ugettext as _ -from openpyxl import load_workbook - -from transport.data_layer_transport import create_message - -from .models import SheetProfile - - -class SheetImportException(Exception): - pass - - -def get_profile(label): - try: - sheet_profile = SheetProfile.objects.get(label=label) - except SheetProfile.DoesNotExist: - error_msg = _('Misconfigured service. Source "%s" does not exist') % label - raise SheetImportException(error_msg) - - # TODO: Revert to using database - # return sheet_profile.profile - - return { - "label": "geopoll", - "name": "Geopoll", - "format": "excel", - "type": "message", - "columns": [ - { - "name": "Province", - "type": "ignore", - "field": "ignore" - }, - { - "name": "CreatedDate", - "type": "date", - "field": "timestamp" - }, - { - "name": "AgeGroup", - "type": "ignore", - "field": "ignore" - }, - { - "name": "QuestIO", - "type": "text", - "field": "body" - } - ], - "skip_header": 1 - } - - -def get_columns_map(col_list): - '''This function assumes that column names are unique for spreadsheet. - If they are not, then you already have a problem.''' - - # Python 2.7 (should be faster than a loop) - cols = { - column['name']: { - 'type': column['type'], - 'field': column['field'] - } for column in col_list - } - return cols - - -def get_rows_iterator(spreadsheet, file_format): - if file_format == 'excel': - try: - wb = load_workbook(spreadsheet, read_only=True) - ws = wb[wb.sheetnames[0]] - except: - error_msg = _('Expected excel file. 
Received file in an unrecognized format.') - raise SheetImportException(error_msg) - rows = ws.rows - else: - error_msg = _('Unsupported file format: %s') % file_format - raise SheetImportException(error_msg) - return rows - - -def order_columns(profile_columns, first_row=None): - columns = [] - if first_row: - col_map = get_columns_map(profile_columns) - for label in first_row: - try: - columns.append(col_map[label]) - except: - error_msg = _('Unknown column: %s') % label - raise SheetImportException(error_msg) - else: - columns = [d.copy() for d in profile_columns] - for col in columns: - del col['name'] # Unify with first row version - - return columns - - -def get_fields_and_types(columns): - fields = [col['field'] for col in columns] - types = [col['type'] for col in columns] - return fields, types - - -def parse_date(value): - if isinstance(value, basestring): - date_time = dateutil.parser.parse(value, dayfirst=True) - else: - date_time = value - - return date_time.date() - - -def convert_value(value, type): - converters = { - 'text': lambda x: x, - 'date': parse_date, - 'integer': lambda x: int(x), - 'number': lambda x: Decimal(x) - } - if type not in converters: - raise SheetImportException( - _(u"Unknown data type '%s' ") % (type)) - try: - return converters[type](value) - except: - raise SheetImportException( - _(u"Can not process value '%s' of type '%s' ") % - (value, type)) - - -def normalize_row(raw_row): - # Unify difference between CSV and openpyxl cells - return [getattr(v, 'value', v) for v in raw_row] - - -def process_rows(rows, profile_columns, skip_header=False): - # If there is no header (skip_header=False), then use profile's order of - # columns, otherwise use header line to check mapping and define order - first_row = normalize_row(rows.next()) if skip_header else None - columns = order_columns(profile_columns, first_row) - # columns = [{'field': "...", 'type': "..."}, ...] 
- - objects = [] - for i, row in enumerate(rows, 2 if first_row else 1): - try: - objects.append(process_row(row, columns)) - except SheetImportException as e: - raise type(e), type(e)(e.message + - 'in row %d ' % i), sys.exc_info()[2] - - return objects - - -def process_row(row, columns): - values = normalize_row(row) - return reduce( - lambda object_dict, converter: converter.add_to(object_dict), - map(CellConverter, values, columns), - {} - ) - - -class CellConverter(object): - def __init__(self, value, col_spec): - self.value = value - self.type = col_spec['type'] - self.field = col_spec['field'] - - def add_to(self, object_dict): - if self.type != 'ignore': - object_dict[self.field] = convert_value( - self.value, self.type) - return object_dict - - -def save_rows(objects, data_type): - for obj in objects: - create_message(obj) - - return len(objects) - - -def store_spreadsheet(label, fobject): - profile = get_profile(label) - - file_format = profile.get('format') - skip_header = profile.get('skip_header', False) - - rows = get_rows_iterator(fobject, file_format) - items = process_rows(rows, profile['columns'], skip_header) - return save_rows(items, 'message') diff --git a/django/website/hid/views.py b/django/website/hid/views.py index ddb3842451be5f7fdd7180ed426ba3443dcca1b7..89b3f6488670a8fc6e4d5d7b74182908d1dffb07 100644 --- a/django/website/hid/views.py +++ b/django/website/hid/views.py @@ -8,7 +8,7 @@ from django.views.generic.base import TemplateView from django_tables2 import SingleTableView -from chn_spreadsheet.utils import store_spreadsheet, SheetImportException +from chn_spreadsheet.importer import Importer, SheetImportException import transport from .forms import UploadForm, get_spreadsheet_choices from .tables import ItemTable @@ -46,10 +46,12 @@ class UploadSpreadsheetView(FormView): uploaded_file = data['file'] try: - saved = store_spreadsheet(source, uploaded_file) + importer = Importer() + saved = importer.store_spreadsheet(source, 
uploaded_file) msg = ungettext("Upload successful! %d entry has been added.", "Upload successful! %d entries have been added.", saved) % saved + messages.success(self.request, msg) except SheetImportException as exc: msg = exc.message