Skip to content
Snippets Groups Projects
Commit 1ebb1afd authored by Marko Samastur's avatar Marko Samastur
Browse files

Merge branch 'staging' into batch_categories_ui

parents 0312f201 90e2cc69
No related branches found
No related tags found
No related merge requests found
......@@ -13,7 +13,7 @@ MySQL-python==1.2.5
pytest-django==2.8.0
mock
py==1.4.29
django-jenkins==0.17.0
......@@ -26,7 +26,7 @@ python-dateutil==2.4.2
### Apps Aptivate likes that we may want to use
## ---------Search
## ---------Search
#django-haystack
#pyelasticsearch
......
import dateutil.parser
from decimal import Decimal
import datetime
import pytz
import sys
from django.utils.timezone import is_naive
from django.utils.translation import ugettext as _
from openpyxl import load_workbook
from transport.data_layer_transport import create_message
from .models import SheetProfile
class SheetImportException(Exception):
    """Error raised by the spreadsheet import code in this module."""
class Importer(object):
    """Import the rows of an uploaded spreadsheet as messages.

    ``store_spreadsheet`` is the entry point: it looks up the profile of the
    source, reads the rows of the uploaded file, converts every row into a
    dict of field values and hands each dict to ``create_message``.
    """

    def get_profile(self, label):
        """Return the import profile for the source called ``label``.

        The label must belong to an existing ``SheetProfile``, but the
        profile stored with it is not used yet: the same hard-coded
        "geopoll" profile is returned for every known label.

        Raises:
            SheetImportException: if no ``SheetProfile`` has this label.
        """
        try:
            # Only the existence of the profile matters for now.
            SheetProfile.objects.get(label=label)
        except SheetProfile.DoesNotExist:
            error_msg = _('Misconfigured service. Source "%s" does not exist') % label
            raise SheetImportException(error_msg)

        # TODO: Revert to using database, i.e. return the ``profile``
        # attribute of the SheetProfile fetched above.
        return {
            "label": "geopoll",
            "name": "Geopoll",
            "format": "excel",
            "type": "message",
            "columns": [
                {
                    "name": "Province",
                    "type": "ignore",
                    "field": "ignore"
                },
                {
                    "name": "CreatedDate",
                    "type": "date",
                    "field": "timestamp",
                    "date_format": "%m/%d/%y"
                },
                {
                    "name": "AgeGroup",
                    "type": "ignore",
                    "field": "ignore"
                },
                {
                    "name": "QuestIO",
                    "type": "text",
                    "field": "body"
                }
            ],
            "skip_header": 1
        }

    def get_columns_map(self, col_list):
        """Map each column name to its spec without the ``name`` key.

        This function assumes that column names are unique for spreadsheet.
        If they are not, then you already have a problem: a later column
        overwrites an earlier one with the same name.
        """
        columns_map = {}
        for column in col_list:
            col_dict = {
                'type': column['type'],
                'field': column['field'],
            }
            # date_format is optional and only copied when present.
            if 'date_format' in column:
                col_dict['date_format'] = column['date_format']
            columns_map[column['name']] = col_dict
        return columns_map

    def get_rows_iterator(self, spreadsheet, file_format):
        """Return the rows of the first sheet of ``spreadsheet``.

        Only the ``'excel'`` format is supported; the workbook is opened
        with openpyxl in read-only mode.

        Raises:
            SheetImportException: if the format is not supported or the
                file can not be opened as an excel workbook.
        """
        if file_format != 'excel':
            error_msg = _('Unsupported file format: %s') % file_format
            raise SheetImportException(error_msg)

        try:
            wb = load_workbook(spreadsheet, read_only=True)
            ws = wb[wb.sheetnames[0]]
        except Exception:
            # Not a bare "except:", so that KeyboardInterrupt and SystemExit
            # are not reported as a broken upload.
            error_msg = _('Expected excel file. Received file in an unrecognized format.')
            raise SheetImportException(error_msg)
        return ws.rows

    def order_columns(self, profile_columns, first_row=None):
        """Return the column specs in the order the cells of a row arrive.

        With a header row (``first_row``) the specs follow the order of the
        header labels; without one they follow the order of the profile.
        In both cases the returned dicts have no ``name`` key and
        ``profile_columns`` itself is left untouched.

        Raises:
            SheetImportException: if a header label is not in the profile.
        """
        if first_row:
            col_map = self.get_columns_map(profile_columns)
            columns = []
            for label in first_row:
                try:
                    columns.append(col_map[label])
                except (KeyError, TypeError):
                    # KeyError: label not in the profile. TypeError: label
                    # is not hashable and so can not be a known column.
                    error_msg = _('Unknown column: %s') % label
                    raise SheetImportException(error_msg)
            return columns

        columns = [d.copy() for d in profile_columns]
        for col in columns:
            del col['name']  # Unify with first row version
        return columns

    def get_fields_and_types(self, columns):
        """Return two parallel lists: the fields and the types of ``columns``."""
        fields = [col['field'] for col in columns]
        types = [col['type'] for col in columns]
        return fields, types

    def normalize_row(self, raw_row):
        """Return the plain values of a row.

        Unifies the difference between CSV and openpyxl cells: an item with
        a ``value`` attribute is replaced by that attribute, anything else
        is kept as it is.
        """
        return [getattr(v, 'value', v) for v in raw_row]

    def process_rows(self, rows, profile_columns, skip_header=False):
        """Convert all ``rows`` into a list of dicts of field values.

        If there is no header (``skip_header`` is false), the profile's
        order of columns is used, otherwise the header line is used to
        check the mapping and to define the order. A spreadsheet that
        should have a header but has no rows at all yields an empty list.

        Raises:
            SheetImportException: if a row can not be converted; the
                message of the original error is extended with the row
                number and the original traceback is kept.
        """
        rows = iter(rows)  # accept any iterable, not only iterators
        first_row = None
        if skip_header:
            try:
                header = next(rows)
            except StopIteration:
                # Empty spreadsheet: nothing to import.
                return []
            first_row = self.normalize_row(header)

        columns = self.order_columns(profile_columns, first_row)
        # columns = [{'field': "...", 'type': "..."}, ...]

        # Row numbers are 1-based and count the header line when there is
        # one, even if that header line turned out to be empty.
        first_data_row = 2 if skip_header else 1
        objects = []
        for i, row in enumerate(rows, first_data_row):
            try:
                objects.append(self.process_row(row, columns))
            except SheetImportException as e:
                # Same effect as the three-argument raise statement, but in
                # a form that is also valid syntax on Python 3.
                from django.utils import six
                six.reraise(
                    type(e),
                    type(e)(e.message + 'in row %d ' % i),
                    sys.exc_info()[2])
        return objects

    def process_row(self, row, columns):
        """Convert one row into a dict of field values.

        Cells and column specs are paired by position; surplus cells or
        surplus specs are ignored.
        """
        values = self.normalize_row(row)
        converters = [CellConverter(val, col)
                      for val, col in zip(values, columns)]
        object_dict = {}
        for converter in converters:
            object_dict = converter.add_to(object_dict)
        return object_dict

    def save_rows(self, objects, data_type):
        """Pass every object to ``create_message`` and return their number.

        ``data_type`` is accepted but not used.
        """
        for obj in objects:
            create_message(obj)
        return len(objects)

    def store_spreadsheet(self, label, fobject):
        """Import the spreadsheet ``fobject`` of source ``label``.

        Returns the number of saved objects.
        """
        profile = self.get_profile(label)

        file_format = profile.get('format')
        skip_header = profile.get('skip_header', False)

        rows = self.get_rows_iterator(fobject, file_format)
        items = self.process_rows(rows, profile['columns'], skip_header)
        return self.save_rows(items, 'message')
class CellConverter(object):
    """Convert the value of one cell as described by its column spec.

    ``col_spec`` must have the keys ``type`` and ``field`` and may have a
    ``date_format`` (a ``strptime`` format used for dates given as text).
    """

    def __init__(self, value, col_spec):
        self.value = value
        self.type = col_spec['type']
        self.field = col_spec['field']
        self.date_format = col_spec.get('date_format', None)

    def add_to(self, object_dict):
        """Store the converted value under ``field`` in ``object_dict``.

        Cells of type ``'ignore'`` are skipped. The same dict is returned.
        """
        if self.type != 'ignore':
            object_dict[self.field] = self.convert_value()
        return object_dict

    def convert_value(self):
        """Return the value converted to the type of its column.

        Raises:
            SheetImportException: if the type is unknown or the conversion
                fails. In the latter case the message starts with the
                message of the original error and the original traceback
                is kept.
        """
        converters = {
            'date': lambda value: self.convert_date(),
            'text': lambda value: value,
            'integer': int,
            'number': Decimal,
        }
        if self.type not in converters:
            raise SheetImportException(
                _(u"Unknown data type '%s' ") % (self.type))
        try:
            return converters[self.type](self.value)
        except Exception as e:
            message = _("%s\nCan not process value '%s' of type '%s' ") % (e.message, self.value, self.type)
            # Same effect as the three-argument raise statement, but in a
            # form that is also valid syntax on Python 3.
            from django.utils import six
            six.reraise(
                SheetImportException,
                SheetImportException(message),
                sys.exc_info()[2])

    def convert_date(self):
        """Return the value as a timezone-aware datetime.

        Text is parsed with ``parse_date``; any other value is used as it
        is. A naive result is localized as UTC.
        """
        if isinstance(self.value, basestring):
            date_time = self.parse_date()
        else:
            date_time = self.value

        if is_naive(date_time):
            date_time = pytz.utc.localize(date_time)
        return date_time

    def parse_date(self):
        """Parse the text value into a datetime.

        The ``date_format`` of the column is tried first; if the text does
        not match it, ``dateutil`` is asked to parse the text instead.

        Raises:
            SheetImportException: if the column has no ``date_format``.
        """
        if self.date_format is None:
            raise SheetImportException(
                _(u"Date format not specified for '%s' ") %
                (self.field))

        try:
            date_time = datetime.datetime.strptime(self.value,
                                                   self.date_format)
        except ValueError:
            # strptime signals a mismatch with ValueError; anything else is
            # a real problem and must not be hidden by the fallback.
            date_time = dateutil.parser.parse(self.value)
        return date_time
......@@ -2,14 +2,12 @@ import datetime
import decimal
from os import path
import pytest
import pytz
from django.utils.translation import ugettext as _
from .utils import (
convert_value,
get_profile, get_columns_map, order_columns, get_fields_and_types,
parse_date, normalize_row, get_rows_iterator, process_row, process_rows,
store_spreadsheet,
from .importer import (
CellConverter, Importer,
SheetProfile, SheetImportException
)
......@@ -34,26 +32,32 @@ COLUMN_LIST = [
]
@pytest.fixture
def importer():
return Importer()
@pytest.mark.django_db
@pytest.mark.xfail
def test_get_profile_returns_profile():
def test_get_profile_returns_profile(importer):
label = "unknownpoll"
profile = {'name': 'Empty profile'}
SheetProfile.objects.create(label=label, profile=profile)
sprofile = get_profile(label)
sprofile = importer.get_profile(label)
assert sprofile == profile
@pytest.mark.django_db
def test_get_profile_raises_on_unknown_label():
def test_get_profile_raises_on_unknown_label(importer):
with pytest.raises(SheetImportException) as excinfo:
get_profile('unknownlabel')
importer.get_profile('unknownlabel')
assert excinfo.value.message == _('Misconfigured service. Source "unknownlabel" does not exist')
def test_get_columns_map():
def test_get_columns_map(importer):
expected_result = {
'Province': {
'type': 'location',
......@@ -64,24 +68,28 @@ def test_get_columns_map():
'field': 'message.content'
},
}
result = get_columns_map(COLUMN_LIST)
result = importer.get_columns_map(COLUMN_LIST)
assert result == expected_result
def test_get_rows_iterator_raises_on_non_excel_files():
def test_get_rows_iterator_raises_on_non_excel_files(importer):
with pytest.raises(SheetImportException) as excinfo:
get_rows_iterator('not_a_file', 'excel')
importer.get_rows_iterator('not_a_file', 'excel')
assert excinfo.value.message == _('Expected excel file. Received file in an unrecognized format.')
with pytest.raises(SheetImportException) as excinfo:
get_rows_iterator(None, 'pdf')
importer.get_rows_iterator(None, 'pdf')
assert excinfo.value.message == _('Unsupported file format: pdf')
def test_get_rows_iterator_works_on_excel_files():
def test_get_rows_iterator_works_on_excel_files(importer):
file_path = path.join(TEST_DIR, 'sample_excel.xlsx')
f = open(file_path, 'rb')
rows = list(get_rows_iterator(f, 'excel'))
rows = list(importer.get_rows_iterator(f, 'excel'))
# 2x2 spreadsheet
assert len(rows) == 2
......@@ -96,22 +104,23 @@ def _make_columns_row(column_list):
return row
def test_order_columns_with_no_first_row_returns_original_order():
def test_order_columns_with_no_first_row_returns_original_order(importer):
expected = _make_columns_row(COLUMN_LIST)
ordered = order_columns(COLUMN_LIST)
ordered = importer.order_columns(COLUMN_LIST)
assert ordered == expected
def test_order_columns_with_first_row_return_first_row_order():
def test_order_columns_with_first_row_return_first_row_order(importer):
cleaned = _make_columns_row(COLUMN_LIST)
first_row = ['Message', 'Province']
ordered = order_columns(COLUMN_LIST, first_row)
ordered = importer.order_columns(COLUMN_LIST, first_row)
assert ordered == [cleaned[1], cleaned[0]]
def test_get_fields_and_types():
fields, types = get_fields_and_types(COLUMN_LIST)
def test_get_fields_and_types(importer):
fields, types = importer.get_fields_and_types(COLUMN_LIST)
expected_types = ['location', 'text']
expected_fields = ['message.location', 'message.content']
......@@ -119,30 +128,39 @@ def test_get_fields_and_types():
assert types == expected_types
def test_successful_runs_of_parse_date():
def test_successful_runs_of_parse_date(importer):
dates = (
'05/01/2015',
'5.1.2015',
'5/1/15',
'05-01-2015',
datetime.datetime(2015, 1, 5, 0, 0)
('05/01/2015', '%d/%m/%Y'),
('5.1.2015', '%d.%m.%Y'),
('5/1/15', '%d/%m/%y'),
('05-01-2015', '%d-%m-%Y'),
(datetime.datetime(2015, 1, 5, 0, 0), None)
)
expected = datetime.date(2015, 1, 5)
for date in dates:
assert parse_date(date) == expected
expected = pytz.utc.localize(datetime.datetime(2015, 1, 5))
for date, date_format in dates:
converter = CellConverter(date,
{'type': 'date',
'field': '',
'date_format': date_format})
assert converter.convert_value() == expected
def test_exception_raised_on_faulty_dates():
def test_exception_raised_on_faulty_dates(importer):
bad_date = '05x01-2015'
with pytest.raises(ValueError):
parse_date(bad_date)
with pytest.raises(SheetImportException):
converter = CellConverter(bad_date,
{'type': 'date',
'field': '',
'date_format': '%m-%d-%Y'})
converter.convert_value()
def test_process_row():
def test_process_row(importer):
row = ['Short message', '5', '10.4', '1.5.2015', 'Something else']
number = decimal.Decimal('10.4')
date = datetime.date(2015, 5, 1)
date = pytz.utc.localize(datetime.datetime(2015, 5, 1))
columns = [
{
......@@ -163,7 +181,8 @@ def test_process_row():
{
'name': 'CreatedDate',
'field': 'created',
'type': 'date'
'type': 'date',
'date_format': '%d.%m.%Y'
},
{
'name': 'Province',
......@@ -172,7 +191,7 @@ def test_process_row():
}
]
converted = process_row(row, columns)
converted = importer.process_row(row, columns)
assert converted == {
'message': 'Short message',
'age': 5,
......@@ -181,35 +200,54 @@ def test_process_row():
}
def test_convert_value_raises_on_unknown_type():
def test_convert_value_raises_on_unknown_type(importer):
value = 'Short message'
type = 'location'
converter = CellConverter(value, {'type': type, 'field': ''})
with pytest.raises(SheetImportException) as excinfo:
convert_value(value, type)
converter.convert_value()
assert excinfo.value.message == _(u"Unknown data type 'location' ")
def test_convert_value_raises_on_malformed_value():
def test_convert_value_raises_on_malformed_value(importer):
value = 'not_integer'
type = 'integer'
converter = CellConverter(value, {'type': type, 'field': ''})
with pytest.raises(SheetImportException) as excinfo:
convert_value(value, type)
assert excinfo.value.message == _(u"Can not process value 'not_integer' of type 'integer' ")
converter.convert_value()
messages = excinfo.value.message.split('\n')
assert _(u"Can not process value 'not_integer' of type 'integer' ") in messages
def test_convert_value_raises_on_date_without_format(importer):
value = '1.5.2015'
def test_normalize_row_differences():
converter = CellConverter(value, {
'type': 'date',
'field': 'created'})
with pytest.raises(SheetImportException) as excinfo:
converter.convert_value()
messages = excinfo.value.message.split('\n')
assert _(u"Date format not specified for 'created' ") in messages
def test_normalize_row_differences(importer):
class Cell(object):
def __init__(self, value):
self.value = value
row = [5, 'London', Cell('1.1.2015')]
result = normalize_row(row)
result = importer.normalize_row(row)
assert result == [5, 'London', '1.1.2015']
def __test_process_rows_without_or_with_header(with_header):
def __test_process_rows_without_or_with_header(importer, with_header):
def _rows_generator():
rows = [
('Province', 'Message'),
......@@ -225,7 +263,7 @@ def __test_process_rows_without_or_with_header(with_header):
columns[0]['type'] = 'text'
rows = _rows_generator()
objects = process_rows(rows, columns, with_header)
objects = importer.process_rows(rows, columns, with_header)
expected_objects = [
{
'message.location': 'London',
......@@ -240,15 +278,15 @@ def __test_process_rows_without_or_with_header(with_header):
assert objects == expected_objects
def test_process_rows_without_header():
__test_process_rows_without_or_with_header(False)
def test_process_rows_without_header(importer):
__test_process_rows_without_or_with_header(importer, False)
def test_process_rows_with_header():
__test_process_rows_without_or_with_header(True)
def test_process_rows_with_header(importer):
__test_process_rows_without_or_with_header(importer, True)
def test_process_rows_displays_line_number_on_error():
def test_process_rows_displays_line_number_on_error(importer):
def _rows_generator():
rows = [
('Province', 'Message'),
......@@ -265,22 +303,26 @@ def test_process_rows_displays_line_number_on_error():
with_header = True
with pytest.raises(SheetImportException) as excinfo:
process_rows(rows, columns, with_header)
importer.process_rows(rows, columns, with_header)
assert excinfo.value.message == _(u"Unknown data type 'location' in row 2 ")
assert len(excinfo.traceback) > 2, "Was expecting traceback of more than 2 lines"
@pytest.mark.django_db
def test_items_imported():
def test_items_imported(importer):
items = Message.objects.all()
assert len(items) == 0
file_path = path.join(TEST_DIR, 'sample_geopoll.xlsx')
f = open(file_path, 'rb')
num_saved = store_spreadsheet('geopoll', f)
num_saved = importer.store_spreadsheet('geopoll', f)
assert num_saved > 0
items = Message.objects.all()
assert len(items) > 0
assert items[0].body == "What is the cuse of ebola?"
assert items[0].timestamp == pytz.utc.localize(
datetime.datetime(2015, 5, 1))
import dateutil.parser
from decimal import Decimal
import sys
from django.utils.translation import ugettext as _
from openpyxl import load_workbook
from transport.data_layer_transport import create_message
from .models import SheetProfile
class SheetImportException(Exception):
    """Raised by the functions of this module when an import fails."""
def get_profile(label):
    """Return the import profile for the source called ``label``.

    The label has to belong to an existing ``SheetProfile``; otherwise a
    ``SheetImportException`` is raised. The stored profile itself is not
    used yet, every known label gets the same hard-coded "geopoll" profile.
    """
    try:
        # Existence check only; the fetched object is not needed yet.
        SheetProfile.objects.get(label=label)
    except SheetProfile.DoesNotExist:
        raise SheetImportException(
            _('Misconfigured service. Source "%s" does not exist') % label)

    # TODO: Revert to using database, i.e. return the ``profile`` attribute
    # of the SheetProfile fetched above.
    columns = [
        {"name": "Province", "type": "ignore", "field": "ignore"},
        {"name": "CreatedDate", "type": "date", "field": "timestamp"},
        {"name": "AgeGroup", "type": "ignore", "field": "ignore"},
        {"name": "QuestIO", "type": "text", "field": "body"},
    ]
    return {
        "label": "geopoll",
        "name": "Geopoll",
        "format": "excel",
        "type": "message",
        "columns": columns,
        "skip_header": 1,
    }
def get_columns_map(col_list):
    """Map each column name to a dict with the column's type and field.

    This function assumes that column names are unique for spreadsheet.
    If they are not, then you already have a problem: the last column
    with a given name wins.
    """
    columns_by_name = {}
    for column in col_list:
        columns_by_name[column['name']] = {
            'type': column['type'],
            'field': column['field'],
        }
    return columns_by_name
def get_rows_iterator(spreadsheet, file_format):
    """Return the rows of the first sheet of ``spreadsheet``.

    Only the ``'excel'`` format is supported; the workbook is opened with
    openpyxl in read-only mode.

    Raises:
        SheetImportException: if the format is not supported or the file
            can not be opened as an excel workbook.
    """
    if file_format != 'excel':
        error_msg = _('Unsupported file format: %s') % file_format
        raise SheetImportException(error_msg)

    try:
        wb = load_workbook(spreadsheet, read_only=True)
        ws = wb[wb.sheetnames[0]]
    except Exception:
        # Not a bare "except:", so that KeyboardInterrupt and SystemExit
        # are not reported as a broken upload.
        error_msg = _('Expected excel file. Received file in an unrecognized format.')
        raise SheetImportException(error_msg)
    return ws.rows
def order_columns(profile_columns, first_row=None):
    """Return the column specs in the order the cells of a row arrive.

    With a header row (``first_row``) the specs follow the order of the
    header labels; without one they follow the order of the profile. In
    both cases the returned dicts have no ``name`` key and
    ``profile_columns`` itself is left untouched.

    Raises:
        SheetImportException: if a header label is not in the profile.
    """
    if first_row:
        col_map = get_columns_map(profile_columns)
        columns = []
        for label in first_row:
            try:
                columns.append(col_map[label])
            except (KeyError, TypeError):
                # KeyError: label not in the profile. TypeError: label is
                # not hashable and so can not be a known column.
                error_msg = _('Unknown column: %s') % label
                raise SheetImportException(error_msg)
        return columns

    columns = [d.copy() for d in profile_columns]
    for col in columns:
        del col['name']  # Unify with first row version
    return columns
def get_fields_and_types(columns):
    """Return two parallel lists: the fields and the types of ``columns``."""
    def pluck(key):
        # One full pass over the columns per key.
        return [col[key] for col in columns]

    return pluck('field'), pluck('type')
def parse_date(value):
    """Return the date part of ``value``.

    Text is parsed by ``dateutil`` with the day expected before the month;
    any other value is expected to offer a ``date()`` method itself.
    """
    if not isinstance(value, basestring):
        return value.date()
    parsed = dateutil.parser.parse(value, dayfirst=True)
    return parsed.date()
def convert_value(value, type):
    """Return ``value`` converted to the data type called ``type``.

    Known types are ``'text'`` (value returned as it is), ``'date'``
    (``parse_date``), ``'integer'`` (``int``) and ``'number'``
    (``Decimal``).

    Raises:
        SheetImportException: if the type is unknown or the value can not
            be converted.
    """
    converters = {
        'text': lambda x: x,
        'date': parse_date,
        'integer': int,
        'number': Decimal,
    }
    if type not in converters:
        raise SheetImportException(
            _(u"Unknown data type '%s' ") % (type))
    try:
        return converters[type](value)
    except Exception:
        # Not a bare "except:", so that KeyboardInterrupt and SystemExit
        # are not reported as a malformed cell.
        raise SheetImportException(
            _(u"Can not process value '%s' of type '%s' ") %
            (value, type))
def normalize_row(raw_row):
    """Return the plain values of a row as a list.

    Unifies the difference between CSV and openpyxl cells: an item with a
    ``value`` attribute is replaced by that attribute, anything else is
    kept as it is.
    """
    values = []
    for cell in raw_row:
        values.append(getattr(cell, 'value', cell))
    return values
def process_rows(rows, profile_columns, skip_header=False):
    """Convert all ``rows`` into a list of dicts of field values.

    If there is no header (``skip_header`` is false), the profile's order
    of columns is used, otherwise the header line is used to check the
    mapping and to define the order. A spreadsheet that should have a
    header but has no rows at all yields an empty list.

    Raises:
        SheetImportException: if a row can not be converted; the message
            of the original error is extended with the row number and the
            original traceback is kept.
    """
    rows = iter(rows)  # accept any iterable, not only iterators
    first_row = None
    if skip_header:
        try:
            header = next(rows)
        except StopIteration:
            # Empty spreadsheet: nothing to import.
            return []
        first_row = normalize_row(header)

    columns = order_columns(profile_columns, first_row)
    # columns = [{'field': "...", 'type': "..."}, ...]

    # Row numbers are 1-based and count the header line when there is one,
    # even if that header line turned out to be empty.
    first_data_row = 2 if skip_header else 1
    objects = []
    for i, row in enumerate(rows, first_data_row):
        try:
            objects.append(process_row(row, columns))
        except SheetImportException as e:
            # Same effect as the three-argument raise statement, but in a
            # form that is also valid syntax on Python 3.
            from django.utils import six
            six.reraise(
                type(e),
                type(e)(e.message + 'in row %d ' % i),
                sys.exc_info()[2])
    return objects
def process_row(row, columns):
    """Convert one row into a dict of field values.

    Cells and column specs are paired by position with ``zip``, so surplus
    cells or surplus specs are ignored. (Python 2's multi-sequence ``map``
    would pad the shorter side with ``None`` instead, which makes
    ``CellConverter`` fail on a row that is longer than the column list.)
    """
    values = normalize_row(row)
    converters = [CellConverter(value, column)
                  for value, column in zip(values, columns)]
    object_dict = {}
    for converter in converters:
        object_dict = converter.add_to(object_dict)
    return object_dict
class CellConverter(object):
    """Pair the value of one cell with the type and field of its column."""

    def __init__(self, value, col_spec):
        self.value = value
        self.type = col_spec['type']
        self.field = col_spec['field']

    def add_to(self, object_dict):
        """Store the converted value under ``field`` and return the dict.

        Cells of type ``'ignore'`` leave the dict unchanged.
        """
        if self.type == 'ignore':
            return object_dict
        converted = convert_value(self.value, self.type)
        object_dict[self.field] = converted
        return object_dict
def save_rows(objects, data_type):
    """Pass every object to ``create_message`` and return their number.

    ``data_type`` is accepted but not used.
    """
    for message_data in objects:
        create_message(message_data)

    saved = len(objects)
    return saved
def store_spreadsheet(label, fobject):
    """Import the spreadsheet ``fobject`` of source ``label``.

    Looks up the profile of the source, reads the rows of the file,
    converts them and saves the result. Returns the number of saved
    objects.
    """
    profile = get_profile(label)
    rows = get_rows_iterator(fobject, profile.get('format'))
    items = process_rows(
        rows,
        profile['columns'],
        profile.get('skip_header', False))
    return save_rows(items, 'message')
......@@ -8,7 +8,7 @@ from django.views.generic.base import TemplateView
from django_tables2 import SingleTableView
from chn_spreadsheet.utils import store_spreadsheet, SheetImportException
from chn_spreadsheet.importer import Importer, SheetImportException
import transport
from .forms import UploadForm, get_spreadsheet_choices
from .tables import ItemTable
......@@ -46,10 +46,12 @@ class UploadSpreadsheetView(FormView):
uploaded_file = data['file']
try:
saved = store_spreadsheet(source, uploaded_file)
importer = Importer()
saved = importer.store_spreadsheet(source, uploaded_file)
msg = ungettext("Upload successful! %d entry has been added.",
"Upload successful! %d entries have been added.",
saved) % saved
messages.success(self.request, msg)
except SheetImportException as exc:
msg = exc.message
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment