Commit 99252f9c authored by Martin Burchell

Merge branch 'ignore-duplicates' into 'staging'

Skip duplicate records when importing

See merge request !116
parents 300bc4a3 6e41d4e2
Pipeline #3153 passed with stage
in 4 minutes and 47 seconds
......@@ -11,7 +11,7 @@ import pytz
from openpyxl import load_workbook
import transport
from transport.exceptions import TransportException
from transport.exceptions import ItemNotUniqueException, TransportException
from .models import SheetProfile
......@@ -132,6 +132,8 @@ class Importer(object):
return {'taxonomy': taxonomy, 'name': name}
def save_rows(self, objects):
num_saved = 0
for obj in objects:
row = obj.pop('_row_number', '')
terms = obj.pop('terms')
......@@ -140,12 +142,18 @@ class Importer(object):
for term in terms:
transport.items.add_terms(
item['id'], term['taxonomy'], term['name'])
except ItemNotUniqueException:
pass
except TransportException as exc_inst:
message = self._get_spreadsheet_error_message(row, exc_inst)
raise SheetImportException(message)
return len(objects)
else:
num_saved += 1
return num_saved
def _get_spreadsheet_error_message(self, row, exc_inst):
status_code = exc_inst.message.pop('status_code')
......@@ -158,8 +166,9 @@ class Importer(object):
for field, errors in exc_inst.message.iteritems():
for error in errors:
messages.append(
_("Column: '{0}'\nError ({1}): '{2}'\n\nValue: {3}").format(
_("Column: '{0}' ({1})\nError ({2}): '{3}'\n\nValue: {4}").format(
field_to_column_map.get(field),
field,
getattr(error, 'code', ''),
six.text_type(error),
item.get(field, '')
......@@ -182,7 +191,9 @@ class Importer(object):
items = self.process_rows(rows)
return self.save_rows(items)
num_saved = self.save_rows(items)
return (num_saved, len(items) - num_saved)
class CellConverter(object):
......
......@@ -23,7 +23,7 @@ def test_etc_items_imported(importer, django_db_setup):
assert len(transport.items.list()) == 0
file_path = path.join(TEST_DIR, 'sample_etc.xlsx')
num_saved = importer.store_spreadsheet('etc', open(file_path, 'rb'))
(num_saved, _) = importer.store_spreadsheet('etc', open(file_path, 'rb'))
assert num_saved > 0
......@@ -34,9 +34,6 @@ def test_etc_items_imported(importer, django_db_setup):
assert items[0]['location'] == 'Camp 1E'
assert isinstance(items[0]['timestamp'], datetime.datetime)
for item in items:
transport.items.create(item)
tags = []
for term in items[0]['terms']:
if term['taxonomy'] == 'tags':
......
......@@ -26,7 +26,7 @@ def test_items_imported(importer):
file_path = path.join(TEST_DIR, 'sample_geopoll.xlsx')
f = open(file_path, 'rb')
num_saved = importer.store_spreadsheet('geopoll', f)
(num_saved, _) = importer.store_spreadsheet('geopoll', f)
assert num_saved > 0
items = transport.items.list()
......
......@@ -408,7 +408,7 @@ def test_save_rows_handles_exception(importer):
assert str(excinfo.value) == (
"There was a problem with row 29 of the spreadsheet:\n"
"Column: 'Ennumerator'\n"
"Column: 'Ennumerator' (enumerator)\n"
"Error (max_length): 'Ensure this field has no more "
"than 200 characters.'\n\n"
"Value: Yakub=Aara smart card no point in "
......@@ -419,3 +419,46 @@ def test_save_rows_handles_exception(importer):
"hota .kinto hetarar aarari forok gorid day ,zodi Burmar shor karotum "
"soyi ensaf takito aarari Thor Sara nohoito"
)
@pytest.mark.django_db
def test_duplicate_records_not_imported(importer):
    """Saving a row whose body and timestamp already exist must skip it.

    The first call stores one row.  The second call offers that same row
    again together with a new one; only the new row should be counted as
    saved, leaving two items in total.
    """
    def make_row(body, row_number):
        # save_rows() pops keys out of the dicts it receives, so every
        # call needs freshly built rows rather than reused ones.
        return {
            'body': body,
            'timestamp': datetime.datetime(2014, 7, 21),
            'enumerator': 'Mohammed',
            'terms': [],
            '_row_number': row_number,
        }

    saved_first_time = importer.save_rows([make_row("Text", 1)])
    assert saved_first_time == 1

    saved_second_time = importer.save_rows([
        # This one should be ignored the second time around
        make_row("Text", 1),
        # and this one should be imported
        make_row("Another bit of Text", 2),
    ])
    assert saved_second_time == 1

    stored_items = transport.items.list()
    assert len(stored_items) == 2
......@@ -23,7 +23,7 @@ def test_kobo_master_items_imported(importer, django_db_setup):
assert len(transport.items.list()) == 0
file_path = path.join(TEST_DIR, 'sample_kobo_master.xlsx')
num_saved = importer.store_spreadsheet('kobo_master', open(file_path, 'rb'))
(num_saved, _) = importer.store_spreadsheet('kobo_master', open(file_path, 'rb'))
assert num_saved > 0
......@@ -40,9 +40,6 @@ def test_kobo_master_items_imported(importer, django_db_setup):
assert items[0]['source'] == 'sample source'
assert isinstance(items[0]['timestamp'], datetime.datetime)
for item in items:
transport.items.create(item)
tags = []
for item in items:
for term in item['terms']:
......@@ -54,3 +51,17 @@ def test_kobo_master_items_imported(importer, django_db_setup):
assert all(tag in tags for tag in (
'sample tag',
))
@pytest.mark.django_db  # noqa
def test_items_cannot_be_imported_twice(importer, django_db_setup):
    """Importing the same single-item spreadsheet twice skips the duplicate.

    store_spreadsheet() returns a (saved, skipped) pair: the first import
    saves the one row, the second import saves nothing and reports the
    row as skipped.
    """
    file_path = path.join(TEST_DIR, 'master_kobo_single_item.xlsx')

    # Open the workbook inside a context manager so the file handle is
    # closed deterministically instead of being left to the garbage
    # collector.
    with open(file_path, 'rb') as spreadsheet:
        (num_saved, num_skipped) = importer.store_spreadsheet('kobo_master', spreadsheet)
    assert num_saved == 1
    assert num_skipped == 0

    with open(file_path, 'rb') as spreadsheet:
        (num_saved, num_skipped) = importer.store_spreadsheet('kobo_master', spreadsheet)
    assert num_saved == 0
    assert num_skipped == 1
......@@ -24,7 +24,7 @@ def test_kobo_items_imported(importer, django_db_setup):
assert len(transport.items.list()) == 0
file_path = path.join(TEST_DIR, 'sample_kobo.xlsx')
num_saved = importer.store_spreadsheet('kobo', open(file_path, 'rb'))
(num_saved, _) = importer.store_spreadsheet('kobo', open(file_path, 'rb'))
assert num_saved > 0
......@@ -37,9 +37,6 @@ def test_kobo_items_imported(importer, django_db_setup):
assert items[0]['location'] == 'Camp 4'
assert isinstance(items[0]['timestamp'], datetime.datetime)
for item in items:
transport.items.create(item)
tags = []
for term in items[0]['terms']:
if term['taxonomy'] == 'tags':
......@@ -62,10 +59,3 @@ def test_kobo_empty_body_is_allowed(importer):
assert with_empty_body_item['body'] == ''
assert Item.objects.count() == 3
for item in items:
transport.items.create(with_empty_body_item)
item = Item.objects.last()
assert item.body == ''
......@@ -26,7 +26,7 @@ def test_items_imported(importer):
file_path = path.join(TEST_DIR, 'sample_rapidpro.xlsx')
f = open(file_path, 'rb')
num_saved = importer.store_spreadsheet('rapidpro', f)
(num_saved, _) = importer.store_spreadsheet('rapidpro', f)
assert num_saved > 0
items = transport.items.list()
......
......@@ -459,7 +459,7 @@ def test_table_items_filtered_by_age_range():
})
in_range_2 = transport.items.create({
'body': "In range item 1",
'body': "In range item 2",
'age': '36',
})
......
......@@ -241,6 +241,11 @@ class AddEditItemView(FormView):
item_id,
)
message_code = messages.SUCCESS
except transport.exceptions.ItemNotUniqueException as e:
message = _("This record could not be saved because the body and "
"timestamp clashed with an existing record")
message_code = messages.ERROR
except transport.exceptions.TransportException as e:
message = e.message.get('detail')
if message is None:
......
from django.contrib import messages
from django.http import HttpResponseRedirect
from django.utils.translation import ungettext
from django.utils.translation import gettext, ungettext
from django.views.generic import FormView
from chn_spreadsheet.importer import Importer, SheetImportException
......@@ -22,12 +22,24 @@ class UploadSpreadsheetView(FormView):
try:
importer = Importer()
saved = importer.store_spreadsheet(source, uploaded_file)
msg = ungettext("Upload successful! %d entry has been added.",
"Upload successful! %d entries have been added.",
saved) % saved
messages.success(self.request, msg)
(saved, skipped) = importer.store_spreadsheet(
source, uploaded_file
)
all_messages = [
gettext("Upload successful!"),
ungettext("%d entry has been added.",
"%d entries have been added.",
saved) % saved
]
if skipped > 0:
all_messages.append(
ungettext("%d duplicate entry was skipped.",
"%d duplicate entries were skipped.",
skipped) % skipped
)
messages.success(self.request, ' '.join(all_messages))
except SheetImportException as exc:
msg = exc.message
messages.error(self.request, msg)
......
from django.utils import six
from django.utils.dateparse import parse_datetime
from rest_framework import serializers
class IgnoreMicrosecondsDateTimeField(serializers.DateTimeField):
    """DateTimeField that strips microseconds from string input."""

    def to_internal_value(self, value):
        """Truncate microseconds from the timestamp field

        MySQL < 5.6 does not store microseconds for the
        timestamp field so we need to remove these for the
        UniqueTogether validator in the ItemSerializer to
        work properly.

        This will not be required when we're using MySQL >= 5.6
        everywhere.

        Only string input is pre-processed; any other value is passed to
        the parent field unchanged.  Strings that cannot be parsed are
        also passed through untouched so that the parent field decides
        how to handle them.
        """
        if isinstance(value, six.string_types):
            try:
                parsed = parse_datetime(value)
            except (ValueError, TypeError):
                parsed = None

            # parse_datetime() returns None (rather than raising) when
            # the string is not well formatted; calling .replace() on
            # that would raise an uncaught AttributeError.
            if parsed is not None:
                value = parsed.replace(microsecond=0)

        return super(IgnoreMicrosecondsDateTimeField, self).to_internal_value(value)
from __future__ import absolute_import, unicode_literals
from rest_framework import serializers
from rest_framework import serializers, validators
from data_layer.models import Item
from taxonomies.models import Taxonomy, Term
from .fields import IgnoreMicrosecondsDateTimeField
class TaxonomySerializer(serializers.ModelSerializer):
......@@ -67,7 +69,14 @@ class ItemSerializer(serializers.ModelSerializer):
fields = (
'__all__'
)
validators = [
validators.UniqueTogetherValidator(
queryset=Item.objects.all(),
fields=('body', 'timestamp')
)
]
timestamp = IgnoreMicrosecondsDateTimeField()
terms = TermSerializer(many=True, required=False)
def create(self, validated_data):
......
from __future__ import absolute_import, unicode_literals
import datetime
from django.utils import timezone
import pytest
......@@ -10,6 +12,9 @@ from rest_api.views import ItemViewSet
def create_item(**kwargs):
if 'timestamp' not in kwargs:
kwargs['timestamp'] = datetime.datetime.now()
request = APIRequestFactory().post('/items', kwargs)
view = ItemViewSet.as_view(actions={'post': 'create'})
response = view(request)
......
from __future__ import absolute_import, unicode_literals
from datetime import datetime
import pytest
from rest_framework import status
from rest_framework.test import APIRequestFactory
......@@ -13,6 +15,9 @@ from .taxonomy_and_term_create_tests import create_taxonomy
def update_item(id, **kwargs):
if 'timestamp' not in kwargs:
kwargs['timestamp'] = datetime.now()
url = '/items/%d' % id # This doesn't seem to matter if id is absent?
request = APIRequestFactory().put(url, kwargs)
view = ItemViewSet.as_view(actions={'put': 'update'})
......@@ -25,10 +30,14 @@ def update_item(id, **kwargs):
@pytest.mark.django_db
def test_item_fields_can_be_updated():
old_data = {'body': 'That the government is using this Ebola as a business to inrich few governmemt official',
'network_provider': '8737 (Lonestar)'}
new_data = {'body': 'That the government is using this Ebola as a business to inrich few government official',
'network_provider': '8737 (CellCom)'}
old_data = {
'body': 'That the government is using this Ebola as a business to inrich few governmemt official',
'network_provider': '8737 (Lonestar)'
}
new_data = {
'body': 'That the government is using this Ebola as a business to inrich few government official',
'network_provider': '8737 (CellCom)',
}
response = create_item(**old_data)
id = response.data['id']
......
import datetime
import pytest
from rest_framework.exceptions import ValidationError
import transport
from ..serializers import ItemSerializer
@pytest.mark.django_db
def test_serialized_timestamp_ignores_microseconds():
    """A timestamp differing only in microseconds still counts as a duplicate.

    An item is created with a non-zero microsecond value; validating the
    same body with the microseconds zeroed must fail validation.
    """
    created_at = datetime.datetime.now().replace(microsecond=123)
    transport.items.create({'body': "Test", 'timestamp': created_at})

    duplicate = {
        'body': "Test",
        'timestamp': created_at.replace(microsecond=0),
    }
    serializer = ItemSerializer(data=duplicate)

    with pytest.raises(ValidationError):
        serializer.is_valid(raise_exception=True)
class ItemNotUniqueException(Exception):
    """Raised when creating or updating an item fails a 'unique' check.

    The transport layer raises this, carrying the response data, when a
    400 response contains a non-field error whose code is 'unique'.
    """
class TransportException(Exception):
    """Raised by the transport layer for an unsuccessful API response.

    It is raised with the response data as its argument.
    """
from datetime import datetime
from django.utils.dateparse import parse_datetime
from rest_framework import status
......@@ -5,7 +7,7 @@ from rest_framework.test import APIRequestFactory
from rest_api.views import ItemViewSet
from .exceptions import TransportException
from .exceptions import ItemNotUniqueException, TransportException
request_factory = APIRequestFactory()
......@@ -59,15 +61,23 @@ def get(id):
def create(item):
""" Create an Item from the given dict """
if 'timestamp' not in item:
item['timestamp'] = datetime.now()
view = get_view({'post': 'create'})
request = request_factory.post("", item)
response = view(request)
if status.is_success(response.status_code):
return response.data
else:
response.data['status_code'] = response.status_code
response.data['item'] = item
raise TransportException(response.data)
response.data['status_code'] = response.status_code
response.data['item'] = item
if _item_not_unique(response):
raise ItemNotUniqueException(response.data)
raise TransportException(response.data)
def update(id, item):
......@@ -77,9 +87,28 @@ def update(id, item):
response = view(request, pk=id)
if status.is_success(response.status_code):
return response.data
else:
response.data['status_code'] = response.status_code
raise TransportException(response.data)
response.data['status_code'] = response.status_code
response.data['item'] = item
if _item_not_unique(response):
raise ItemNotUniqueException(response.data)
raise TransportException(response.data)
def _item_not_unique(response):
    """Return True if the response reports a 'unique' non-field error.

    Only 400 responses whose data contains 'non_field_errors' are
    inspected; anything else yields False.
    """
    is_bad_request = response.status_code == status.HTTP_400_BAD_REQUEST
    if not is_bad_request or 'non_field_errors' not in response.data:
        return False

    return any(
        error.code == 'unique'
        for error in response.data['non_field_errors']
    )
def delete(id):
......
from __future__ import absolute_import, unicode_literals
from datetime import datetime
from django.utils import timezone
import pytest
......@@ -23,3 +25,15 @@ def test_create_item_creates_item(now):
assert 'id' in response
new_count = len(items.list())
assert new_count > old_count
@pytest.mark.django_db
def test_timestamp_ignores_microseconds():
    """Creating an item returns its timestamp without the microseconds."""
    with_microseconds = datetime(2018, 12, 21, 13, 59, 1, 123)

    created = items.create({'body': "Text", 'timestamp': with_microseconds})

    assert created['timestamp'] == '2018-12-21T13:59:01Z'
from __future__ import absolute_import, unicode_literals
import datetime
import pytest
from transport import items
from ..exceptions import ItemNotUniqueException
@pytest.mark.django_db
def test_update_item_updates_item():
......@@ -20,3 +24,45 @@ def test_update_item_updates_item():
[updated_item] = items.list(body="Updated text")
assert updated_item['id'] == id
@pytest.mark.django_db
def test_cannot_update_item_to_have_non_unique_body_and_timestamp():
    """Updating an item to match another's body and timestamp must fail."""
    first_data = {'timestamp': datetime.datetime.now(), 'body': "Text"}
    second_data = {'timestamp': datetime.datetime.now(), 'body': "Text 2"}

    first = items.create(first_data)
    second = items.create(second_data)

    # Make the second item clash with the first on both unique fields.
    for field in ('timestamp', 'body'):
        second[field] = first[field]
    second_id = second.pop('id')

    with pytest.raises(ItemNotUniqueException):
        items.update(second_id, second)
@pytest.mark.django_db
def test_timestamp_ignores_microseconds():
    """Updating an item returns its timestamp without the microseconds."""
    original_time = datetime.datetime(2018, 12, 21, 13, 59, 1, 123)
    payload = {'body': "Text", 'timestamp': original_time}
    created = items.create(payload)

    # Same second, different microseconds.
    payload['timestamp'] = original_time.replace(microsecond=456)
    updated = items.update(created['id'], payload)

    assert updated['timestamp'] == '2018-12-21T13:59:01Z'
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment