Commit cb5fa0d3 authored by David Büchler
First functional version of the decoupled duplicate detection with cache functionality

The cache has to be rebuilt separately for each filter.
Example for persons:
All contact ids of the persons are iterated and checked for duplicates one by one. Records that were already found this way are not processed again. The duplicates that belong together are then stored in the database under a shared ClusterId.
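Sketched in plain JavaScript (a minimal sketch of the clustering idea; buildClusters and scan are illustrative placeholders, not the actual library API), the rebuild boils down to:

//Each id is scanned at most once; ids that already landed in a cluster are skipped,
//and every resulting group stays together so it can be stored under one ClusterId.
function buildClusters(allIds, scan)
{
    var processed = {};
    var clusters = [];
    for (var i = 0; i < allIds.length; i++)
    {
        var id = allIds[i];
        if (processed[id])
            continue;
        var duplicates = scan(id); //returns the ids considered duplicates of "id"
        if (duplicates.length == 0)
            continue;
        var cluster = [id].concat(duplicates);
        for (var j = 0; j < cluster.length; j++)
            processed[cluster[j]] = true;
        clusters.push(cluster); //persisted under a shared ClusterId in the real code
    }
    return clusters;
}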
parent 9f90edfa
import("system.logging"); import("system.logging");
import("DuplicateScanner_lib"); import("DuplicateScanner_lib");
var filterName = "PersonenDubletten"; //var filterName = "PersonenDubletten";
//var targetEntity = "Person_entity";
//var values = {FIRSTNAME: "Anja", LASTNAME: "Lindner", GENDER: "f", CONTACTID: ""};
//var resultFields = ["PERSONID", "LASTNAME", "FIRSTNAME"];
//var duplicates = DuplicateScannerUtils.ScanForDuplicates(filterName, targetEntity, values, resultFields);
var filterName = "PersonDuplicates";
var targetEntity = "Person_entity"; var targetEntity = "Person_entity";
var values = {FIRSTNAME: "Anja", LASTNAME: "Lindner", GENDER: "f"}; var resultFieldContactId = ["CONTACTID"];
var resultFields = ["PERSONID", "LASTNAME", "FIRSTNAME"]; var resultFieldsIdFieldName = "CONTACTID";
var queryPersonContactIds = "select CONTACTID, FIRSTNAME, LASTNAME, GENDER from CONTACT"
+ " join PERSON on PERSONID = PERSON_ID";
var tmpFieldsInFilterRay = ["CONTACTID", "FIRSTNAME", "LASTNAME", "GENDER"];
logging.log("in der action -> ");
var duplicates = DuplicateScannerUtils.ScanForDuplicates(filterName, targetEntity, values, resultFields); DuplicateScannerUtils.RebuildDuplicatesCache(filterName, targetEntity, queryPersonContactIds,
\ No newline at end of file tmpFieldsInFilterRay, resultFieldContactId, resultFieldsIdFieldName);
\ No newline at end of file
import("system.util");
import("system.vars"); import("system.vars");
import("system.net"); import("system.net");
import("system.logging"); import("system.logging");
@@ -12,7 +13,7 @@ import("system.entities");
 */
function DuplicateScannerUtils() {}
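/**
 * Scans for duplicates of the record described by pFilterValues, excluding the record itself.
 * Parameter roles are inferred from the call sites in this commit.
 *
 * @param {String} pFilterName name of the configured duplicate filter, e.g. "PersonDuplicates"
 * @param {String} pTargetEntity entity to search in, e.g. "Person_entity"
 * @param {Object} pFilterValues field name/value pairs of the record to check
 * @param {String[]} pTargetEntityResultFields fields returned for every found duplicate
 * @param {String} pRecordIdFieldToIgnore name of the id field used to exclude a record
 * @param {String} pRecordIdValueToIgnore id value of the record to exclude from the results
 * @return {Array} the possible duplicates
 */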
DuplicateScannerUtils.ScanForDuplicates = function(pFilterName, pTargetEntity, pFilterValues, pTargetEntityResultFields, pRecordIdFieldToIgnore, pRecordIdValueToIgnore)
{
    let ignoredRecordFilter = _DuplicateScannerUtils._getIgnoreRecordFilter(pRecordIdFieldToIgnore, pRecordIdValueToIgnore, pTargetEntity);
    let configuredFilters = _DuplicateScannerUtils._loadFilters(pFilterName, pTargetEntity);
@@ -34,6 +35,107 @@ DuplicateScannerUtils.ScanForDuplicates = function(pFilterName, pTargetEntity,
    return possibleDuplicates;
}
//DuplicateScannerUtils.ScanAndUpdateResultCache = function()
//{
// let duplicatesRay = DuplicateScannerUtils.ScanForDuplicates();
//
// if(duplicatesRay.length > 0)
// {
// /*
// * Try loading the clusterId instead of using a boolean exists check to determine if there's already an identical cluster.
// * If a same cluster already exists the id of it is directly available to use
// */
// let clusterId = _DuplicateScannerUtils._getIdOfIdenticalCachedDuplicatesCluster(pRecordIdValueToIgnore, duplicatesRay)
// if(clusterId == "")
// {
// // No cluster with identical records exists, therefore a new one has to be created containing the new value as well as the found duplicates
// _DuplicateScannerUtils._createNewCachedDuplicatesCluster(pRecordIdValueToIgnore, duplicatesRay);
// }
// else
// {
// //add the current record to the cluster
// _DuplicateScannerUtils._AddRecordsToCachedDuplicatesCluster(pRecordIdValueToIgnore, clusterId);
// }
// }
//}
DuplicateScannerUtils.RemoveFromDuplicatesCache = function(pContactId)
{
    //todo: remove pContactId from its cached cluster; not implemented yet
}
//Will eventually become a standalone server process without external configuration
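/**
 * Rebuilds the duplicates cache for one filter: clears all existing clusters,
 * scans every record returned by pQueryTargetRecords and stores the groups of
 * duplicates that belong together under a common cluster id.
 * Parameter roles are inferred from the call site in this commit.
 *
 * @param {String} pFilterName name of the configured duplicate filter
 * @param {String} pTargetEntity entity to scan, e.g. "Person_entity"
 * @param {String} pQueryTargetRecords query loading the record ids plus the filter field values
 * @param {String[]} pFilterValues names of the filter fields, in query column order
 * @param {String[]} pTargetEntityResultFields fields returned for every found duplicate
 * @param {String} pRecordIdFieldToIgnore name of the id field, used to exclude the scanned record itself
 */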
DuplicateScannerUtils.RebuildDuplicatesCache = function(pFilterName, pTargetEntity,
    pQueryTargetRecords, pFilterValues, pTargetEntityResultFields, pRecordIdFieldToIgnore)
{
    logging.log("in RebuildDuplicatesCache -> ");
    let alreadyIdentifiedIds = [];
    let contactIdsToScan = db.table(pQueryTargetRecords);
    logging.log("contactIdsToScan -> " + JSON.stringify(contactIdsToScan));

    //If the contact id loader query results in no ids, stop.
    //No ids should be deleted if an error has been made in this query.
    if(contactIdsToScan.length <= 0)
        return;

    //The fields for the duplicate check have to be assembled from the filter's fields
    //and the corresponding values of the record loaded from the database.
    _DuplicateScannerUtils._deleteDuplicateClusters();

    var duplicatesToInsertQueries = [];
    for (let b = 0; b < contactIdsToScan.length; b++)
    {
        logging.log("b -> " + b);
        logging.log("indexOf(contactIdsToScan[b][0]) -> " + alreadyIdentifiedIds.indexOf(contactIdsToScan[b][0]));

        //If the current id has already been identified as a duplicate, skip it
        if(alreadyIdentifiedIds.indexOf(contactIdsToScan[b][0]) > -1)
            continue;

        logging.log("contact id not processed yet -> " + contactIdsToScan[b][0]);

        let filterValuesObject = {};
        for (let a = 0; a < pFilterValues.length; a++)
        {
            logging.log("pFilterValues[a] -> " + pFilterValues[a]);
            logging.log("contactIdsToScan[b][a] -> " + contactIdsToScan[b][a]);
            filterValuesObject[pFilterValues[a]] = contactIdsToScan[b][a];
            logging.log("filterValuesObject[pFilterValues[a]] -> " + filterValuesObject[pFilterValues[a]]);
        }

        logging.log("filterValuesObject -> " + JSON.stringify(filterValuesObject));

        let foundDuplicates = DuplicateScannerUtils.ScanForDuplicates(pFilterName, pTargetEntity,
            filterValuesObject, pTargetEntityResultFields, pRecordIdFieldToIgnore, contactIdsToScan[b][0]);
        logging.log("foundDuplicates -> " + JSON.stringify(foundDuplicates));

        if(foundDuplicates.length == 0)
            continue;

        let foundDuplicateIds = [];
        for (let i = 0; i < foundDuplicates.length; i++)
        {
            logging.log("i -> " + i);
            logging.log("foundDuplicates[i][pRecordIdFieldToIgnore] -> " + foundDuplicates[i][pRecordIdFieldToIgnore]);
            foundDuplicateIds.push(foundDuplicates[i][pRecordIdFieldToIgnore]);
        }

        alreadyIdentifiedIds = alreadyIdentifiedIds.concat(foundDuplicateIds);

        //The duplicates list doesn't contain the id that was scanned for, therefore it gets added manually
        foundDuplicateIds.push(contactIdsToScan[b][0]);

        logging.log("foundDuplicates -> " + JSON.stringify(foundDuplicates));
        logging.log("foundDuplicateIds -> " + JSON.stringify(foundDuplicateIds));

        let insertQueriesRay = _DuplicateScannerUtils._createInsertDuplicatesClusterQuery(foundDuplicateIds);
        duplicatesToInsertQueries = duplicatesToInsertQueries.concat(insertQueriesRay);
    }
    logging.log("duplicatesToInsertQueries -> " + JSON.stringify(duplicatesToInsertQueries));
    db.inserts(duplicatesToInsertQueries);
}
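/**
 * Merges the person behind pSourceContactId into pTargetContactId.
 * The body is largely elided in this diff; the summary is inferred from the name,
 * the signature and the commented-out row-count check in the return statement.
 *
 * @return {Boolean} currently always true
 */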
DuplicateScannerUtils.MergePerson = function(pSourceContactId, pTargetContactId)
{
    let updateStatements = [];
@@ -56,8 +158,20 @@ DuplicateScannerUtils.MergePerson = function(pSourceContactId, pTargetContactId)
    return true; //(affectedRows > 0 && deletedRows >= 2);
}
//DuplicateScannerUtils._getScanConfigForEntity = function(pTargetEntity)
//{
// switch (pTargetEntity)
// {
// case "Person_entity":
// {
// TargetEntityResultFields:
// }
// break;
// default:
// break;
// }
//
//}
function _DuplicateScannerUtils() {}
@@ -69,6 +183,37 @@ var INDEX_TABLE_NAME = 0;
var INDEX_COLUMN_NAME = 1;
var INDEX_CONDITION = 2;
_DuplicateScannerUtils._getIdOfIdenticalCachedDuplicatesCluster = function (pRecordIdValueToIgnore, duplicatesRay)
{
    //todo: return the cluster id of an already cached, identical duplicates cluster; not implemented yet
}
_DuplicateScannerUtils._createInsertDuplicatesClusterQuery = function (pDuplicatesRay)
{
    let duplicatesToInsertQueries = [];
    let cols = ["ID", "CLUSTERID", "DUPLICATEID"];
    let newClusterUid = util.getNewUUID();

    //All records of one scan share the same cluster id
    for (let i = 0; i < pDuplicatesRay.length; i++)
    {
        let newId = util.getNewUUID();
        let valuesToInsert = [newId, newClusterUid, pDuplicatesRay[i]];
        duplicatesToInsertQueries.push(["DUPLICATECLUSTERS", cols, null, valuesToInsert]);
    }
    return duplicatesToInsertQueries;
}
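//Illustration (shape only, ids shortened): _createInsertDuplicatesClusterQuery(["c1", "c2"])
//returns two insert tuples for db.inserts that share one generated CLUSTERID:
//[["DUPLICATECLUSTERS", ["ID", "CLUSTERID", "DUPLICATEID"], null, ["uuid1", "clusterA", "c1"]],
// ["DUPLICATECLUSTERS", ["ID", "CLUSTERID", "DUPLICATEID"], null, ["uuid2", "clusterA", "c2"]]]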
_DuplicateScannerUtils._AddRecordsToCachedDuplicatesCluster = function (pRecordIdToAdd, pClusterId)
{
    //todo: add the record to the existing cluster pClusterId; not implemented yet
}
_DuplicateScannerUtils._deleteDuplicateClusters = function ()
{
    //Deletes all rows of DUPLICATECLUSTERS; the empty condition is intentional
    db.deleteData("DUPLICATECLUSTERS", "");
}
/*
 * All records with contactId = sourceContactId that are not assigned to the same "group" as the targetContactId get updated.
...