Author: echatellier Date: 2012-07-19 16:07:21 +0200 (Thu, 19 Jul 2012) New Revision: 1027 Url: http://forge.codelutin.com/repositories/revision/coser/1027 Log: refs #1321: Control sur valeurs aberrantes utilisant l'écart type mais n'étant pas tout à fait satisfaisant. Modified: trunk/coser-business/src/main/java/fr/ifremer/coser/services/ControlService.java trunk/coser-business/src/main/resources/i18n/coser-business_en_GB.properties trunk/coser-business/src/main/resources/i18n/coser-business_fr_FR.properties trunk/coser-business/src/test/java/fr/ifremer/coser/services/ControlServiceTest.java trunk/coser-business/src/test/resources/log4j.properties Modified: trunk/coser-business/src/main/java/fr/ifremer/coser/services/ControlService.java =================================================================== --- trunk/coser-business/src/main/java/fr/ifremer/coser/services/ControlService.java 2012-07-19 09:41:56 UTC (rev 1026) +++ trunk/coser-business/src/main/java/fr/ifremer/coser/services/ControlService.java 2012-07-19 14:07:21 UTC (rev 1027) @@ -5,7 +5,7 @@ * $Id$ * $HeadURL$ * %% - * Copyright (C) 2010 - 2011 Ifremer, Codelutin, Chatellier Eric + * Copyright (C) 2010 - 2012 Ifremer, Codelutin, Chatellier Eric * %% * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as @@ -203,13 +203,15 @@ * * @param project project * @param control control a valider - * @param progress progress monitor + * @param progress progress monitor (can be null) * @return les erreurs de validation */ public List<ControlError> validateData(Project project, Control control, ProgressMonitor progress) { - progress.setStepCount(10); - progress.setStep(0); + if (progress != null) { + progress.setStepCount(11); + progress.setStep(0); + } // valide chaque category List<ControlError> validationErrors = new ArrayList<ControlError>(); @@ -220,27 +222,35 @@ if (categoryErrors != null) { 
validationErrors.addAll(categoryErrors); } - progress.nextStep(); + if (progress != null) { + progress.nextStep(); + } // validation specifique de la category List<ControlError> specificErrors = validateCategorySpecific(control, category, progress); if (specificErrors != null) { validationErrors.addAll(specificErrors); } - progress.nextStep(); + if (progress != null) { + progress.nextStep(); + } } } // validation entre catch et length (specific) List<? extends ControlError> diffCatchLengthErrors = validateDiffCatchLength(control, progress); - progress.nextStep(); + if (progress != null) { + progress.nextStep(); + } if (diffCatchLengthErrors != null) { validationErrors.addAll(diffCatchLengthErrors); } // validation par croisement de fichiers List<ControlError> crossFileErrors = validationCrossFiles(project, control, progress); - progress.nextStep(); + if (progress != null) { + progress.nextStep(); + } if (crossFileErrors != null) { validationErrors.addAll(crossFileErrors); } @@ -261,7 +271,7 @@ * * @param control control a valider * @param category category a valider - * @param progress progress monitor + * @param progress progress monitor (can be null) * @return les erreurs de validation */ public List<ControlError> validateCategory(Control control, Category category, ProgressMonitor progress) { @@ -282,7 +292,7 @@ * * @param control control a valider * @param category category a valider - * @param progress progress monitor + * @param progress progress monitor (can be null) * @return les erreurs de validation (not null) */ public List<ControlError> validateCategoryXWork(Control control, Category category, ProgressMonitor progress) { @@ -485,7 +495,7 @@ * Alerte si Somme(CAPTURES$Nombre par CAPTURES$Annee|Strate|Espece) < nobsmin. 
* * @param control control - * @param progress progress monitor + * @param progress progress monitor (can be null) * @return error list */ protected List<ControlError> validateCategorySpecificCatch( @@ -579,7 +589,7 @@ * et les nombres dans taille. * * @param control data container - * @param progress progress + * @param progress progress (can be null) * * @see PublicationService#getCompareCatchLengthGraph(Project, AbstractDataContainer, Collection) for details * @see CoserBusinessConfig#getControlDiffCatchLength() for option @@ -753,8 +763,16 @@ /** * Alerte si Somme(TAILLES$Nombre par TAILLES$Annee|Strate|Espece) < nobsmin * + * Warning sur les tailles aberrantes par espèce: + * <ul> + * <li>premiere passe pour calculer l'écart type et la moyenne par espèce + * <li>seconde passe pour détecter les valeurs abérentes (> ecart type) + * </ul> + * + * L'ecarty étant : racine( somme (x - moyenne)^2 / n) + * * @param control control - * @param progress progress + * @param progress progress (can be null) * @return error list */ protected List<ControlError> validateCategorySpecificLength( @@ -772,6 +790,10 @@ Map<String, Double> nombreForKey = new HashMap<String, Double>(); Map<String, String> firstLineForKey = new HashMap<String, String>(); + // Standard deviation + Map<String, Double> lengthSumForSpecies = new HashMap<String, Double>(); + Map<String, Double> lengthCountForSpecies = new HashMap<String, Double>(); + // parcours des elements Iterator<String[]> itTuple = control.getLength().iterator(true); int lineIndex = 1; // skip header @@ -813,20 +835,45 @@ } } catch (NumberFormatException ex) { - // par trop grave, normalement les données deviennet + // par trop grave, normalement les données deviennent // valide au fil de la validation if (log.isWarnEnabled()) { log.warn("Can't parse " + nombreValue + " as double"); } } + + // store lenght for Standard deviation + String lengthValue = tuple[Length.INDEX_LENGTH]; + try { + Double nombre = Double.valueOf(lengthValue); + 
String key = tuple[Length.INDEX_SPECIES]; + if (lengthSumForSpecies.containsKey(key)) { + Double oldValue = lengthSumForSpecies.get(key); + Double newValue = oldValue + nombre; + lengthSumForSpecies.put(key, newValue); + Double count = lengthCountForSpecies.get(key); + lengthCountForSpecies.put(key, count + 1); + } + else { + lengthSumForSpecies.put(key, nombre); + lengthCountForSpecies.put(key, 1d); + } + } + catch (NumberFormatException ex) { + // par trop grave, normalement les données deviennent + // valide au fil de la validation + if (log.isWarnEnabled()) { + log.warn("Can't parse " + lengthValue + " as double"); + } + } } - // now look for invalid data + // now look for invalid data (observations) for (Map.Entry<String, Double> sumObservation : nombreForKey.entrySet()) { String key = sumObservation.getKey(); Double value = sumObservation.getValue(); if (value < config.getControlNobsmin()) { - + String lineNumber = firstLineForKey.get(key); ControlError error = new ControlError(); @@ -839,6 +886,114 @@ } } + // recherche des valeurs abérrantes + if (progress != null) { + progress.nextStep(); + progress.setTotal(total * 2); + } + + // Standard deviation : calcul de la moyenne + Map<String, Double> avgForSpecies = new HashMap<String, Double>(); + for (Map.Entry<String, Double> entry : lengthSumForSpecies.entrySet()) { + double avg = entry.getValue() / lengthCountForSpecies.get(entry.getKey()); + avgForSpecies.put(entry.getKey(), avg); + } + + // Standard deviation : somme des variances au carré + Map<String, Double> varianceSumForSpecies = new HashMap<String, Double>(); + itTuple = control.getLength().iterator(true); + lineIndex = 1; // skip header + while (itTuple.hasNext()) { + + // update progress + if (progress != null) { + int progressPercent = (int)((double)lineIndex / (double)total * 50.0); + progress.setText(_("coser.business.control.step.lengthdeviation", _(Category.LENGTH.getTranslationKey()), progressPercent)); + progress.setCurrent(lineIndex); + 
++lineIndex; + } + + String[] tuple = itTuple.next(); + // store lenght for Standard deviation + String lengthValue = tuple[Length.INDEX_LENGTH]; + try { + String key = tuple[Length.INDEX_SPECIES]; + Double nombre = Double.valueOf(lengthValue); + Double value = Math.pow(nombre - avgForSpecies.get(key), 2); + + if (varianceSumForSpecies.containsKey(key)) { + Double oldValue = varianceSumForSpecies.get(key); + Double newValue = oldValue + value; + varianceSumForSpecies.put(key, newValue); + } + else { + varianceSumForSpecies.put(key, value); + } + } + catch (NumberFormatException ex) { + // par trop grave, normalement les données deviennent + // valide au fil de la validation + if (log.isWarnEnabled()) { + log.warn("Can't parse " + lengthValue + " as double"); + } + } + } + + // Standard deviation : calcul de l'ecart type par espèce + // racine( somme (x - moyenne)^2 / n) + Map<String, Double> deviationForSpecies = new HashMap<String, Double>(); + for (Map.Entry<String, Double> entry : varianceSumForSpecies.entrySet()) { + double avg = Math.sqrt(entry.getValue() / lengthCountForSpecies.get(entry.getKey())); + deviationForSpecies.put(entry.getKey(), avg); + } + + // Standard deviation : recherche des valeurs aberantes : 3 fois l'écart type + itTuple = control.getLength().iterator(true); + lineIndex = 1; // skip header + while (itTuple.hasNext()) { + + // update progress + if (progress != null) { + int progressPercent = (int)((double)lineIndex / (double)total * 50.0 + 50.0); + progress.setText(_("coser.business.control.step.lengthoutliers", _(Category.LENGTH.getTranslationKey()), progressPercent)); + progress.setCurrent(total + lineIndex); + ++lineIndex; + } + + String[] tuple = itTuple.next(); + // store lenght for Standard deviation + String lengthValue = tuple[Length.INDEX_LENGTH]; + try { + String species = tuple[Length.INDEX_SPECIES]; + Double nombre = Double.valueOf(lengthValue); + Double avg = avgForSpecies.get(species); + Double deviation = 
deviationForSpecies.get(species); + + if (log.isDebugEnabled()) { + log.debug(String.format("Species %s, avg=%f, deviation=%f, value=%f", species, avg, deviation, nombre)); + } + + if (Math.abs(nombre - avg) > deviation * 3) { + String lineNumber = tuple[AbstractDataEntity.INDEX_LINE]; + + ControlError error = new ControlError(); + error.setCategory(Category.LENGTH); + error.setLevel(ValidationLevel.WARNING); + error.addLineNumber(lineNumber); + error.setMessage(_("coser.business.control.error.lengthOutliers", avg, deviation)); + error.setDetailMessage(_("coser.business.control.error.lengthOutliersDetail", species, avg, deviation, lengthValue)); + validationErrors.add(error); + } + } + catch (NumberFormatException ex) { + // par trop grave, normalement les données deviennent + // valide au fil de la validation + if (log.isWarnEnabled()) { + log.warn("Can't parse " + lengthValue + " as double"); + } + } + } + return validationErrors; } Modified: trunk/coser-business/src/main/resources/i18n/coser-business_en_GB.properties =================================================================== --- trunk/coser-business/src/main/resources/i18n/coser-business_en_GB.properties 2012-07-19 09:41:56 UTC (rev 1026) +++ trunk/coser-business/src/main/resources/i18n/coser-business_en_GB.properties 2012-07-19 14:07:21 UTC (rev 1027) @@ -49,6 +49,8 @@ coser.business.control.error.invalidLengthLengthStep=Invalid length step (authorized centimeters and half-centimeters) coser.business.control.error.invalidLengthLengthStepDetail=Invalid length step %s for species %s coser.business.control.error.invalidLengthLengthStepTip=Length field must be in centimeters. 
+coser.business.control.error.lengthOutliers=Aberrants lengths +coser.business.control.error.lengthOutliersDetail=Species %s \: length %4$s aberrant (average\: %2$.2f, deviation\: %3$.2f) coser.business.control.error.minObservationCount=Minimum number of observation not reached coser.business.control.error.minObservationCountDetail=Minimum number of observation not reached (%s) \: %.2f coser.business.control.error.missingCatchHaulFromHaul=Missing haul file hauls in catch file @@ -92,6 +94,8 @@ coser.business.control.noerrorfound=No error found coser.business.control.step.crossFileChech=Cross file checks (%d%%) coser.business.control.step.diffCatchLength=Check diff catch/length (%d%%) +coser.business.control.step.lengthdeviation=Comptuting standard deviation \: %s (%d%%) +coser.business.control.step.lengthoutliers=Searching for aberrants lengths \: %s (%d%%) coser.business.control.step.observation=Checking observation number \: %s (%d%%) coser.business.control.step.xworks=Line checks \: %s (%d%%) coser.business.extract.creationdate=Creation date \: Modified: trunk/coser-business/src/main/resources/i18n/coser-business_fr_FR.properties =================================================================== --- trunk/coser-business/src/main/resources/i18n/coser-business_fr_FR.properties 2012-07-19 09:41:56 UTC (rev 1026) +++ trunk/coser-business/src/main/resources/i18n/coser-business_fr_FR.properties 2012-07-19 14:07:21 UTC (rev 1027) @@ -49,6 +49,8 @@ coser.business.control.error.invalidLengthLengthStep=Le pas de longueur est invalide (autorisé centimètre et demi-centimètre) coser.business.control.error.invalidLengthLengthStepDetail=Le pas longueur %s est invalide pour l'espèce %s coser.business.control.error.invalidLengthLengthStepTip=Le champs de longueur doit être en centimètre (ou demi centimètre) +coser.business.control.error.lengthOutliers=Longueurs aberrantes +coser.business.control.error.lengthOutliersDetail=Espèce %s \: longueur %4$s aberrantes (moyenne\: %2$.2f, 
écart\: %3$.2f) coser.business.control.error.minObservationCount=Nombre minimal d'observation non atteint coser.business.control.error.minObservationCountDetail=Nombre minimal d'observation non atteint (%s) \: %.2f coser.business.control.error.missingCatchHaulFromHaul=Traits du fichier traits absents dans le fichier captures @@ -92,6 +94,8 @@ coser.business.control.noerrorfound=Aucune erreur détectée coser.business.control.step.crossFileChech=Vérification par croisement de fichiers (%d%%) coser.business.control.step.diffCatchLength=Vérification des différences captures/taille (%d%%) +coser.business.control.step.lengthdeviation=Calcul de l'écart type \: %s (%d%%) +coser.business.control.step.lengthoutliers=Recherche des longueurs abérrantes \: %s (%d%%) coser.business.control.step.observation=Vérification du nombre d'observation \: %s (%d%%) coser.business.control.step.xworks=Validation par lignes \: %s (%d%%) coser.business.extract.creationdate=Date de création \: Modified: trunk/coser-business/src/test/java/fr/ifremer/coser/services/ControlServiceTest.java =================================================================== --- trunk/coser-business/src/test/java/fr/ifremer/coser/services/ControlServiceTest.java 2012-07-19 09:41:56 UTC (rev 1026) +++ trunk/coser-business/src/test/java/fr/ifremer/coser/services/ControlServiceTest.java 2012-07-19 14:07:21 UTC (rev 1027) @@ -5,7 +5,7 @@ * $Id$ * $HeadURL$ * %% - * Copyright (C) 2010 Ifremer, Codelutin, Chatellier Eric + * Copyright (C) 2010 - 2012 Ifremer, Codelutin, Chatellier Eric * %% * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as @@ -32,8 +32,10 @@ import org.junit.Assert; import org.junit.Test; +import fr.ifremer.coser.CoserBusinessException; import fr.ifremer.coser.CoserConstants.Category; import fr.ifremer.coser.bean.Control; +import fr.ifremer.coser.bean.Project; import fr.ifremer.coser.control.ControlError; import 
fr.ifremer.coser.data.Catch; import fr.ifremer.coser.data.Haul; @@ -42,7 +44,7 @@ import fr.ifremer.coser.storage.MemoryDataStorage; /** - * Test abour validation. + * Test about control service (validation...). * * @author chatellier * @version $Revision$ @@ -55,6 +57,7 @@ private static final Log log = LogFactory.getLog(ControlServiceTest.class); protected ControlService controlService = new ControlService(config); + protected ProjectService projectService = new ProjectService(config); /** * Test les validations sur les champs vide. @@ -164,4 +167,33 @@ Assert.assertFalse(controlService.isValidFishLength("1.3")); Assert.assertFalse(controlService.isValidFishLength("1.001")); } + + /** + * Test que la validation globales fonctionne. + * Méthode valid data qui passe tout les contôles. + * + * @throws CoserBusinessException + */ + @Test + public void testValidData() throws CoserBusinessException { + Project project = createTestProject(projectService, false); + List<ControlError> errors = controlService.validateData(project, project.getControl(), null); + + // 18 : xxx must contain at least x decimals + // 1 : Duplicated line for key : COSER_TEST|2010|TRAIT1|COSER_SPECIES2|i|NA|19.60| + // 8 : Differences between length and catch for species XXX + // 15 : Missing XXX tuple in catch + // 4 : Missing strata xxx in haul file + // 3 : Missing haul xxx in length file + // Total : 49 + if (log.isInfoEnabled()) { + for (ControlError error : errors) { + log.info(error.getLevel() + " " + error.getDetailMessage()); + } + } + + // il y a beaucoup d'erreurs car les jeux d'essai ne sont + // pas forcements cohérents + Assert.assertEquals(49, errors.size()); + } } Modified: trunk/coser-business/src/test/resources/log4j.properties =================================================================== --- trunk/coser-business/src/test/resources/log4j.properties 2012-07-19 09:41:56 UTC (rev 1026) +++ trunk/coser-business/src/test/resources/log4j.properties 2012-07-19 14:07:21 UTC (rev 
1027) @@ -5,7 +5,7 @@ # $Id$ # $HeadURL$ # %% -# Copyright (C) 2010 Ifremer, Codelutin, Chatellier Eric +# Copyright (C) 2010 - 2012 Ifremer, Codelutin, Chatellier Eric # %% # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as @@ -33,4 +33,3 @@ # Categories #log4j.category.fr.ifremer.coser=DEBUG -#log4j.category.com.opensymphony.xwork2=DEBUG