Predicting Mortality of ICU Patients: The PhysioNet/Computing in Cardiology Challenge 2012 1.0.0

File: <base>/sources/alistairewj_at_gmail.com/entry9/pnPreprocess.m (32,846 bytes)
function [ data_processed ] = pnPreprocess(data, verboseFlag)
%PNPREPROCESS	Output data after preprocessing
%   [ data_processed ] = pnPreprocess(data) applies field-specific cleaning
%   rules (unit corrections, sentinel replacement, deletion of implausible
%   values) to every field listed by pnDataDescriptions.
%
%   [ data_processed ] = pnPreprocess(data, verboseFlag) additionally
%   prints a summary of every modification when verboseFlag is true.
%   verboseFlag defaults to false and ONLY controls printing; the
%   preprocessing itself is always performed.
%
%   Inputs:
%       data        - Cell array data, rows = # of patients. Columns
%                     2:end hold per-patient vectors; column 4 is the
%                     value column operated on by the rules below.
%       verboseFlag - (optional) logical, print what was changed
%
%   Outputs:
%       data_processed - Cell array of the same layout as data, with
%                     corrected values and flagged entries removed.
%
% Primary variables:
%   data      - Cell array data, rows = # of patients
%   dataDesc  - Cell array of data fields/descriptions
%   dataFixed - Cell array of demographic data fields/descriptions
%   tmp       - Data for a single parameter, e.g. 'HR'
%   idxRem    - Index for data to be removed at end of the loop
%   idxManip  - Index for data to be manipulated some how in switch block
%   high/low  - Extracted data which is manipulated and re-imputed into tmp

%	Copyright 2012 Alistair Johnson

%	$LastChangedBy: alistair $
%	$LastChangedDate: 2012-05-29 09:01:11 -0400 (Tue, 29 May 2012) $
%	$Revision: 1 $
%	Originally written on PCWIN64 by Alistair Johnson, 25-Apr-2012 01:37:49
%	Contact: alistairewj@gmail.com

if nargin<2
    verboseFlag = false;
end

data_processed = data;
[dataDesc,dataFixed]  = pnDataDescriptions();

%=== Loop through the fixed, demographic fields
for k=1:size(dataFixed,1)
    fn = dataFixed{k,1};
    if verboseFlag; fprintf('\n%%=== %s ===%%\n', fn); end;
    
    [tmp,idx] = pnExtractField(data_processed,fn);
    
    %=== Reset delete indices
    idxRem = [];
    switch fn
        case 'RecordID'
            continue;
        case 'Age'
            %=== Cap ages above 100 at a single surrogate value
            idxManip = cellfun(@(x) x>100, tmp(:,4),'UniformOutput',false);
            high = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            high = cellfun(@(x) x*0+105, high, 'UniformOutput',false);
            
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = high{m};
            end
            if verboseFlag; fprintf('Replaced %2.0f values of 200 with 105.\n',sum(cell2mat(idxManip))); end;
            %=== Analyze residuals of data, check for bias
            % idxManip = cellfun(@(x) numel(x), tmp(:,4),'UniformOutput',false);
            % tmpDataAnalyze = cellfun(@(x,y) x(1:y-1) - x(y), tmp(:,4), idxManip, 'UniformOutput',false);
            % tmpDataAnalyze = cell2mat(tmpDataAnalyze(cellfun(@(x) ~isempty(x), tmpDataAnalyze)));
            % hist(tmpDataAnalyze,-10:1:10); xlabel('Age (1:end-1) - Age(end)');
            
            %=== Mark every entry except the last one ...
            idxManip = cellfun(@(x) [true(numel(x)-1,1);false], tmp(:,4),'UniformOutput',false);
            %=== ... impute 0s for them, and flag the 0s for deletion
            tmp(:,4) = cellfun(@(x,y) x-x.*y, tmp(:,4), idxManip, 'UniformOutput',false);
            
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'eq'); % index ==0 for removal
            
            if verboseFlag; fprintf('Deleted %2.0f vector values, leaving only the end value.\n',N); end;
        case 'Gender'
            %=== -1 is the "missing" sentinel
            idxManip = cellfun(@(x) (x==-1), tmp(:,4),'UniformOutput',false);
            high = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            high = cellfun(@(x) NaN, high, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = high{m};
            end
            if verboseFlag; fprintf('Replaced %2.0f values which were -1 with NaN.\n',sum(cell2mat(idxManip))); end;
        case 'Height'
            % 1 centimetre = 0.393700787 inches
            % 1 foot = 30.48 centimetres
            % 1 inch = 2.54 centimetres
            %=== -1 is the "missing" sentinel
            idxManip = cellfun(@(x) x==-1, tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) NaN, low, 'UniformOutput',false);
            
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            if verboseFlag; fprintf('Replaced %2.0f values which were -1 with NaN.\n',sum(cell2mat(idxManip))); end;
            
            %=== The rescaling steps below are applied in sequence, so the
            % order matters: each step sees the output of the previous one.
            idxManip = cellfun(@(x) x<10, tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*100, low, 'UniformOutput',false);
            
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            if verboseFlag; fprintf('Multipled %2.0f values of <10 by 100 (1.8->180).\n',sum(cell2mat(idxManip))); end;
            
            idxManip = cellfun(@(x) x<25, tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*10, low, 'UniformOutput',false);
            
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            if verboseFlag; fprintf('Multipled %2.0f values of <25 by 10 (18->180).\n',sum(cell2mat(idxManip))); end;
            
            %=== Remaining values below 100 are treated as inches
            idxManip = cellfun(@(x) x<100, tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*2.54, low, 'UniformOutput',false);
            
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            % Message now reports the factor actually applied (2.54).
            if verboseFlag; fprintf('Multipled %2.0f values of <100 by 2.54 (70.9->180).\n',sum(cell2mat(idxManip))); end;
            
            
            idxManip = cellfun(@(x) x>1000, tmp(:,4),'UniformOutput',false);
            high = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            high = cellfun(@(x) x*0.1, high, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = high{m};
            end
            if verboseFlag; fprintf('Multipled %2.0f values of >1000 by 0.1 (1800->180).\n',sum(cell2mat(idxManip))); end;
            
            idxManip = cellfun(@(x) x>250, tmp(:,4),'UniformOutput',false);
            high = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            high = cellfun(@(x) x/2.54, high, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = high{m};
            end
            if verboseFlag; fprintf('Multipled %2.0f values of >250 by 0.3937 (400->157).\n',sum(cell2mat(idxManip))); end;
            
    end
    %=== Replace data with tmp
    for m=2:size(data,2)
        data_processed(:,m) = cellfun(@pnPreprocessReplaceData, data_processed(:,m), tmp(:,m), idx, 'UniformOutput', false);
    end
    
    %=== Delete entries flagged for deletion
    if ~isempty(idxRem)
        for m=2:size(data,2)
            data_processed(:,m) = cellfun(@pnPreprocessDeleteData, data_processed(:,m), idxRem, 'UniformOutput', false);
        end
    end
end

%=== Loop through all the fields, and process each appropriately
for k=1:size(dataDesc,1)
    fn = dataDesc{k,1};
    % FIX: this verbose guard was previously left open, so everything below
    % (extraction, switch, replacement, deletion) only ran in verbose mode.
    if verboseFlag; fprintf('\n%%=== %s ===%%\n', fn); end;
    
    [tmp,idx] = pnExtractField(data_processed,fn);
    idxRem=[];
    switch fn
        case 'Albumin'
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;
        case 'ALP'
            %=== Possible conversion errors in ALP, ALT, AST:
            % Listed as measured in IU
            % 1 Katal = 60,000,000 IU
            % 1 micro Katal = 60 IU
            % 1 nano Katal = 0.060 IU
            % *** micro kats are sometimes used.
            
            % Reference range: 30 to 120 IU/L
            if verboseFlag; fprintf('No preprocessing performed - Note: highly skewed. Dependent on age and gender.\n'); end;
        case 'ALT'
            % Reference range: 7 to 56 IU/L
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;
        case 'AST'
            % Reference range: 5 to 40 IU/L
            % Very heavy tailed: Normal range 5 to 40 IU/L, 10%% data > 1000 IU/L.
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;
        case 'Bilirubin'
            % Reference range: 0.2 to 1.2 mg/dL
            % 1 mg/dL	== 17.1	µmol/L
            % It is possible to have >36 mg/dL in cases of transplant, etc.
            % Can't unilaterally fix this.
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;
        case 'BUN'
            % Reference range: 10-20 mg/dl (3.6-71 mmol/liter)
            % 1 mg/dL	== 0.357 mmol/L
            % Can't really convert anything here.
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'eq');
            
            if verboseFlag; fprintf('Deleted %2.0f values which were 0.\n',N); end;
            %             for m=2:size(tmp,2)
            %                 tmp(:,m) = cellfun(@(x,y) x(y), tmp(:,m), idxManip,'UniformOutput',false);
            %             end
        case 'Cholesterol'
            % Reference range:
            %   Desirable           <200 mg/dl      <5.17 mmol/L
            %   Borderline high      200-239 mg/dl	 5.17-6.18 mmol/L
            %   High                ≥240 mg/dl      ≥6.18 mmol/L
            % 1 mg/dL	== 0.0259	mmol/L
            % Nothing bad here.
            if verboseFlag; fprintf('No preprocessing needed.\n'); end;
        case 'Creatinine'
            % Reference range: 0.6-1.3  mg/dl
            %   <1.5 mg/dl (NEJM)
            %   1 mg/dL	== 88.4	µmol/L
            %===  ~6 is a reasonable maximum, 7.6 is very confident
            % Seems OK.
            if verboseFlag; fprintf('No preprocessing needed.\n'); end;
        case 'DiasABP'
            %=== First, delete '0's since we don't know if it was badly
            % converted or missing
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'eq');
            if verboseFlag; fprintf('Deleted %2.0f values which were 0.\n',N); end;
            
            [idxRem2,N] = pnPreprocessRemovalIndices(tmp, idx, -1, 'eq');
            if verboseFlag; fprintf('Deleted %2.0f values which were -1.\n',N); end;
            
            idxRem = cellfun(@(x,y) x | y, idxRem, idxRem2, 'UniformOutput',false);
            
            %=== Count (report only) values in (170,200): #(>170) - #(>=200)
            idxManip = cellfun(@(x) x>170, tmp(:,4),'UniformOutput',false);
            idxManip2 = cellfun(@(x) x<200, tmp(:,4),'UniformOutput',false);
            idxManip = cellfun(@(x) sum(x), idxManip);
            idxManip2 = cellfun(@(x) sum(~x), idxManip2);
            if verboseFlag; fprintf('There exist %2.0f values between 170-200, which were left as is.\n',sum(idxManip)-sum(idxManip2)); end;
            
            [idxRem2, N] = pnPreprocessRemovalIndices(tmp, idx, 200, 'gt');
            [idxRem] = cellfun(@(x,y) x | y, idxRem, idxRem2, 'UniformOutput',false);
            if verboseFlag; fprintf('Deleted %2.0f values above 200.\n',N); end;
        case 'FiO2'
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;
        case 'GCS'
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;
        case 'Glucose'
            % Reference ranges:
            % Fasting
            %   Normal               75-115 mg/dl	 4.2-6.4 mmol/L
            %   Diabetes mellitus	>125 mg/dl      >7.0 mmol/L
            %   2 Hr post-meal      <120 mg/dl      <6.7 mmol/L
            % 1 mg/dL	== 0.0555	mmol/L
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;
        case 'HCO3'
            % Reference ranges: 21-30 mEq/L	21-28 mmol/L
            % 1 mEq/L == 1 mmol/L
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;
        case 'HCT'
            % Reference range: 41.0-53.0%
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;
        case 'HR'
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 1.\n',N); end;
            
            [idxRem2,N] = pnPreprocessRemovalIndices(tmp, idx, 299, 'gt');
            idxRem = cellfun(@(x,y) x | y, idxRem, idxRem2, 'UniformOutput',false); % Combine removal indices
            if verboseFlag; fprintf('Deleted %2.0f values which above 299.\n',N); end;
        case 'K'
            % Reference ranges: 3.5-5.0 mEq/L	3.5-5.0 mmol/L
            % 1 mEq/L == 1 mmol/L
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;
        case 'Lactate' % mmol/L
            % Reference ranges: 5-15 mg/dl	0.6-1.7 mmol/liter
            % 1 mg/dL	== 0.111	mmol/L
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;
        case 'Mg' % mmol/L
            % Reference ranges: 1.8-3.0 mg/dl	0.8-1.2 mmol/L
            %1 mg/dL	== 0.411 mmol/L
            %1 mEq/L	== 0.50	 mmol/L
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;
        case 'MAP' % mmHg
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 1.\n',N); end;
        case 'MechVent'
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;
        case 'Na' % mEq/L
            % Reference ranges: 136-145 mEq/L	136-145 mmol/L
            % 1 mEq/L == 1 mmol/L
            %=== Interesting spike at 150, possible rounding bias?
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;
        case 'NIDiasABP' % mmHg
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 1.\n',N); end;
        case 'NIMAP' % mmHg
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 1.\n',N); end;
            
        case 'NISysABP' % mmHg
            % Values below 1??
            % NOTE(review): this removes values EQUAL to 1 ('eq'), while
            % the message and the sibling NIDiasABP/NIMAP rules say "less
            % than 1" ('lt'). Left unchanged because the intent cannot be
            % determined from this file - confirm against the data.
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'eq');
            if verboseFlag; fprintf('Deleted %2.0f values which less than 1.\n',N); end;
            
        case 'PaCO2' % mmHg
            % Reference range:
            % 4.7-6.0 kPa
            % 35-45 mmHg
            % 1 kPa == 7.5006 mmHg
            
            idxManip = cellfun(@(x) x<10 & x>1, tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*10, low, 'UniformOutput',false);
            
            % Print before the replacement so the original values are shown
            if verboseFlag; pnPreprocessPrintReplacedValues(tmp(:,4),low,idxManip); end;
            
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            if verboseFlag; fprintf('Replaced %2.0f values between [1,10] which were incorrectly recorded.\n',sum(cell2mat(idxManip))); end;
            
            % Values below 1 must be wrong...
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 1.\n',N); end;
            
            
            %             idxManip = cellfun(@(x) x<1, tmp(:,4),'UniformOutput',false);
            %             low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            %             low = cellfun(@(x) x*100, low, 'UniformOutput',false);
            %
            %             for m=1:size(tmp,1)
            %                 tmp{m,4}(idxManip{m}) = low{m};
            %             end
        case 'PaO2' % mmHg
            % Reference range:
            % 11-13 kPa
            % 75-100 mmHg
            % 1 kPa == 7.5006 mmHg
            idxManip = cellfun(@(x) x<20 & x>1, tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*10, low, 'UniformOutput',false);
            
            % Print before the replacement so the original values are shown
            if verboseFlag; pnPreprocessPrintReplacedValues(tmp(:,4),low,idxManip); end;
            
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            if verboseFlag; fprintf('Replaced %2.0f values between [1,20] which were incorrectly recorded.\n',sum(cell2mat(idxManip))); end;
            
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 1.\n',N); end;
            
        case 'pH'
            % Reference range:
            %   7.34-7.45 units
            
            %=== This is a mess
            % There are 5 values between 94-100, not sure why. Probably put
            % in the wrong field? Doesn't match with an [H+] conversion
            idxManip = cellfun(@(x) (x>0.65 & x<0.8), tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            % FIX: scale by 10 as the message states (0.7->7). The previous
            % factor 7.5006 (kPa->mmHg) mapped these values into (4.9,6.0),
            % which the (0.8,6.5) removal rule below then deleted.
            low = cellfun(@(x) x*10, low, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            if verboseFlag; fprintf('Multiplied %2.0f values by 10 (0.7->7).\n',sum(cell2mat(idxManip))); end;
            
            idxManip = cellfun(@(x) (x>65 & x<80), tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*0.1, low, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            if verboseFlag; fprintf('Multiplied %2.0f values by 0.1 (70->7).\n',sum(cell2mat(idxManip))); end;
            
            idxManip = cellfun(@(x) (x>650 & x<800), tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*0.01, low, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            if verboseFlag; fprintf('Multiplied %2.0f values by 0.01 (700->7).\n',sum(cell2mat(idxManip))); end;
            
            
            %=== Values <6.5 and >0.8 are deleted
            % Note: (there are no values around 0.6-0.75)
            [idxRem] = pnPreprocessRemovalIndices(tmp, idx, 0.8, 'gt');
            [idxRem2] = pnPreprocessRemovalIndices(tmp, idx, 6.5, 'lt');
            idxRem = cellfun(@(x,y) x & y, idxRem, idxRem2, 'UniformOutput',false);
            N = sum(cell2mat(cellfun(@(x) sum(x), idxRem, 'UniformOutput', false)));
            if verboseFlag; fprintf('Deleted %2.0f values which were between [0.8,6.5].\n',N); end;
            
            %=== Values >80 & <650 are deleted
            % Note: (there are no values around 0.6-0.75)
            [idxRem3] = pnPreprocessRemovalIndices(tmp, idx, 80, 'gt');
            [idxRem4] = pnPreprocessRemovalIndices(tmp, idx, 650, 'lt');
            idxRem = cellfun(@(x,y,z) (x & y) | z, idxRem3, idxRem4, idxRem, 'UniformOutput',false);
            N = sum(cell2mat(cellfun(@(x,y) sum(x&y), idxRem3, idxRem4, 'UniformOutput', false)));
            if verboseFlag; fprintf('Deleted %2.0f values which were between [80,650].\n',N); end;
        case 'Platelets' % cells/nL
            % Reference range: 150-350 (10e3)/mm^3
            % 1 (10e3)/µL == 1 (10e3)/mm^3 == 1/nL
            % Errors are probably in orders of 1000
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;
        case 'RespRate' % bpm
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 1.\n',N); end;
        case 'SaO2' % %
            % Reference range: 94-100
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;
        case 'SysABP' % mmHg
            % Reference range:
            % 10-14     kPa
            % 75-105	mmHg
            % 1 kPa == 7.5006 mmHg
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 1.\n',N); end;
        case 'Temp'
            %=== Check for Farenheit measurements and incorrect conversions
            %=== There are 129 values which are too low...
            
            %=== Assume some of the values are artefacts
            
            %=== INCORRECTLY CONVERTED F->C
            % 1x range should be ~ [1.5,8], use (1,10)
            idxManip = cellfun(@(x) x<10 & x>1, tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*9/5+32, low, 'UniformOutput',false);
            
            %=== spooky action at a distance
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            
            % 2x range should be ~ [-16.8,-13], use (-17,-13)
            % FIX: build the mask from the full value vectors (tmp(:,4)),
            % not from the previously extracted subset 'low', so that it
            % lines up with the data it is used to index.
            idxManip2 = cellfun(@(x) x<=-13 & x>-17, tmp(:,4), 'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip2,'UniformOutput',false);
            low = cellfun(@(x) (x*9/5+32)*9/5+32, low, 'UniformOutput',false);
            
            %=== spooky action at a distance
            % FIX: write into the value column (4); tmp{m} addressed column 1.
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip2{m}) = low{m};
            end
            
            
            %=== RECORDED AS F
            % 1x range should be ~ [95,113]
            idxManip3 = cellfun(@(x) x>90 & x<120, tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip3,'UniformOutput',false);
            low = cellfun(@(x) (x-32)*5/9, low, 'UniformOutput',false);
            
            %=== spooky action at a distance
            % FIX: write into the value column (4); tmp{m} addressed column 1.
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip3{m}) = low{m};
            end
            
            
            %=== DELETING VALUES NOW ===%
            %=== Delete 0s and values below -17s (0s and converted 0s)
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'eq');
            if verboseFlag; fprintf('Deleted %2.0f values which were equal to 0.\n',N); end;
            
            [idxRem2,N] = pnPreprocessRemovalIndices(tmp, idx, -17, 'lt');
            if verboseFlag; fprintf('%2.0f of those values were -17.8 (i.e. incorrectly converted from 0).\n',N); end;
            
            %=== Negative values which don't seem erroneously converted
            [idxRem3] = pnPreprocessRemovalIndices(tmp, idx, -13, 'ge');
            [idxRem4] = pnPreprocessRemovalIndices(tmp, idx, 1, 'le');
            idxRem3 = cellfun(@(x,y) x & y, idxRem3, idxRem4, 'UniformOutput',false);
            N = sum(cell2mat(cellfun(@(x) sum(x), idxRem3, 'UniformOutput', false)));
            if verboseFlag; fprintf('Deleted %2.0f values which were between [-13,1].\n',N); end;
            
            
            %=== Positive values which don't seem erroneously converted
            [idxRem4] = pnPreprocessRemovalIndices(tmp, idx, 50, 'ge');
            [idxRem5] = pnPreprocessRemovalIndices(tmp, idx, 90, 'le');
            idxRem4 = cellfun(@(x,y) x & y, idxRem4, idxRem5, 'UniformOutput',false);
            N = sum(cell2mat(cellfun(@(x) sum(x), idxRem4, 'UniformOutput', false)));
            if verboseFlag; fprintf('Deleted %2.0f values which were between [50,90].\n',N); end;
            
            [idxRem5,N] = pnPreprocessRemovalIndices(tmp, idx, 119, 'gt');
            if verboseFlag; fprintf('Deleted %2.0f values which were between above 119.\n',N); end;
            
            
            idxRem = cellfun(@(l,m,f,a,o) l | m | f | a | o, idxRem, idxRem2, idxRem3, idxRem4, idxRem5, 'UniformOutput',false);
            
            if verboseFlag; fprintf('Replaced %2.0f values which were incorrectly converted F->C.\n',sum(cell2mat(idxManip))); end;
            if verboseFlag; fprintf('Replaced %2.0f values which were incorrectly converted F->C twice.\n',sum(cell2mat(idxManip2))); end;
            % FIX: report the Fahrenheit mask (idxManip3), not idxManip2.
            if verboseFlag; fprintf('Replaced %2.0f values which were incorrectly recorded as F.\n',sum(cell2mat(idxManip3))); end;
            
        case 'TroponinI'
            % Reference ranges: 0-0.4 µg/L
            % Could be off by orders of 10
            idxManip = cellfun(@(x) (x>30), tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*0.1, low, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            % Message now reports the factor actually applied (0.1).
            if verboseFlag; fprintf('Multiplied %2.0f values by 0.1 (30->3).\n',sum(cell2mat(idxManip))); end;
        case 'TroponinT'
            % Reference ranges: 0-0.1 µg/L
            % Could be off by orders of 10.
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;
        case 'Urine'
            if verboseFlag; fprintf('No preprocessing performed.\n'); end;
        case 'WBC' % cells/nL
            % Reference range: 4.5-11.0
            % 1 (10^3)/µL	== 1 (10^9)/L
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 1, 'lt');
            
            % NOTE(review): the x10 candidates below are only printed in
            % verbose mode; they are never written back into tmp, and the
            % corresponding entries are deleted via idxRem above.
            idxManip = cellfun(@(x) x<1 & x>0, tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) x*10, low, 'UniformOutput',false);
            
            if verboseFlag; pnPreprocessPrintReplacedValues(tmp(:,4),low,idxManip); end;
            
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 1.\n',N); end;
        case 'Weight' % kg
            % Reference range:
            %   86.6 kg (190.9 lb) (males)
            %   74.4 kg (164.0 lb) (females)
            % 1 kilogram = 2.20462262 pounds, 1 pound = 0.45359237 kilograms
            % 1 kilogram = 0.15747304 stones, 1 stone = 6.35029318 kilograms
            
            %=== Impute NaN in the first weight, delete the others
            idxManip = cellfun(@(x) ( (x+1)<1e-6 ), tmp(:,4),'UniformOutput',false);
            low = cellfun(@(x,y) x(y), tmp(:,4),idxManip,'UniformOutput',false);
            low = cellfun(@(x) NaN, low, 'UniformOutput',false);
            for m=1:size(tmp,1)
                tmp{m,4}(idxManip{m}) = low{m};
            end
            if verboseFlag; fprintf('Replaced %2.0f values which were -1 with NaN.\n',sum(cell2mat(idxManip))); end;
            
            [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'eq');
            if verboseFlag; fprintf('Deleted %2.0f values which were 0.\n',N); end;
            
            %=== The following values continually pop up:
            % 0.6 is on patient 3889 (subid 142393)
            % Their weight is constant at 70, then becomes 0.6 at 467 min
            [idxRem2,N] = pnPreprocessRemovalIndices(tmp, idx, 35, 'lt');
            if verboseFlag; fprintf('Deleted %2.0f values which were less than 35.\n',N); end;
            
            [idxRem3,N] = pnPreprocessRemovalIndices(tmp, idx, 299, 'gt');
            if verboseFlag; fprintf('Deleted %2.0f values which were greater than 299.\n',N); end;
            idxRem = cellfun(@(f,a,t) f | a | t, idxRem, idxRem2, idxRem3, 'UniformOutput',false);
    end
    
    %=== Replace data with tmp
    for m=2:size(data,2)
        data_processed(:,m) = cellfun(@pnPreprocessReplaceData, data_processed(:,m), tmp(:,m), idx, 'UniformOutput', false);
    end
    
    %=== Remove deleted data from 'data'
    if ~isempty(idxRem)
        for m=2:size(data,2)
            data_processed(:,m) = cellfun(@pnPreprocessDeleteData, data_processed(:,m), idxRem, 'UniformOutput', false);
        end
    end
    
end

end

function [data] = pnPreprocessDeleteData(data,idx)
%PNPREPROCESSDELETEDATA	Deletes data in "data" at idx (used with cellfun)
%	[ data ] = pnPreprocessDeleteData(data,idx) removes the elements of
%	data selected by idx and returns the shortened array. It is written as
%	a named function so it can be passed to cellfun and applied to one
%	cell of a data column at a time.
%
%	Inputs:
%		data - Vector of data
%		idx  - Index for data to be removed (the callers in this file pass
%		       the masks produced by pnPreprocessRemovalIndices)
%
%	Outputs:
%		data - Data with elements at idx deleted
%
%
%	Example
%		data(:,m) = cellfun(@pnPreprocessDeleteData, data(:,m), idx, 'UniformOutput', false);
%
%	See also PNPREPROCESS, PNPREPROCESSREMOVALINDICES

%	Copyright 2012 Alistair Johnson

% Indexed deletion: assigning [] removes the selected elements in place.
data(idx)=[];

end

function [idxOut, N] = pnPreprocessRemovalIndices(tmp, idx, val, equalityFcnStr)
%PNPREPROCESSREMOVALINDICES	Generates indices to be used to remove data
%	[idxOut] = pnPreprocessRemovalIndices(tmp, idx, val, equalityFcnStr)
%       finds the entries of tmp(:,4) for which the comparison named by
%       equalityFcnStr holds against val (e.g. 'eq' with val 0 selects the
%       entries equal to 0) and maps those hits back onto the positions
%       of the original data cell by means of idx. The result is meant to
%       be handed to pnPreprocessDeleteData afterwards.
%
%   [idxOut,N] = pnPreprocessRemovalIndices(tmp, idx, val, equalityFcnStr)
%   also outputs the number of entries being deleted.
%
%	Inputs:
%		tmp     - Cell array with data only from a given field; the
%           values compared are taken from column 4
%       idx     - Cell array of indices that were used to extract tmp
%           from the original data cell array (one cell per row of tmp)
%       val     - A value used for comparison with tmp
%		equalityFcnStr - Name of the two-argument function used to
%           compare the data in tmp to val, e.g. 'eq', 'lt', 'gt'
%
%	Outputs:
%		idxOut  - Indices of data cell array to be deleted
%		N       - Number of entries to be deleted
%
%	Example:
%       data = pnLoadTextFilesCell([bpath 'set-a']);
%       [tmp,idx] = pnExtractField(data,'HR');
%       [idxRem,N] = pnPreprocessRemovalIndices(tmp, idx, 0, 'eq');
%       for m=2:4
%           data(:,m) = cellfun(@pnPreprocessDeleteData,...
%               data(:,m), idxRem, 'UniformOutput', false);
%       end
%       fprintf('Deleted %2.0f values which were 0.\n',N);
%
%
%	See also PNPREPROCESS, PNPREPROCESSDELETEDATA

%	Copyright 2012 Alistair Johnson

compareFcn = str2func(equalityFcnStr);

nRows   = size(tmp,1);
idxOut  = cell(nRows,1);
nPerRow = cell(nRows,1);

for r = 1:nRows
    % Hits within the extracted field data of this row ...
    hits = compareFcn(tmp{r,4}, val);
    % ... expanded to the layout of the full data vector
    idxOut{r}  = pnSubfcnRemIdx(idx{r}, hits);
    nPerRow{r} = sum(idxOut{r});
end

N = sum(cell2mat(nPerRow));

end

function [idx] = pnSubfcnRemIdx(idx, idxRem)
%PNSUBFCNREMIDX	Map removal flags of a field subset onto the full index
%	[idx] = pnSubfcnRemIdx(idx, idxRem) overwrites the elements of idx
%	that idx itself selects with the values in idxRem and returns the
%	result. Used via cellfun in PNPREPROCESSREMOVALINDICES.
%
%	NOTE(review): presumably idx is a logical mask marking where one
%	field lives in a patient's data vector, so that the output is true
%	only at entries that belong to the field AND are flagged in idxRem -
%	confirm against pnExtractField.
idx(idx) = idxRem;
end

function [data] = pnPreprocessReplaceData(data,tmp,idx)
%PNPREPROCESSREPLACEDATA	Replace data in "data" with "tmp"
%	[data] = pnPreprocessReplaceData(data,tmp,idx) replaces values in data
%	with values in tmp using indices idx to map tmp to data. This function
%	is used with cellfun in PNPREPROCESS.
%
%
%	Inputs:
%		data    - Vector of data
%       tmp     - Vector of data after some form of preprocessing
%		idx     - Indices mapping tmp into data
%
%	Outputs:
%		data    - Vector of data with tmp imputed within
%
%
%	Example
%		data(:,m) = cellfun(@pnPreprocessReplaceData, data(:,m), tmp(:,m), idx, 'UniformOutput', false);
%
%	See also PNPREPROCESS

% Indexed assignment: the elements of data selected by idx are overwritten
% with tmp.
data(idx)=tmp;
end


function [] = pnPreprocessPrintReplacedValues(orig,new,idx)
%PNPREPROCESSPRINTREPLACEDVALUES	Prints values which were modified to
%command window.
%	pnPreprocessPrintReplacedValues(orig,new,idx) prints one line per
%	modified value, showing the original value followed by its
%	replacement.
%
%	Inputs:
%       orig    - Cell array of data vectors before modification
%		new     - Cell array holding, per cell, only the replacement
%                 values, in the order in which idx selects them
%		idx     - Cell array of masks marking the modified positions in
%                 the corresponding cell of orig
%
%	Outputs:
%       None. A list of "old --> new" pairs is written to the command
%       window.
%
%	Example
%		pnPreprocessPrintReplacedValues(tmp(:,4),low,idxManip);
%
%	See also PNPREPROCESS

%=== Locate the cells which contain at least one changed value
hasChange    = cellfun(@(x) ~isempty(x) && any(x), idx);
cellsToPrint = find(hasChange==1);

for c=1:numel(cellsToPrint)
    row = cellsToPrint(c);
    %=== Positions of the changed values inside this cell
    pos = find(idx{row}==1);
    for j=1:numel(pos)
        % orig is addressed by position, new by running count
        oldVal = orig{row}(pos(j));
        newVal = new{row}(j);
        fprintf('%3.2f --> %3.2f\n', oldVal, newVal);
    end
end

end