Predicting Mortality of ICU Patients: The PhysioNet/Computing in Cardiology Challenge 2012 1.0.0

File: <base>/sources/alistairewj_at_gmail.com/entry4/pnSubsampleData.m (8,813 bytes)
function [ raw, raw_header ] = pnSubsampleData(data, minutes, type, desiredFeats)
%PNSUBSAMPLEDATA	Subsample data by taking the extreme value in X minutes
%	[ raw, raw_header ] = pnSubsampleData(data, minutes) builds a numeric
%	matrix with one row per observation (patient) and one column per
%	extracted feature. Time-varying features are subsampled by splitting
%	the record into consecutive windows of 'minutes' minutes and keeping
%	the lowest (or highest) value seen in each window.
%
%	Inputs:
%		data        - Either a cell array whose first column holds the
%                   observation names and whose remaining three columns
%                   hold, per observation, the time stamps, the feature
%                   names and the feature values; or a structure with
%                   one field per observation.
%       minutes     - Positive scalar. Width of each time window (e.g.,
%                   24*60 for one day). Windows start at 0 and cover the
%                   first 48 hours.
%       type        - Whether to pick the lowest or highest value in each
%                   time period
%           'low' (default) - Extract the lowest value in each time period
%           'high' - Extract the highest value in each time period
%                   (any value other than 'low' is treated as 'high')
%       desiredFeats - Optional cell array of strings specifying that only
%                   a subset of features should be extracted, e.g., specify
%                   {'Urine','pH'} to only extract subsampled data for
%                   urine and pH. Names which are not known features are
%                   ignored.
%
%	Outputs:
%		raw         - Double matrix which is Nx(F+P*D) (N observations, F
%		fixed features, D temporal features, P time periods). The fixed
%		feature columns come first, followed by the D temporal features of
%		the first period, then those of the second period, and so on.
%		Temporal features with no measurement in a period are NaN.
%       raw_header  - Cell array of strings naming the columns of raw.
%       Temporal names have the start of their period appended (e.g.,
%       pH_60 followed later by pH_120 means that pH_60 is the lowest or
%       highest pH value with 60 <= time < 120 minutes for each patient).
%       The first period also collects anything earlier than the second
%       period start, and the last period anything at or after its own
%       start.
%
%	Notes:
%       'Urine' values are summed within each period rather than
%       minimised/maximised.
%       For 'Weight' only the first entry is treated as a fixed feature;
%       later entries stay available as temporal data. For the other fixed
%       features the last entry is used and all entries are removed from
%       the temporal data.
%
%	Example
%       %=== Load data in
%       load('data_processed_cell.mat');
%
%       %=== Choose features to extract from
%       header_extract = {'Urine','Platelets','BUN','Creatinine','PaO2'};
%
%       %=== Extract highest values over 48 hours
%       [high,header_H] = pnSubsampleData(data, 60*48,'high',header_extract); % Highest data for 48 hours
%
%	See also PNLOADTEXTFILESCELL PNEXTRACTFIELD PNMAIN

%	References:
%       Physionet Challenge 2012
%       http://physionet.org/challenge/2012/

%	Copyright 2012 Alistair Johnson

%	$LastChangedBy: alistair $
%	$LastChangedDate: 2012-04-24 22:39:24 +0100 (Tue, 24 Apr 2012) $
%	$Revision: 342 $
%	Originally written on GLNXA64 by Alistair Johnson, 25-Jan-2012 18:48:19
%	Contact: alistairewj@gmail.com

if nargin<4
    desiredFeats = [];
end

if nargin<3
    type = 'low';
end

%=== A non-positive step would give an empty window list and fail later
%    with an unrelated indexing error, so reject it up front
if ~isnumeric(minutes) || ~isscalar(minutes) || ~(minutes>0)
    error('pnSubsampleData:invalidMinutes',...
        'minutes must be a positive numeric scalar.');
end

if isstruct(data)
    fn = fieldnames(data);
else
    fn = data(:,1);
end
Nfn = length(fn);

maxTime = 48*60; % Maximum time for data is 48 hours in the ICU
nSamples = 0:minutes:maxTime; % Start time (minutes) of each window
if nSamples(end)==maxTime
    % A window starting exactly at maxTime would be empty, drop it
    nSamples=nSamples(1:end-1);
end
nWin = length(nSamples);

%=== convert to cell array of strings, e.g. '_0', '_60', ...
sampStrings = arrayfun(@(x) ['_' num2str(x)], nSamples,'UniformOutput',false);

%=== Variables which do not change temporally (one column each)
fixedVars = {'RecordID','Age','Gender','Height','Weight'};

%=== Variables which get one column per time window
tempVars = {'ALP';'ALT';'AST';'Albumin';'BUN';'Bilirubin';'Cholesterol';'Creatinine';'DiasABP';'FiO2';'GCS';'Gender';'Glucose';'HCO3';'HCT';'HR';'Height';'K';'Lactate';'MAP';'MechVent';'Mg';'NIDiasABP';'NIMAP';'NISysABP';'Na';'PaCO2';'PaO2';'Platelets';'RecordID';'RespRate';'SaO2';'SysABP';'Temp';'TroponinI';'TroponinT';'Urine';'WBC';'Weight';'pH';};

%=== If user specified certain features, remove the others
if ~isempty(desiredFeats)
    idxFixedRemove = true(size(fixedVars));
    idxTemporalRemove = true(size(tempVars));
    for k=1:numel(desiredFeats)
        idxFixedRemove(strcmp(fixedVars,desiredFeats{k})) = false;
        idxTemporalRemove(strcmp(tempVars,desiredFeats{k})) = false;
    end
    
    fixedVars(idxFixedRemove) = [];
    tempVars(idxTemporalRemove) = [];
end

%=== Build the temporal header: a (variables x windows) grid of names,
%    flattened column-wise so that all variables of window 1 come first
temporalVars = strcat(repmat(tempVars,1,size(sampStrings,2)),...
    repmat(sampStrings,size(tempVars,1),1));
temporalVars = reshape(temporalVars,1,size(temporalVars,1)*size(temporalVars,2));
nVar = length(tempVars);

%=== Create a cell array (pointer) for all patients
%    Columns used below: 1 = time stamps, 2 = feature names, 3 = values
if isstruct(data)
    % NOTE(review): struct2cell returns one cell per field; the code below
    % indexes patData(:,2) and patData(:,3), so this path presumably needs
    % the fields to be unpacked into three columns - confirm with a caller.
    patData = structfun(@(x) x(1:end,:),data,'UniformOutput',false);
    patData=struct2cell(patData);
else
    patData = data(:,2:end);
end

%=== Pre-allocate
raw = zeros(Nfn,length(fixedVars)+length(temporalVars));
raw_header = [fixedVars,temporalVars];

%=== Extract the fixed variables, scanning every entry of each patient
for v=1:length(fixedVars)
    %=== Logical mask, per patient, of the entries holding this variable
    idxRemove = cellfun(@(x) strcmp(x,fixedVars{v}), patData(:,2),'UniformOutput',false);
    
    %=== Patients for which the variable was found at least once
    idxFound = cellfun(@(x) any(x), idxRemove);
    idxUse=cell(size(idxFound));
    
    %=== Extract a single index to use for data
    % Only one data value will be used, but multiple entries may be
    % removed. For example, while there may be 3 entries for "Age", only
    % the final "Age" value is extracted
    if strcmp(fixedVars{v},'Weight')
        idxUse(idxFound) = cellfun(@(x) find(x,1,'first'), idxRemove(idxFound),'UniformOutput',false);
        % Only remove the first weight value, later ones stay temporal
        idxRemove = cellfun(@removeFirstWeight, idxRemove, idxUse,'UniformOutput',false);
    else
        idxUse(idxFound) = cellfun(@(x) find(x,1,'last'), idxRemove(idxFound),'UniformOutput',false);
    end
    
    %=== Add data to raw (patients without the variable keep the 0 from
    %    the pre-allocation)
    tmp = cellfun(@(x,y) x(y), patData(:,3), idxUse,'UniformOutput',false);
    raw(idxFound,v) = cell2mat(tmp(idxFound));
    
    %=== Remove the consumed entries from every column of patData
    idxKeep = cellfun(@(x) ~x, idxRemove,'UniformOutput',false);
    for k=1:size(patData,2)
        patData(:,k) = cellfun(@(x,y) x(y), patData(:,k), idxKeep,'UniformOutput',false);
    end
end

%=== Cycle through each temporal split, create an index
% Window t selects nSamples(t) <= time < nSamples(t+1). The first window
% has no lower bound and the last window has no upper bound, so that no
% measurement is silently dropped and no measurement is counted twice.
idxSamples=cell(Nfn,nWin);
for t=1:nWin
    if t==1
        tLow = -Inf;
    else
        tLow = nSamples(t);
    end
    if t==nWin
        tHigh = Inf;
    else
        tHigh = nSamples(t+1);
    end
    idxSamples(:,t) = cellfun(@(x) x>=tLow & x<tHigh, patData(:,1),'UniformOutput',false);
end

%=== Calculate for each different split (e.g. every 24*60 minutes..)
pickLowest = strcmp(type,'low');
for t=1:nWin
    %=== For each time window, extract data relevant
    currData = cell(size(patData,1),2);
    currData(:,1) = cellfun(@(x,y) x(y), patData(:,2), idxSamples(:,t),'UniformOutput',false); % feature names
    currData(:,2) = cellfun(@(x,y) x(y), patData(:,3), idxSamples(:,t),'UniformOutput',false); % feature data
    
    %=== Then scan for each individual variable
    for v=1:nVar
        %=== Extract indices of the variable desired
        varDataTemp = cellfun(@(x) strcmp(x,tempVars{v}), currData(:,1),'UniformOutput',false);
        %=== Extract the value of each occurence of that variable in the
        %    window (transposed so the reductions below run along dim 2)
        varDataTemp = cellfun(@(x,y) x(y)', currData(:,2), varDataTemp,'UniformOutput',false);
        
        %=== Replace each empty cell with NaN, the missing flag
        idxEmpty = cellfun(@(x) isempty(x), varDataTemp);
        varDataTemp(idxEmpty) = {NaN};
        
        %=== Column of raw: fixed columns first, then window-major order
        idxData = (t-1)*nVar+v+numel(fixedVars);
        
        %=== *** SPECIAL CASE ***
        if strcmp(tempVars{v},'Urine')
            %=== Urine values should be summed
            varDataTemp = cellfun(@(x) sum(x,2),varDataTemp,'UniformOutput',false);
        end
        
        %=== Take the minimum (or maximum) value as data to add to raw
        if pickLowest
            raw(:,idxData) = cellfun(@(x) min(x,[],2), varDataTemp);
        else
            raw(:,idxData) = cellfun(@(x) max(x,[],2), varDataTemp);
        end
    end
end
end

function [x] = removeFirstWeight(x,y)
%REMOVEFIRSTWEIGHT	Build a removal mask which flags only the entry at y
%	Returns a logical array the same size as x which is false everywhere
%	except at index (or indices) y. An empty y gives an all-false mask.
onlySelected = false(size(x));
onlySelected(y) = true;
x = onlySelected;
end