% Predicting Mortality of ICU Patients: The PhysioNet/Computing in Cardiology Challenge 2012, version 1.0.0
% (8,813 bytes)
function [ raw, raw_header ] = pnSubsampleData(data, minutes, type, desiredFeats)
%PNSUBSAMPLEDATA Subsample data by taking the worst value in X minutes
%   [ raw, raw_header ] = pnSubsampleData(data, minutes) creates a double
%   matrix with one row per observation (patient) and one column per
%   extracted feature. Temporal features are generated by splitting the
%   first 48 hours into windows of 'minutes' minutes and keeping the lowest
%   (or highest) value recorded inside each window.
%
%   Inputs:
%       data         - Either a cell array whose first column holds the
%                      observation names and whose remaining three columns
%                      hold, per observation, the time stamps (minutes),
%                      the variable names and the values; or a scalar
%                      structure with one field per observation.
%                      NOTE(review): for struct input each field is assumed
%                      to be a 1x3 cell row {times, names, values} - confirm
%                      against the loader that produces 'data'.
%       minutes      - Positive scalar, length of each time window in
%                      minutes (e.g., 24*60 for one day).
%       type         - Whether to pick the lowest or highest value in each
%                      time period
%                       'low' (default) - lowest value in each time period
%                       'high'          - highest value in each time period
%                      Any string other than 'low' is treated as 'high'.
%       desiredFeats - Optional cell array of strings (or a single string)
%                      restricting the extracted features, e.g., specify
%                      {'Urine','pH'} to only extract subsampled data for
%                      urine and pH. Empty (default) extracts everything.
%
%   Outputs:
%       raw        - Double matrix which is Nx(F + P*D) (N observations,
%                    F fixed variables, D temporal features, P time
%                    periods). Missing values are NaN.
%       raw_header - 1x(F + P*D) cell array of column names for raw. The
%                    fixed variables come first. Temporal columns have the
%                    start time of their window appended (e.g., pH_60
%                    contains pH values from 60 minutes onwards; if it is
%                    followed later in the header by pH_120, then pH_60 is
%                    the lowest or highest pH value in [60,120) minutes for
%                    each patient). The last window has no upper limit.
%
%   Notes:
%       * 'Urine' values are summed within each window before min/max.
%       * Entries consumed as fixed variables are removed before the
%         temporal pass (for 'Weight' only the first entry is removed), so
%         e.g. Gender_*, Height_* and RecordID_* columns are NaN whenever
%         the matching fixed variable is extracted as well.
%
%   Example
%       %=== Load data in
%       load('data_processed_cell.mat');
%
%       %=== Choose features to extract from
%       header_extract = {'Urine','Platelets','BUN','Creatinine','PaO2'};
%
%       %=== Extract highest values over 48 hours
%       [high,header_H] = pnSubsampleData(data, 60*48,'high',header_extract);
%
%   See also PNLOADTEXTFILESCELL PNEXTRACTFIELD PNMAIN

%   References:
%       Physionet Challenge 2012
%       http://physionet.org/challenge/2012/
%
%   Copyright 2012 Alistair Johnson
%   $LastChangedBy: alistair $
%   $LastChangedDate: 2012-04-24 22:39:24 +0100 (Tue, 24 Apr 2012) $
%   $Revision: 342 $
%   Originally written on GLNXA64 by Alistair Johnson, 25-Jan-2012 18:48:19
%   Contact: alistairewj@gmail.com

%=== Defaults for optional arguments
if nargin<4
    desiredFeats = [];
end
if nargin<3
    type = 'low';
end

%=== Fail early with a clear message; a non-positive step would otherwise
%    produce an empty window grid and an obscure indexing error below
if ~(isnumeric(minutes) && isscalar(minutes) && minutes>0)
    error('pnSubsampleData:invalidMinutes', ...
        'minutes must be a positive numeric scalar.');
end

%=== Accept a single feature name given as a plain string
if ischar(desiredFeats)
    desiredFeats = {desiredFeats};
end

%=== Observation names (only their count is needed below)
if isstruct(data)
    fn = fieldnames(data);
else
    fn = data(:,1);
end
Nfn = numel(fn);

maxTime = 48*60; % Maximum time for data is 48 hours in the ICU
nSamples = 0:minutes:maxTime; % Start time of each window
if nSamples(end)==maxTime
    % Do not include last value: no window starts at the end of the stay
    nSamples = nSamples(1:end-1);
end
nWin = numel(nSamples);

%=== Suffix appended to each temporal feature name, e.g. '_60'
sampStrings = arrayfun(@(x) ['_' num2str(x)], nSamples,'UniformOutput',false);

%=== The 5 variables which do not change temporally
fixedVars = {'RecordID','Age','Gender','Height','Weight'};

%=== Variables for which a feature is created in every time window
tempVars = {'ALP';'ALT';'AST';'Albumin';'BUN';'Bilirubin';'Cholesterol';'Creatinine';'DiasABP';'FiO2';'GCS';'Gender';'Glucose';'HCO3';'HCT';'HR';'Height';'K';'Lactate';'MAP';'MechVent';'Mg';'NIDiasABP';'NIMAP';'NISysABP';'Na';'PaCO2';'PaO2';'Platelets';'RecordID';'RespRate';'SaO2';'SysABP';'Temp';'TroponinI';'TroponinT';'Urine';'WBC';'Weight';'pH';};

%=== If user specified certain features, remove the others
if ~isempty(desiredFeats)
    fixedVars = fixedVars(ismember(fixedVars, desiredFeats));
    tempVars  = tempVars(ismember(tempVars, desiredFeats));
end
nVar = numel(tempVars);

%=== Temporal header: all variables of window 1, then window 2, ...
%    (column-major reshape of a nVar x nWin grid), matching idxData below
if nVar==0
    temporalVars = cell(1,0);
else
    temporalVars = strcat(repmat(tempVars,1,nWin), repmat(sampStrings,nVar,1));
    temporalVars = reshape(temporalVars,1,[]);
end

%=== Create a cell array for all patients: columns are time / name / value
if isstruct(data)
    patData = struct2cell(data);
    % struct2cell returns an Nfn x 1 cell. When every field is itself a
    % cell row, stack the rows so that patData is Nfn x K like the cell
    % input; otherwise the column indexing below cannot work.
    if ~isempty(patData) && all(cellfun(@(c) iscell(c) && size(c,1)==1, patData))
        patData = vertcat(patData{:});
    end
else
    patData = data(:,2:end);
end

%=== Pre-allocate with NaN, the missing flag, so that a fixed variable
%    which is absent for a patient is not reported as a genuine 0
raw = NaN(Nfn, numel(fixedVars)+numel(temporalVars));
raw_header = [fixedVars,temporalVars];

%=== First extract the "fixed variables"
for v=1:numel(fixedVars)
    %=== Per patient, logical mask of the entries holding this variable
    isVar = cellfun(@(names) strcmp(names,fixedVars{v}), patData(:,2),'UniformOutput',false);
    %=== Patients for which the variable was found at least once
    isFound = cellfun(@(m) any(m(:)), isVar);
    idxUse = cell(size(isFound));

    %=== Extract a single index to use for data
    %   Only one data value will be used, but multiple entries may be
    %   removed. For example, while there may be 3 entries for "Age", only
    %   the final "Age" value is extracted and all 3 are removed.
    if strcmp(fixedVars{v},'Weight')
        % Weight: use and remove only the first entry, later weights stay
        % available as temporal data
        idxUse(isFound) = cellfun(@(m) find(m,1,'first'), isVar(isFound),'UniformOutput',false);
        idxRemove = cellfun(@removeFirstWeight, isVar, idxUse,'UniformOutput',false);
    else
        idxUse(isFound) = cellfun(@(m) find(m,1,'last'), isVar(isFound),'UniformOutput',false);
        idxRemove = isVar;
    end

    %=== Add data to raw (skipped when nobody has the variable, so that no
    %    empty right-hand side is ever assigned)
    if any(isFound)
        vals = cellfun(@(x,y) x(y), patData(:,3), idxUse,'UniformOutput',false);
        raw(isFound,v) = cell2mat(vals(isFound));
    end

    %=== Remove the consumed entries from every column of patData
    idxKeep = cellfun(@(m) ~m, idxRemove,'UniformOutput',false);
    for k=1:size(patData,2)
        patData(:,k) = cellfun(@(x,y) x(y), patData(:,k), idxKeep,'UniformOutput',false);
    end
end

%=== For each window build, per patient, a mask of the entries inside it.
%    Windows are half-open [start, nextStart); the final one is open-ended
%    so that samples at or after its start are not lost.
idxSamples = cell(Nfn,nWin);
for t=1:nWin
    tStart = nSamples(t);
    if t<nWin
        tStop = nSamples(t+1);
    else
        tStop = Inf;
    end
    idxSamples(:,t) = cellfun(@(x) x>=tStart & x<tStop, patData(:,1),'UniformOutput',false);
end

%=== 'low' selects the minimum, anything else the maximum
useMin = strcmp(type,'low');

%=== Calculate for each different split (e.g. every 24*60 minutes..)
for t=1:nWin
    %=== For each time window, extract data relevant
    winNames = cellfun(@(x,y) x(y), patData(:,2), idxSamples(:,t),'UniformOutput',false); % feature names
    winVals  = cellfun(@(x,y) x(y), patData(:,3), idxSamples(:,t),'UniformOutput',false); % feature data

    %=== Then scan for each individual variable
    for v=1:nVar
        %=== Extract indices of the variable desired
        isVar = cellfun(@(names) strcmp(names,tempVars{v}), winNames,'UniformOutput',false);

        %=== Values of each occurrence of that variable, as a row vector
        varData = cellfun(@(x,m) reshape(x(m),1,[]), winVals, isVar,'UniformOutput',false);

        %=== Replace each empty cell with NaN, the missing flag
        varData(cellfun(@isempty, varData)) = {NaN};

        %=== Column in raw: fixed variables first, then window by window
        idxData = (t-1)*nVar+v+numel(fixedVars);

        %=== *** SPECIAL CASE ***
        if strcmp(tempVars{v},'Urine')
            %=== Urine values should be summed
            varData = cellfun(@(x) sum(x,2), varData,'UniformOutput',false);
        end

        %=== Take the extreme value as data to add to raw
        if useMin
            raw(:,idxData) = cellfun(@(x) min(x,[],2), varData);
        else
            raw(:,idxData) = cellfun(@(x) max(x,[],2), varData);
        end
    end
end

end
function [x] = removeFirstWeight(x,y)
%REMOVEFIRSTWEIGHT Logical mask that is true only at index y
%   Discards the contents of x and returns an all-false logical array of
%   the same size with the element(s) at index y set to true. An empty y
%   yields an all-false mask.
mask = false(size(x));
mask(y) = true;
x = mask;
end