function [Y,ndx,dbg] = natsort(X,rgx,varargin)
% Natural-order / alphanumeric sort strings or character vectors or categorical.
%
% (c) 2012-2021 Stephen Cobeldick
%
% Sorts text by character code and by number value. By default matches
% integer substrings and performs a case-insensitive ascending sort.
% Options select other number formats, sort order, case sensitivity, etc.
%
%%% Example:
% >> X = {'x2', 'x10', 'x1'};
% >> sort(X) % wrong number order
% ans = 'x1' 'x10' 'x2'
% >> natsort(X)
% ans = 'x1' 'x2' 'x10'
%
%%% Syntax:
% Y = natsort(X)
% Y = natsort(X,rgx)
% Y = natsort(X,rgx,<options>)
% [Y,ndx,dbg] = natsort(X,...)
%
% To sort any file-names or folder-names use NATSORTFILES (File Exchange 47434)
% To sort the rows of a string/cell array use NATSORTROWS (File Exchange 47433)
%
%% Number Substrings %%
%
% By default consecutive digit characters are interpreted as an integer.
% Specifying the optional regular expression pattern allows the numbers to
% include a +/- sign, decimal digits, exponent E-notation, quantifiers,
% or look-around matching. For information on defining regular expressions:
% <http://www.mathworks.com/help/matlab/matlab_prog/regular-expressions.html>
%
% The number substrings are parsed by SSCANF into numeric values, using
% either the **default format '%f' or the user-supplied format specifier.
%
% This table shows examples of some regular expression patterns for common
% notations and ways of writing numbers, together with suitable SSCANF formats:
%
% Regular | Number Substring | Number Substring | SSCANF
% Expression: | Match Examples: | Match Description: | Format Specifier:
% ==============|==================|===============================|==================
% ** \d+ | 0, 123, 4, 56789 | unsigned integer | %f %i %u %lu
% --------------|------------------|-------------------------------|------------------
% [-+]?\d+ | +1, 23, -45, 678 | integer with optional +/- sign| %f %i %d %ld
% --------------|------------------|-------------------------------|------------------
% \d+\.?\d* | 012, 3.45, 678.9 | integer or decimal | %f
% (\d+|Inf|NaN) | 123, 4, NaN, Inf | integer, Inf, or NaN | %f
% \d+\.\d+E\d+ | 0.123e4, 5.67e08 | exponential notation | %f
% --------------|------------------|-------------------------------|------------------
% 0X[0-9A-F]+ | 0X0, 0X3E7, 0XFF | hexadecimal notation & prefix | %x %i
% [0-9A-F]+ | 0, 3E7, FF | hexadecimal notation | %x
% --------------|------------------|-------------------------------|------------------
% 0[0-7]+ | 012, 03456, 0700 | octal notation & prefix | %o %i
% [0-7]+ | 12, 3456, 700 | octal notation | %o
% --------------|------------------|-------------------------------|------------------
% 0B[01]+ | 0B1, 0B101, 0B10 | binary notation & prefix | %b (not SSCANF)
% [01]+ | 1, 101, 10 | binary notation | %b (not SSCANF)
% --------------|------------------|-------------------------------|------------------
%
%% Debugging Output Array %%
%
% The third output is a cell array <dbg>, for checking the numbers
% matched by the regular expression <rgx> and converted to numeric
% by the SSCANF format. The rows of <dbg> are linearly indexed from <X>.
%
% >> [~,~,dbg] = natsort(X)
% dbg =
% 'x' [ 2]
% 'x' [10]
% 'x' [ 1]
%
%% Examples %%
%
%%% Multiple integers (e.g. release version numbers):
% >> A = {'v10.6', 'v9.10', 'v9.5', 'v10.10', 'v9.10.20', 'v9.10.8'};
% >> sort(A)
% ans = 'v10.10' 'v10.6' 'v9.10' 'v9.10.20' 'v9.10.8' 'v9.5'
% >> natsort(A)
% ans = 'v9.5' 'v9.10' 'v9.10.8' 'v9.10.20' 'v10.6' 'v10.10'
%
%%% Integer, decimal, NaN, or Inf numbers, possibly with +/- signs:
% >> B = {'test+NaN', 'test11.5', 'test-1.4', 'test', 'test-Inf', 'test+0.3'};
% >> sort(B)
% ans = 'test' 'test+0.3' 'test+NaN' 'test-1.4' 'test-Inf' 'test11.5'
% >> natsort(B, '[-+]?(NaN|Inf|\d+\.?\d*)')
% ans = 'test' 'test-Inf' 'test-1.4' 'test+0.3' 'test11.5' 'test+NaN'
%
%%% Integer or decimal numbers, possibly with an exponent:
% >> C = {'0.56e007', '', '43E-2', '10000', '9.8'};
% >> sort(C)
% ans = '' '0.56e007' '10000' '43E-2' '9.8'
% >> natsort(C, '\d+\.?\d*([eE][-+]?\d+)?')
% ans = '' '43E-2' '9.8' '10000' '0.56e007'
%
%%% Hexadecimal numbers (with '0X' prefix):
% >> D = {'a0X7C4z', 'a0X5z', 'a0X18z', 'a0XFz'};
% >> sort(D)
% ans = 'a0X18z' 'a0X5z' 'a0X7C4z' 'a0XFz'
% >> natsort(D, '0X[0-9A-F]+', '%i')
% ans = 'a0X5z' 'a0XFz' 'a0X18z' 'a0X7C4z'
%
%%% Binary numbers:
% >> E = {'a11111000100z', 'a101z', 'a000000000011000z', 'a1111z'};
% >> sort(E)
% ans = 'a000000000011000z' 'a101z' 'a11111000100z' 'a1111z'
% >> natsort(E, '[01]+', '%b')
% ans = 'a101z' 'a1111z' 'a000000000011000z' 'a11111000100z'
%
%%% Case sensitivity:
% >> F = {'a2', 'A20', 'A1', 'a10', 'A2', 'a1'};
% >> natsort(F, [], 'ignorecase') % default
% ans = 'A1' 'a1' 'a2' 'A2' 'a10' 'A20'
% >> natsort(F, [], 'matchcase')
% ans = 'A1' 'A2' 'A20' 'a1' 'a2' 'a10'
%
%%% Sort order:
% >> G = {'2', 'a', '', '3', 'B', '1'};
% >> natsort(G, [], 'ascend') % default
% ans = '' '1' '2' '3' 'a' 'B'
% >> natsort(G, [], 'descend')
% ans = 'B' 'a' '3' '2' '1' ''
% >> natsort(G, [], 'num<char') % default
% ans = '' '1' '2' '3' 'a' 'B'
% >> natsort(G, [], 'char<num')
% ans = '' 'a' 'B' '1' '2' '3'
%
%%% UINT64 numbers (with full precision):
% >> natsort({'a18446744073709551615z', 'a18446744073709551614z'}, [], '%lu')
% ans = 'a18446744073709551614z' 'a18446744073709551615z'
%
%% Input and Output Arguments %%
%
%%% Inputs (**==default):
% X = Array to be sorted into alphanumeric order. Can be a
% string array, or a cell array of char vectors, or a categorical
% array, or any other array type which can be converted by CELLSTR.
% rgx = Regular expression to match number substrings, '\d+'**
% = [] uses the default regular expression.
% <options> can be entered in any order, as many as required:
% = Sort direction: 'descend'/'ascend'**
% = NaN/number order: 'NaN<num'/'num<NaN'**
% = Character/number order: 'char<num'/'num<char'**
% = Character case handling: 'matchcase'/'ignorecase'**
% = SSCANF conversion format: '%x', '%li', '%b', '%f'**, etc.
%
%%% Outputs:
% Y = Array X sorted into alphanumeric order.
% ndx = NumericArray, such that Y = X(ndx). The same size as X.
% dbg = CellArray of the parsed characters and number values.
% Each row corresponds to one input element, linear-indexed from X.
%
% See also SORT NATSORTFILES NATSORTROWS CELLSTR REGEXP IREGEXP SSCANF
%% Input Wrangling %%
%
fun = @(c)cellfun('isclass',c,'char') & cellfun('size',c,1)<2 & cellfun('ndims',c)<3;
%
if iscell(X)
assert(all(fun(X(:))),...
'SC:natsort:X:CellInvalidContent',...
'First input <X> cell array must contain only character vectors.')
Y = X(:);
elseif ischar(X) % Convert char matrix:
Y = cellstr(X);
else % Convert string, categorical, datetime, etc.:
Y = cellstr(X(:));
end
%
if nargin<2 || isnumeric(rgx) && isequal(rgx,[])
rgx = '\d+';
elseif ischar(rgx)
assert(ndims(rgx)<3 && size(rgx,1)==1,...
'SC:natsort:rgx:NotCharVector',...
'Second input <rgx> character vector must have size 1xN.') %#ok<ISMAT>
nsChkRgx(rgx)
else
rgx = ns1s2c(rgx);
assert(ischar(rgx),...
'SC:natsort:rgx:InvalidType',...
'Second input <rgx> must be a character vector or a string scalar.')
nsChkRgx(rgx)
end
%
varargin = cellfun(@ns1s2c, varargin, 'uni',false);
%
assert(all(fun(varargin)),...
'SC:natsort:option:InvalidType',...
'All optional arguments must be character vectors or string scalars.')
%
% Character case:
ccm = strcmpi(varargin,'matchcase');
ccx = strcmpi(varargin,'ignorecase')|ccm;
% Sort direction:
sdd = strcmpi(varargin,'descend');
sdx = strcmpi(varargin,'ascend')|sdd;
% Char/num order:
orb = strcmpi(varargin,'char<num');
orx = strcmpi(varargin,'num<char')|orb;
% NaN/num order:
nab = strcmpi(varargin,'NaN<num');
nax = strcmpi(varargin,'num<NaN')|nab;
% SSCANF format:
sfx = ~cellfun('isempty',regexp(varargin,'^%([bdiuoxfeg]|l[diuox])$'));
%
nsAssert(1,varargin,orx,'CharNumOrder', 'char<->num')
nsAssert(1,varargin,nax, 'NanNumOrder', 'NaN<->num')
nsAssert(1,varargin,sdx,'SortDirection','Sort direction')
nsAssert(1,varargin,sfx,'sscanfFormat', 'SSCANF format')
nsAssert(0,varargin,~(ccx|sdx|orx|nax|sfx),'UnusedOption')
%
% SSCANF format:
if nnz(sfx)
fmt = varargin{sfx};
else
fmt = '%f';
end
%
%% Identify and Convert Numbers %%
%
[nbr,spl] = regexpi(Y(:),rgx, 'match','split', varargin{ccx});
%
if numel(nbr)
tmp = [nbr{:}];
if strcmp(fmt,'%b')
tmp = regexprep(tmp,'^0[Bb]','');
vec = cellfun(@(s)pow2(numel(s)-1:-1:0)*sscanf(s,'%1d'),tmp);
else
vec = sscanf(sprintf(' %s',tmp{:}),fmt);
end
assert(numel(vec)==numel(tmp),...
'SC:natsort:sscanf:TooManyValues',...
'The %s format must return one value for each input number.',fmt)
else
vec = [];
end
%
%% Allocate Data %%
%
% Determine lengths:
nmx = numel(Y);
lnn = cellfun('length',nbr);
lns = cellfun('length',spl);
mxs = max(lns);
%
% Allocate data:
idn = logical(bsxfun(@le,1:mxs,lnn).'); % TRANSPOSE bug loses type (R2013b)
ids = logical(bsxfun(@le,1:mxs,lns).'); % TRANSPOSE bug loses type (R2013b)
arn = zeros(mxs,nmx,class(vec));
ars = cell(mxs,nmx);
ars(:) = {''};
ars(ids) = [spl{:}];
arn(idn) = vec;
%
%% Debugging Array %%
%
if nargout>2
mxw = 0;
for k = 1:nmx
mxw = max(mxw,numel(nbr{k})+nnz(~cellfun('isempty',spl{k})));
end
dbg = cell(nmx,mxw);
for k = 1:nmx
tmp = spl{k};
tmp(2,1:end-1) = num2cell(arn(idn(:,k),k));
tmp(cellfun('isempty',tmp)) = [];
dbg(k,1:numel(tmp)) = tmp;
end
end
%
%% Sort Columns %%
%
if ~any(ccm) % ignorecase
ars = lower(ars);
end
%
if any(orb) % char<num
% Determine max character code:
mxc = 'X';
tmp = warning('off','all');
mxc(1) = Inf;
warning(tmp)
mxc(mxc==0) = 255; % Octave
% Append character code to split text:
for k = reshape(find(idn),1,[])
ars{k}(1,end+1) = mxc;
end
end
%
idn(isnan(arn)) = ~any(nab); % NaN<num
%
if any(sdd) % descending
[~,ndx] = sort(nsGroup(ars(mxs,:)),'descend');
for k = mxs-1:-1:1
[~,idx] = sort(arn(k,ndx),'descend');
ndx = ndx(idx);
[~,idx] = sort(idn(k,ndx),'descend');
ndx = ndx(idx);
[~,idx] = sort(nsGroup(ars(k,ndx)),'descend');
ndx = ndx(idx);
end
else % ascending
[~,ndx] = sort(ars(mxs,:)); % ascend
for k = mxs-1:-1:1
[~,idx] = sort(arn(k,ndx),'ascend');
ndx = ndx(idx);
[~,idx] = sort(idn(k,ndx),'ascend');
ndx = ndx(idx);
[~,idx] = sort(ars(k,ndx)); % ascend
ndx = ndx(idx);
end
end
%
%% Outputs %%
%
if ischar(X)
ndx = ndx(:);
Y = X(ndx,:);
else
ndx = reshape(ndx,size(X));
Y = X(ndx);
end
%
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%natsort
function nsChkRgx(rgx)
tmp = '^((match|ignore)case|(de|a)scend|(char|nan|num)[<>](char|nan|num)|%l?[a-z])$';
assert(isempty(regexp(rgx,tmp,'once')),'SC:natsort:rgx:OptionMixUp',...
['Second input <rgx> must be a regular expression that matches numbers.',...
'\nThe provided input "%s" is one the optional arguments (inputs 3+).'],rgx)
if isempty(regexpi('0',rgx,'once'))
warning('SC:natsort:rgx:SanityCheck',...
['Second input <rgx> must be a regular expression that matches numbers.',...
'\nThe provided regular expression "%s" does not match "0", which\n',...
'may be acceptable (e.g. if literals, quantifiers, or lookarounds are used).'],rgx)
end
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%nsChkRgx
function arr = ns1s2c(arr)
% If scalar string then extract the character vector, otherwise data is unchanged.
if isa(arr,'string') && isscalar(arr)
arr = arr{1};
end
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%ns1s2c
function nsAssert(val,inp,idx,ids,varargin)
% Throw an error if an option is overspecified.
if nnz(idx)>val
tmp = {'Unknown input arguments',...
' option may only be specified once. Provided inputs'};
error(sprintf('SC:natsort:option:%sOverspecified',ids),...
'%s:%s',[varargin{:},tmp{1+val}],sprintf('\n''%s''',inp{idx}))
end
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%nsAssert
function grp = nsGroup(vec)
% Groups in a cell array of char vectors, equivalent to [~,~,grp]=unique(vec);
[vec,idx] = sort(vec);
grp = cumsum([true,~strcmp(vec(1:end-1),vec(2:end))]);
grp(idx) = grp;
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%nsGroup