function [Y,ndx,dbg] = natsort(X,rgx,varargin) % Natural-order / alphanumeric sort strings or character vectors or categorical. % % (c) 2012-2021 Stephen Cobeldick % % Sorts text by character code and by number value. By default matches % integer substrings and performs a case-insensitive ascending sort. % Options select other number formats, sort order, case sensitivity, etc. % %%% Example: % >> X = {'x2', 'x10', 'x1'}; % >> sort(X) % wrong number order % ans = 'x1' 'x10' 'x2' % >> natsort(X) % ans = 'x1' 'x2' 'x10' % %%% Syntax: % Y = natsort(X) % Y = natsort(X,rgx) % Y = natsort(X,rgx,) % [Y,ndx,dbg] = natsort(X,...) % % To sort any file-names or folder-names use NATSORTFILES (File Exchange 47434) % To sort the rows of a string/cell array use NATSORTROWS (File Exchange 47433) % %% Number Substrings %% % % By default consecutive digit characters are interpreted as an integer. % Specifying the optional regular expression pattern allows the numbers to % include a +/- sign, decimal digits, exponent E-notation, quantifiers, % or look-around matching. For information on defining regular expressions: % % % The number substrings are parsed by SSCANF into numeric values, using % either the **default format '%f' or the user-supplied format specifier. % % This table shows examples of some regular expression patterns for common % notations and ways of writing numbers, together with suitable SSCANF formats: % % Regular | Number Substring | Number Substring | SSCANF % Expression: | Match Examples: | Match Description: | Format Specifier: % ==============|==================|===============================|================== % ** \d+ | 0, 123, 4, 56789 | unsigned integer | %f %i %u %lu % --------------|------------------|-------------------------------|------------------ % [-+]?\d+ | +1, 23, -45, 678 | integer with optional +/- sign| %f %i %d %ld % --------------|------------------|-------------------------------|------------------ % \d+\.?\d* | 012, 3.45, 678.9 | integer or decimal | %f % (\d+|Inf|NaN) | 123, 4, NaN, Inf | integer, Inf, or NaN | %f % \d+\.\d+E\d+ | 0.123e4, 5.67e08 | exponential notation | %f % --------------|------------------|-------------------------------|------------------ % 0X[0-9A-F]+ | 0X0, 0X3E7, 0XFF | hexadecimal notation & prefix | %x %i % [0-9A-F]+ | 0, 3E7, FF | hexadecimal notation | %x % --------------|------------------|-------------------------------|------------------ % 0[0-7]+ | 012, 03456, 0700 | octal notation & prefix | %o %i % [0-7]+ | 12, 3456, 700 | octal notation | %o % --------------|------------------|-------------------------------|------------------ % 0B[01]+ | 0B1, 0B101, 0B10 | binary notation & prefix | %b (not SSCANF) % [01]+ | 1, 101, 10 | binary notation | %b (not SSCANF) % --------------|------------------|-------------------------------|------------------ % %% Debugging Output Array %% % % The third output is a cell array , for checking the numbers % matched by the regular expression and converted to numeric % by the SSCANF format. The rows of are linearly indexed from . % % >> [~,~,dbg] = natsort(X) % dbg = % 'x' [ 2] % 'x' [10] % 'x' [ 1] % %% Examples %% % %%% Multiple integers (e.g. release version numbers): % >> A = {'v10.6', 'v9.10', 'v9.5', 'v10.10', 'v9.10.20', 'v9.10.8'}; % >> sort(A) % ans = 'v10.10' 'v10.6' 'v9.10' 'v9.10.20' 'v9.10.8' 'v9.5' % >> natsort(A) % ans = 'v9.5' 'v9.10' 'v9.10.8' 'v9.10.20' 'v10.6' 'v10.10' % %%% Integer, decimal, NaN, or Inf numbers, possibly with +/- signs: % >> B = {'test+NaN', 'test11.5', 'test-1.4', 'test', 'test-Inf', 'test+0.3'}; % >> sort(B) % ans = 'test' 'test+0.3' 'test+NaN' 'test-1.4' 'test-Inf' 'test11.5' % >> natsort(B, '[-+]?(NaN|Inf|\d+\.?\d*)') % ans = 'test' 'test-Inf' 'test-1.4' 'test+0.3' 'test11.5' 'test+NaN' % %%% Integer or decimal numbers, possibly with an exponent: % >> C = {'0.56e007', '', '43E-2', '10000', '9.8'}; % >> sort(C) % ans = '' '0.56e007' '10000' '43E-2' '9.8' % >> natsort(C, '\d+\.?\d*([eE][-+]?\d+)?') % ans = '' '43E-2' '9.8' '10000' '0.56e007' % %%% Hexadecimal numbers (with '0X' prefix): % >> D = {'a0X7C4z', 'a0X5z', 'a0X18z', 'a0XFz'}; % >> sort(D) % ans = 'a0X18z' 'a0X5z' 'a0X7C4z' 'a0XFz' % >> natsort(D, '0X[0-9A-F]+', '%i') % ans = 'a0X5z' 'a0XFz' 'a0X18z' 'a0X7C4z' % %%% Binary numbers: % >> E = {'a11111000100z', 'a101z', 'a000000000011000z', 'a1111z'}; % >> sort(E) % ans = 'a000000000011000z' 'a101z' 'a11111000100z' 'a1111z' % >> natsort(E, '[01]+', '%b') % ans = 'a101z' 'a1111z' 'a000000000011000z' 'a11111000100z' % %%% Case sensitivity: % >> F = {'a2', 'A20', 'A1', 'a10', 'A2', 'a1'}; % >> natsort(F, [], 'ignorecase') % default % ans = 'A1' 'a1' 'a2' 'A2' 'a10' 'A20' % >> natsort(F, [], 'matchcase') % ans = 'A1' 'A2' 'A20' 'a1' 'a2' 'a10' % %%% Sort order: % >> G = {'2', 'a', '', '3', 'B', '1'}; % >> natsort(G, [], 'ascend') % default % ans = '' '1' '2' '3' 'a' 'B' % >> natsort(G, [], 'descend') % ans = 'B' 'a' '3' '2' '1' '' % >> natsort(G, [], 'num> natsort(G, [], 'char> natsort({'a18446744073709551615z', 'a18446744073709551614z'}, [], '%lu') % ans = 'a18446744073709551614z' 'a18446744073709551615z' % %% Input and Output Arguments %% % %%% Inputs (**==default): % X = Array to be sorted into alphanumeric order. Can be a % string array, or a cell array of char vectors, or a categorical % array, or any other array type which can be converted by CELLSTR. % rgx = Regular expression to match number substrings, '\d+'** % = [] uses the default regular expression. % can be entered in any order, as many as required: % = Sort direction: 'descend'/'ascend'** % = NaN/number order: 'NaN cell array must contain only character vectors.') Y = X(:); elseif ischar(X) % Convert char matrix: Y = cellstr(X); else % Convert string, categorical, datetime, etc.: Y = cellstr(X(:)); end % if nargin<2 || isnumeric(rgx) && isequal(rgx,[]) rgx = '\d+'; elseif ischar(rgx) assert(ndims(rgx)<3 && size(rgx,1)==1,... 'SC:natsort:rgx:NotCharVector',... 'Second input character vector must have size 1xN.') %#ok nsChkRgx(rgx) else rgx = ns1s2c(rgx); assert(ischar(rgx),... 'SC:natsort:rgx:InvalidType',... 'Second input must be a character vector or a string scalar.') nsChkRgx(rgx) end % varargin = cellfun(@ns1s2c, varargin, 'uni',false); % assert(all(fun(varargin)),... 'SC:natsort:option:InvalidType',... 'All optional arguments must be character vectors or string scalars.') % % Character case: ccm = strcmpi(varargin,'matchcase'); ccx = strcmpi(varargin,'ignorecase')|ccm; % Sort direction: sdd = strcmpi(varargin,'descend'); sdx = strcmpi(varargin,'ascend')|sdd; % Char/num order: orb = strcmpi(varargin,'charnum') nsAssert(1,varargin,nax, 'NanNumOrder', 'NaN<->num') nsAssert(1,varargin,sdx,'SortDirection','Sort direction') nsAssert(1,varargin,sfx,'sscanfFormat', 'SSCANF format') nsAssert(0,varargin,~(ccx|sdx|orx|nax|sfx),'UnusedOption') % % SSCANF format: if nnz(sfx) fmt = varargin{sfx}; else fmt = '%f'; end % %% Identify and Convert Numbers %% % [nbr,spl] = regexpi(Y(:),rgx, 'match','split', varargin{ccx}); % if numel(nbr) tmp = [nbr{:}]; if strcmp(fmt,'%b') tmp = regexprep(tmp,'^0[Bb]',''); vec = cellfun(@(s)pow2(numel(s)-1:-1:0)*sscanf(s,'%1d'),tmp); else vec = sscanf(sprintf(' %s',tmp{:}),fmt); end assert(numel(vec)==numel(tmp),... 'SC:natsort:sscanf:TooManyValues',... 'The %s format must return one value for each input number.',fmt) else vec = []; end % %% Allocate Data %% % % Determine lengths: nmx = numel(Y); lnn = cellfun('length',nbr); lns = cellfun('length',spl); mxs = max(lns); % % Allocate data: idn = logical(bsxfun(@le,1:mxs,lnn).'); % TRANSPOSE bug loses type (R2013b) ids = logical(bsxfun(@le,1:mxs,lns).'); % TRANSPOSE bug loses type (R2013b) arn = zeros(mxs,nmx,class(vec)); ars = cell(mxs,nmx); ars(:) = {''}; ars(ids) = [spl{:}]; arn(idn) = vec; % %% Debugging Array %% % if nargout>2 mxw = 0; for k = 1:nmx mxw = max(mxw,numel(nbr{k})+nnz(~cellfun('isempty',spl{k}))); end dbg = cell(nmx,mxw); for k = 1:nmx tmp = spl{k}; tmp(2,1:end-1) = num2cell(arn(idn(:,k),k)); tmp(cellfun('isempty',tmp)) = []; dbg(k,1:numel(tmp)) = tmp; end end % %% Sort Columns %% % if ~any(ccm) % ignorecase ars = lower(ars); end % if any(orb) % char must be a regular expression that matches numbers.',... '\nThe provided input "%s" is one the optional arguments (inputs 3+).'],rgx) if isempty(regexpi('0',rgx,'once')) warning('SC:natsort:rgx:SanityCheck',... ['Second input must be a regular expression that matches numbers.',... '\nThe provided regular expression "%s" does not match "0", which\n',... 'may be acceptable (e.g. if literals, quantifiers, or lookarounds are used).'],rgx) end end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%nsChkRgx function arr = ns1s2c(arr) % If scalar string then extract the character vector, otherwise data is unchanged. if isa(arr,'string') && isscalar(arr) arr = arr{1}; end end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%ns1s2c function nsAssert(val,inp,idx,ids,varargin) % Throw an error if an option is overspecified. if nnz(idx)>val tmp = {'Unknown input arguments',... ' option may only be specified once. Provided inputs'}; error(sprintf('SC:natsort:option:%sOverspecified',ids),... '%s:%s',[varargin{:},tmp{1+val}],sprintf('\n''%s''',inp{idx})) end end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%nsAssert function grp = nsGroup(vec) % Groups in a cell array of char vectors, equivalent to [~,~,grp]=unique(vec); [vec,idx] = sort(vec); grp = cumsum([true,~strcmp(vec(1:end-1),vec(2:end))]); grp(idx) = grp; end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%nsGroup