;----------------------------------------------------------------------------
; $Id: complexity_2.pro,v 1.14 2002/03/13 01:48:37 johnny Exp johnny $
;+
; NAME:
; COMPLEXITY_2
;
; PURPOSE:
; Calculate the second-order complexity, as defined by Elsner and Tsonis
; (1993), for finite word length length_n. Full reference is below in
; the "Notes" section.
;
; CATEGORY:
; Statistics.
;
; CALLING SEQUENCE:
; Result = COMPLEXITY_2(in_length_n, in_sequence)
;
; INPUTS:
; in_length_n: Word length for which to calculate the second-order
; complexity for finite word length (scalar long).
; Variable unchanged by routine.
;
; in_sequence: Input sequence vector. Each element is of value '0'
; or '1' (string vector). Variable unchanged by routine.
;
; KEYWORD PARAMETERS:
; LOGBASE: The default base of the logarithmic operator that is
; used in the equation to calculate second-order complexity
; is e. To choose a different base, set the Logbase key-
; word to that new base (any integer or floating type).
; Input variable. Unchanged.
;
; NIF: Number of irreducible forbidden words of length length_n.
; Scalar long. Output variable. Created.
;
; VERBOSE: If keyword is set true, additional diagnostic messages
; are printed to stdout.
;
; OUTPUTS:
; Result: Calculated second-order complexity (scalar float).
;
; FILE DEVICE I/O:
; None.
;
; COMMON BLOCKS:
; None.
;
; EXAMPLE:
; Enter in a simple data sequence, run routine, and print results:
; data = ['0','0','1','0','1','0','1']
; coeff = COMPLEXITY_2(3, data, Logbase=2., Nif=num)
; PRINT, Format='("C(2) = ",g0,". N(if) = ",g0)', coeff, num
; IDL prints:
; C(2) = 0.333333. N(if) = 2
;
; MODIFICATION HISTORY:
; - 1 Feb 2002: Orig. ver. Johnny Lin, CIRES/University of Colorado.
; Email: air_jlin@yahoo.com. Passed minimally passably adequate tests.
; - 28 Feb 2002: Speed up by removing CMREPLICATE from within a loop.
; Passed passably adequate tests.
; - 5 Mar 2002: Speed up by checking only one word length at a time.
; Passed passably adequate tests.
; - 11 Mar 2002: Change method of checking only one word length at a time
; to include finding the forbidden words too, which is critical to saving
; memory for long word lengths. Passed passably adequate tests.
; - 12 Mar 2002: Return to the methodology of the 1 Feb 2002 version, but
; alter to remove CMREPLICATE calls in such a way that enables memory
; use to be reasonable at larger word lengths (versions from 28 Feb to
; 11 Mar 2002 all had heavy memory use problems). Passed passably adequate
; tests.
;
; NOTES:
; - Written for IDL 5.5. May work with version 5.4 (haven't tested yet).
; - Reference: Elsner, J. B. and A. A. Tsonis (1993), "Complexity and
; predictability of hourly precipitation," J. Atmos. Sci., Vol. 50,
; No. 3, pp. 400-405.
; - All keyword parameters are optional unless otherwise stated.
; - No procedures called with _Extra keyword invoked.
; - User-written procedures called: CMREPLICATE (by Craig B. Markwardt),
; FIND_FORBID, FIND_WORDS
;-
; Copyright (c) 2002 Johnny Lin. For licensing, distribution conditions,
; and contact information, see http://www.johnny-lin.com/lib.html.
;----------------------------------------------------------------------------
FUNCTION COMPLEXITY_2, in_length_n, in_sequence $
, LOGBASE = in_logbase $
, NIF = out_nif $
, VERBOSE = verbose $
, _EXTRA = extra
; -------------------- Error Check and Parameter Setting --------------------
ON_ERROR, 0
FORWARD_FUNCTION CMREPLICATE, FIND_FORBID, FIND_WORDS
length_n = LONG(in_length_n) ;- protect small input param.(s)
if (N_ELEMENTS(in_logbase) ne 0) then begin
logbase = in_logbase
if (N_ELEMENTS(logbase) ne 1) then $
MESSAGE, 'error--bad logbase'
if (logbase lt 1.0) then $
MESSAGE, 'error--logbase lt 1'
endif
NS = N_ELEMENTS(in_sequence) ;- no. of elem. in in_sequence
if (SIZE(in_sequence, /Type) ne 7) then $ ;- error check other inputs
MESSAGE, 'error--bad input sequence type'
if (SIZE(in_sequence, /N_Dimensions) ne 1) then $
MESSAGE, 'error--input sequence not a vector'
if ((length_n le 0) or (length_n gt NS)) then $
MESSAGE, 'error--length_n wrong'
if (length_n gt 30L) then $
MESSAGE, 'error--word length too big'
tmp = WHERE((in_sequence ne '0') and (in_sequence ne '1'), count)
if (count ne 0) then $
MESSAGE, 'error--bad values in input sequence'
; -------------- Find All Forbidden Words of Length length_n ----------------
;
; Selected variable key (for these described variables, assume scope is
; entire routine, unless the variable name contains "tmp"):
; wfln_count Number of words in words_forbid_ln (a key output
; variable from this section).
; words_forbid_ln Forbidden words of length length_n (a key output
; variable from this section).
; words_found_ln Non-forbidden words of length length_n.
PRINT, Format='("COMPLEXITY_2: Finding all forbidden words of ' $
+ 'length ", g0)', length_n
words_found_ln = FIND_WORDS(length_n, in_sequence)
words_forbid_ln = FIND_FORBID( words_found_ln $
, Count=wfln_count, Verbose=verbose )
; ----------- Find All Forbidden Words of Length [1,length_n-1] -------------
;
; Selected variable key (for these described variables, assume scope is
; entire routine, unless the variable name contains "tmp"):
; n_forbid_all Number of all forbidden sub-words (sub-words are words
; that have length less than length_n). This is the 1st
; dimension of the indexing (forbid_cmp_ind and
; found_cmp_ind) variables. A key output of this
; section.
; tmp_forbid Forbidden words in the entire input sequence for
; length ilw.
; tmp_found Non-forbidden words in the entire input sequence for
; length ilw.
; words_forbid_all Forbidden sub-words of the entire input sequence,
; sub-words meaning all words less than length length_n.
; NB: This is a 1-D array, each element of which is a
; word. This differs from most other array of words,
; which are 2-D and in which each row is a word. This
; is done b/c words_forbid_all can contain words of
; differing lengths (this variable is a key output from
; this section).
if (wfln_count ne 0L) then begin ;* skip sect. if no forbidd. words (begin)
PRINT, 'COMPLEXITY_2: Finding all forbidden sub-words'
for ilw=1L,(length_n-1L) do begin ;- consid. all sub-words (begin)
tmp_found = FIND_WORDS(ilw, in_sequence)
tmp_forbid = FIND_FORBID(tmp_found, Count=count, Verbose=verbose)
if (count ne 0) then begin
if (N_ELEMENTS(words_forbid_all) eq 0L) then $
words_forbid_all = STRJOIN(tmp_forbid) $
else $
words_forbid_all = [ words_forbid_all, STRJOIN(tmp_forbid) ]
endif
endfor ;- consid. all sub-words (end)
n_forbid_all = N_ELEMENTS(words_forbid_all) ;- set n_forbid_all
tmp_forbid = 0 ;- dealloc. to save memory
tmp_found = 0
endif ;* skip sect. if no forbidd. words (end)
; --------- Word Count to Help Us Create the Proper Indexing Arrays ---------
;
; The maximum value of all sub-words for each of each of the forbidden words
; of length_n in words_forbid_ln yields the 2nd dimension for the indexing
; array (forbid_cmp_ind and found_cmp_ind) variables. To get this number
; would result in going through all words twice, when max_n_found_all is
; not that large to begin with. Thus, we calculated offline the value of
; max_n_found_all for 2000 realizations of a length n string of '0' and '1',
; generated from a uniform random variable. The results are:
; n = 7 => max_n_found_all = 20; n = 18 => max_n_found_all = 133
; n = 12 => max_n_found_all = 58; n = 20 => max_n_found_all = 165
; n = 13 => max_n_found_all = 68; n = 25 => max_n_found_all = 260
; n = 14 => max_n_found_all = 79; n = 29 => max_n_found_all = 338
; n = 15 => max_n_found_all = 91; n = 30 => max_n_found_all = 379
; The standard deviation for the n=30 case is 10.14. In this section a
; case statement is used to set max_n_found_all to the max. value in the
; length_n range, plus 10.
;
; Selected variable key:
; max_n_found_all The maximum value of the number of sub-words for any
; forbidden word. The key output of this section.
if (wfln_count ne 0L) then begin ;* skip sect. if no forbidd. words (begin)
case 1 of
(length_n le 7) : max_n_found_all = 30L
(length_n ge 8) and (length_n le 12) : max_n_found_all = 68L
(length_n ge 13) and (length_n le 15) : max_n_found_all = 101L
(length_n ge 16) and (length_n le 18) : max_n_found_all = 143L
(length_n ge 19) and (length_n le 20) : max_n_found_all = 175L
(length_n ge 21) and (length_n le 25) : max_n_found_all = 270L
(length_n ge 26) and (length_n le 30) : max_n_found_all = 389L
else : MESSAGE, 'error--bad length_n'
endcase
endif ;* skip sect. if no forbidd. words (end)
; --------------------------- Create Indice Arrays --------------------------
;
; Here we create the indice arrays that will be used to extract the needed
; information for the comparison arrays (found_cmp and forbid_cmp). Checks
; are made to try and conserve memory, because these indice arrays tend to
; be large. The indice arrays only need to be calculated once in this
; routine.
;
; Selected variable key: (for these described variables, unless otherwise
; noted, assume scope is entire routine):
; int_flag Flag says what kind of integers will be used as array
; indices. Has values of either 'short' or 'long'.
; forbid_cmp_ind The indices which will be used to put the elements of
; words_forbid_ilw into the shape of forbid_cmp. Dim-
; ensioned (n_forbid_all, max_n_found_all) with each row
; equal to max_ind (i.e. 0,1,2,3,...).
; found_cmp_ind The indices which will be used to put the elements of
; words_found_cw into the shape of found_cmp. Dimen-
; sioned (n_forbid_all, max_n_found_all) with each col-
; umn equal to max_ind (i.e. 0,1,2,3,...).
; The descriptions of some of the variables referenced in the variable key
; above are found in the next section. If there are no forbidden words in
; the sequence (i.e. if wfln_count equals 0), or if either n_forbid_all or
; max_n_found_all are zero, then this section doesn't need to be run.
if (wfln_count ne 0L) then begin ;- set flag such that
execute_section = (n_forbid_all ne 0) and $ ; if true, execute the
(max_n_found_all ne 0) ; section below
endif else begin
execute_section = 0
endelse
if execute_section then begin ;* skip section if execute_section
; is false (begin)
if ( (n_forbid_all lt 32700) and $ ;- int_flag gets 'short'
(max_n_found_all lt 32700) ) then begin ; if any one dimension is
int_flag = 'short' ; less than c. max. 16-bit
endif else begin ; integer
int_flag = 'long'
PRINT, 'COMPLEXITY_2: Warning: Routine will use a lot of memory.'
endelse
if ((n_forbid_all ne 0) and (max_n_found_all ne 0)) then begin
case int_flag of
'short': max_ind = INDGEN( MAX([n_forbid_all $
, max_n_found_all]) )
'long': max_ind = LINDGEN( MAX([n_forbid_all $
, max_n_found_all]) )
else: MESSAGE, 'error--bad int_flag'
endcase
forbid_cmp_ind = CMREPLICATE( max_ind[0L:n_forbid_all-1L] $
, max_n_found_all )
found_cmp_ind = TRANSPOSE( $
CMREPLICATE( max_ind[0L:max_n_found_all-1L] $
, n_forbid_all ) )
max_ind = 0L ;- dealloc. to save memory
endif
endif ;* skip section if execute_section
; is false (end)
; -------------------- Irreducible Forbidden Word Count ---------------------
;
; Algorithm: For each forbidden word of length length_n (words_forbid_ln),
; determine whether it is irreducible. To determine whether it is irredu-
; cible, for each forbidden word, see whether it contains a shorter for-
; bidden word (i.e. any shorter forbidden word as defined by the original
; input sequence). If it does, the original forbidden word is reducible.
; If it does not, it is irreducible. A unique case is if there are no
; forbidden sub-words in the original input sequence. In that case, any
; forbidden word of length length_n should be irreducible, since no for-
; bidden sub-words exist that can make up the length length_n word. (I
; think this is true, but I'm not entirely sure, so a warning is printed
; if this condition exists.)
;
; Another way to think about it is that every element in forbid_cmp and
; found_cmp (at every word length) must be not equal in order to increment
; NIFLN.
;
; If there are no forbidden words of length length_n, then there are
; obviously also no irreducible forbidden words.
;
; Selected variable key (for these described variables, assume scope is
; entire routine, unless the variable name contains "tmp"):
; check_word A forbidden word of length length_n being examined to
; see if it is irreducible.
; NIFLN Number of irreducible forbidden words of length
; length_n (this is the key output variable from this
; section).
; forbid_cmp 2-D matrix expansion of words_forbid_all, used for
; comparing with all possibilities of match with the
; values in words_found_cw. Dimensioned (n_forbid_all,
; n_found).
; found_cmp 2-D matrix expansion of words_found_cw, used for
; comparing with all possibilities of match with the
; values in words_forbid_all. Dimensioned
; (n_forbid_all, n_found).
; words_found_cw Non-forbidden sub-words of the sequence defined by
; check_word. NB: This is a 1-D array, each element
; of which is a word. This differs from most other
; array of words, which are 2-D and in which each row
; is a word. This is done b/c words_found_cw can
; contain words of differing lengths.
; tmp_found Non-forbidden words in the sequence defined by
; check_word, for length ilw.
if (wfln_count eq 0L) then begin ;* case 1 (begin): no forbidd. words
; => no irreduc. forbidd. words
NIFLN = 0L
endif $ ;* case 1 (end)
else begin ;* case 2 (begin): there are forbidd.
; words of length length_n
NIFLN = 0L ;- initialize NIFLN
for i=0L,wfln_count-1L do begin ;- consid. each forbidden word (begin)
PRINT, Format='("COMPLEXITY_2: Analyzing word number: "' $
+ ', (g0), " of ", (g0))', i+1L, wfln_count
check_word = REFORM(words_forbid_ln[*,i])
for ilw=1L,(length_n-1L) do begin ;+ consid. all sub-words (begin)
tmp_found = FIND_WORDS(ilw, check_word) ;> find all
if (N_ELEMENTS(words_found_cw) eq 0L) then $ ; sub-words
words_found_cw = STRJOIN(tmp_found) $ ; in check_word
else $
words_found_cw = [ words_found_cw $
, STRJOIN(tmp_found) ]
endfor ;+ consid. all sub-words (end)
;+ check if any sub-word of check_word is forbidden in the entire input
; sequence. if n_found is greater than max_n_found_all, then the *_cmp
; arrays are calculated using CMREPLICATE, which should not happen very
; often (because if it does, the program slows down by a lot).
n_found = N_ELEMENTS(words_found_cw)
if ((n_forbid_all ne 0L) and (n_found ne 0L)) then begin
if (n_found le max_n_found_all) then begin
forbid_cmp = $
words_forbid_all[ $
forbid_cmp_ind[ 0L:n_forbid_all-1L, 0L:n_found-1L ] ]
found_cmp = $
words_found_cw[ $
found_cmp_ind[ 0L:n_forbid_all-1L, 0L:n_found-1L ] ]
endif else begin
PRINT, 'COMPLEXITY_2: Warning: Preset ' $
+ 'max_n_found_all too small. Compensating.'
forbid_cmp = CMREPLICATE( words_forbid_all, n_found )
found_cmp = TRANSPOSE( $
CMREPLICATE( words_found_cw, n_forbid_all ) )
endelse
eqflag = ARRAY_EQUAL(found_cmp ne forbid_cmp, 1B)
if (eqflag eq 1) then begin
NIFLN = NIFLN + 1L
endif
tmp = TEMPORARY(eqflag) ;> dealloc. as a safety check
forbid_cmp = 0 ;> dealloc. to save memory
found_cmp = 0
endif else begin
PRINT, 'COMPLEXITY_2: Warning: No forbidden sub-words exist.'
NIFLN = NIFLN + 1L
endelse
tmp = TEMPORARY(words_found_cw) ;+ dealloc. to get ready for the
tmp = 0 ; next check_word
endfor ;- consid. each forbidden word (end)
endelse ;* case 2 (end)
; -------------------------- Calculate Complexity ---------------------------
;
; Notes: complexity_value is the eqn. for second-order complexity, for
; length_n -> infinity. The default (i.e. logbase undefined) uses base e.
; If logbase is defined, complexity_value is calculated using that as the
; logabase as the base. If there are no irreducible forbidden words, the
; second-order complexity equals zero.
if (NIFLN eq 0L) then begin
complexity_value = 0.0 ;- for no irreduc. forbid. words
endif else begin
if (N_ELEMENTS(logbase) eq 0) then $ ;- 2nd-order cmplxty, if
complexity_value = ALOG(NIFLN) / length_n $ ; length_n -> infinity
else $
complexity_value = ALOG(NIFLN) / (ALOG(logbase)*length_n)
endelse
; ----------------------------- Prepare Output ------------------------------
out_nif = LONG(NIFLN) ;- output NIF (optional)
Result = TEMPORARY(complexity_value) ;- return complexity_value
RETURN, Result
END ;=== end of function ===
; ========== end file ==========