;----------------------------------------------------------------------------
; $Id: complexity_2.pro,v 1.14 2002/03/13 01:48:37 johnny Exp johnny $
;+
; NAME:
;   COMPLEXITY_2
;
; PURPOSE:
;   Calculate the second-order complexity, as defined by Elsner and Tsonis
;   (1993), for finite word length length_n.  Full reference is below in 
;   the "Notes" section.
;
; CATEGORY:
;   Statistics.
;
; CALLING SEQUENCE:
;   Result = COMPLEXITY_2(in_length_n, in_sequence)
;
; INPUTS:
;   in_length_n:   Word length for which to calculate the second-order
;                  complexity for finite word length (scalar long).
;                  Variable unchanged by routine.
;
;   in_sequence:   Input sequence vector.  Each element is of value '0' 
;                  or '1' (string vector).  Variable unchanged by routine.
;   
; KEYWORD PARAMETERS:
;   LOGBASE:       The default base of the logarithmic operator that is
;                  used in the equation to calculate second-order complexity
;                  is e.  To choose a different base, set the Logbase key-
;                  word to that new base (any integer or floating type).
;                  Input variable.  Unchanged.
;
;   NIF:           Number of irreducible forbidden words of length length_n.
;                  Scalar long.  Output variable.  Created.
;
;   VERBOSE:       If keyword is set true, additional diagnostic messages 
;                  are printed to stdout.
;
; OUTPUTS:
;   Result:        Calculated second-order complexity (scalar float).
;
; FILE DEVICE I/O:
;   None.
;
; COMMON BLOCKS:
;   None.
;
; EXAMPLE:
;   Enter in a simple data sequence, run routine, and print results:
;       data  = ['0','0','1','0','1','0','1']
;       coeff = COMPLEXITY_2(3, data, Logbase=2., Nif=num)
;       PRINT, Format='("C(2) = ",g0,".  N(if) = ",g0)', coeff, num
;   IDL prints:
;       C(2) = 0.333333.  N(if) = 2
;
; MODIFICATION HISTORY:
; - 1 Feb 2002:  Orig. ver. Johnny Lin, CIRES/University of Colorado.
;   Email:  air_jlin@yahoo.com.  Passed minimally passably adequate tests.
; - 28 Feb 2002:  Speed up by removing CMREPLICATE from within a loop.
;   Passed passably adequate tests.
; - 5 Mar 2002:  Speed up by checking only one word length at a time.
;   Passed passably adequate tests.
; - 11 Mar 2002:  Change method of checking only one word length at a time
;   to include finding the forbidden words too, which is critical to saving
;   memory for long word lengths.  Passed passably adequate tests.
; - 12 Mar 2002:  Return to the methodology of the 1 Feb 2002 version, but
;   alter to remove CMREPLICATE calls in such a way that enables memory
;   use to be reasonable at larger word lengths (versions from 28 Feb to
;   11 Mar 2002 all had heavy memory use problems).  Passed passably adequate
;   tests.
;
; NOTES:
; - Written for IDL 5.5.  May work with version 5.4 (haven't tested yet).
; - Reference:  Elsner, J. B. and A. A. Tsonis (1993), "Complexity and
;   predictability of hourly precipitation," J. Atmos. Sci., Vol. 50,
;   No. 3, pp. 400-405.
; - All keyword parameters are optional unless otherwise stated.
; - No procedures called with _Extra keyword invoked.
; - User-written procedures called:  CMREPLICATE (by Craig B. Markwardt),
;   FIND_FORBID, FIND_WORDS
;-
; Copyright (c) 2002 Johnny Lin.  For licensing, distribution conditions,
; and contact information, see http://www.johnny-lin.com/lib.html.
;----------------------------------------------------------------------------

FUNCTION COMPLEXITY_2, in_length_n, in_sequence  $
                     , LOGBASE = in_logbase  $
                     , NIF     = out_nif  $
                     , VERBOSE = verbose  $
                     , _EXTRA  = extra


; -------------------- Error Check and Parameter Setting --------------------

ON_ERROR, 0

FORWARD_FUNCTION CMREPLICATE, FIND_FORBID, FIND_WORDS

length_n = LONG(in_length_n)                 ;- protect small input param.(s)

if (N_ELEMENTS(in_logbase) ne 0) then begin
   logbase = in_logbase
   if (N_ELEMENTS(logbase) ne 1) then  $
      MESSAGE, 'error--bad logbase'
   if (logbase lt 1.0) then  $
      MESSAGE, 'error--logbase lt 1'
endif

NS = N_ELEMENTS(in_sequence)                 ;- no. of elem. in in_sequence


if (SIZE(in_sequence, /Type) ne 7) then  $   ;- error check other inputs
   MESSAGE, 'error--bad input sequence type'
if (SIZE(in_sequence, /N_Dimensions) ne 1) then  $
   MESSAGE, 'error--input sequence not a vector'
if ((length_n le 0) or (length_n gt NS)) then  $
   MESSAGE, 'error--length_n wrong'
if (length_n gt 30L) then  $
   MESSAGE, 'error--word length too big'


tmp = WHERE((in_sequence ne '0') and (in_sequence ne '1'), count)
if (count ne 0) then  $
   MESSAGE, 'error--bad values in input sequence'


; -------------- Find All Forbidden Words of Length length_n ----------------
;
; Selected variable key (for these described variables, assume scope is 
; entire routine, unless the variable name contains "tmp"):
;   wfln_count          Number of words in words_forbid_ln (a key output
;                       variable from this section).
;   words_forbid_ln     Forbidden words of length length_n (a key output
;                       variable from this section).
;   words_found_ln      Non-forbidden words of length length_n.

PRINT, Format='("COMPLEXITY_2:  Finding all forbidden words of '  $
     + 'length ", g0)', length_n

words_found_ln  = FIND_WORDS(length_n, in_sequence)
words_forbid_ln = FIND_FORBID( words_found_ln  $
                             , Count=wfln_count, Verbose=verbose )


; ----------- Find All Forbidden Words of Length [1,length_n-1] -------------
;
; Selected variable key (for these described variables, assume scope is 
; entire routine, unless the variable name contains "tmp"):
;   n_forbid_all      Number of all forbidden sub-words (sub-words are words 
;                     that have length less than length_n).  This is the 1st
;                     dimension of the indexing (forbid_cmp_ind and 
;                     found_cmp_ind) variables.  A key output of this 
;                     section.
;   tmp_forbid        Forbidden words in the entire input sequence for 
;                     length ilw.
;   tmp_found         Non-forbidden words in the entire input sequence for
;                     length ilw.
;   words_forbid_all  Forbidden sub-words of the entire input sequence, 
;                     sub-words meaning all words less than length length_n.
;                     NB:  This is a 1-D array, each element of which is a 
;                     word.  This differs from most other array of words, 
;                     which are 2-D and in which each row is a word.  This 
;                     is done b/c words_forbid_all can contain words of 
;                     differing lengths (this variable is a key output from
;                     this section).

if (wfln_count ne 0L) then begin  ;* skip sect. if no forbidd. words (begin)

   PRINT, 'COMPLEXITY_2:  Finding all forbidden sub-words'

   for ilw=1L,(length_n-1L) do begin     ;- consid. all sub-words (begin)

       tmp_found  = FIND_WORDS(ilw, in_sequence)
       tmp_forbid = FIND_FORBID(tmp_found, Count=count, Verbose=verbose)
       if (count ne 0) then begin
          if (N_ELEMENTS(words_forbid_all) eq 0L) then  $
             words_forbid_all = STRJOIN(tmp_forbid)  $
          else  $
             words_forbid_all = [ words_forbid_all, STRJOIN(tmp_forbid) ] 
       endif 

   endfor                                ;- consid. all sub-words (end)

   n_forbid_all = N_ELEMENTS(words_forbid_all)    ;- set n_forbid_all

   tmp_forbid = 0                                 ;- dealloc. to save memory
   tmp_found  = 0

endif                             ;* skip sect. if no forbidd. words (end)


; --------- Word Count to Help Us Create the Proper Indexing Arrays ---------
;
; The maximum value of all sub-words for each of each of the forbidden words 
; of length_n in words_forbid_ln yields the 2nd dimension for the indexing
; array (forbid_cmp_ind and found_cmp_ind) variables.  To get this number
; would result in going through all words twice, when max_n_found_all is
; not that large to begin with.  Thus, we calculated offline the value of
; max_n_found_all for 2000 realizations of a length n string of '0' and '1',
; generated from a uniform random variable.  The results are:
;    n =  7 => max_n_found_all = 20; n = 18 => max_n_found_all = 133
;    n = 12 => max_n_found_all = 58; n = 20 => max_n_found_all = 165
;    n = 13 => max_n_found_all = 68; n = 25 => max_n_found_all = 260
;    n = 14 => max_n_found_all = 79; n = 29 => max_n_found_all = 338
;    n = 15 => max_n_found_all = 91; n = 30 => max_n_found_all = 379
; The standard deviation for the n=30 case is 10.14.  In this section a
; case statement is used to set max_n_found_all to the max. value in the
; length_n range, plus 10.
;
; Selected variable key:
;   max_n_found_all   The maximum value of the number of sub-words for any 
;                     forbidden word.  The key output of this section.

if (wfln_count ne 0L) then begin  ;* skip sect. if no forbidd. words (begin)
   case 1 of
      (length_n le  7)                      :  max_n_found_all = 30L
      (length_n ge  8) and (length_n le 12) :  max_n_found_all = 68L
      (length_n ge 13) and (length_n le 15) :  max_n_found_all = 101L
      (length_n ge 16) and (length_n le 18) :  max_n_found_all = 143L
      (length_n ge 19) and (length_n le 20) :  max_n_found_all = 175L
      (length_n ge 21) and (length_n le 25) :  max_n_found_all = 270L
      (length_n ge 26) and (length_n le 30) :  max_n_found_all = 389L
      else                                  :  MESSAGE, 'error--bad length_n'
   endcase
endif                             ;* skip sect. if no forbidd. words (end)


; --------------------------- Create Indice Arrays --------------------------
;
; Here we create the indice arrays that will be used to extract the needed
; information for the comparison arrays (found_cmp and forbid_cmp).  Checks
; are made to try and conserve memory, because these indice arrays tend to
; be large.  The indice arrays only need to be calculated once in this
; routine.
;
; Selected variable key: (for these described variables, unless otherwise
; noted, assume scope is entire routine):
;   int_flag          Flag says what kind of integers will be used as array
;                     indices.  Has values of either 'short' or 'long'.
;   forbid_cmp_ind    The indices which will be used to put the elements of
;                     words_forbid_ilw into the shape of forbid_cmp.  Dim-
;                     ensioned (n_forbid_all, max_n_found_all) with each row
;                     equal to max_ind (i.e. 0,1,2,3,...).
;   found_cmp_ind     The indices which will be used to put the elements of
;                     words_found_cw into the shape of found_cmp.  Dimen-
;                     sioned (n_forbid_all, max_n_found_all) with each col-
;                     umn equal to max_ind (i.e. 0,1,2,3,...).
; The descriptions of some of the variables referenced in the variable key
; above are found in the next section.  If there are no forbidden words in
; the sequence (i.e. if wfln_count equals 0), or if either n_forbid_all or
; max_n_found_all are zero, then this section doesn't need to be run.

if (wfln_count ne 0L) then begin                 ;- set flag such that
   execute_section = (n_forbid_all ne 0) and  $  ;  if true, execute the
                     (max_n_found_all ne 0)      ;  section below
endif else begin
   execute_section = 0
endelse

if execute_section then begin            ;* skip section if execute_section
                                         ;  is false (begin)

   if ( (n_forbid_all lt 32700) and  $           ;- int_flag gets 'short'
        (max_n_found_all lt 32700) ) then begin  ;  if any one dimension is
      int_flag = 'short'                         ;  less than c. max. 16-bit
   endif else begin                              ;  integer
      int_flag = 'long'
      PRINT, 'COMPLEXITY_2:  Warning:  Routine will use a lot of memory.'
   endelse

   if ((n_forbid_all ne 0) and (max_n_found_all ne 0)) then begin
      case int_flag of
           'short':  max_ind = INDGEN(  MAX([n_forbid_all  $
                                     ,  max_n_found_all]) )
           'long':   max_ind = LINDGEN( MAX([n_forbid_all  $
                                      , max_n_found_all]) )
           else:     MESSAGE, 'error--bad int_flag'
      endcase

      forbid_cmp_ind = CMREPLICATE( max_ind[0L:n_forbid_all-1L]  $
                                  , max_n_found_all )
      found_cmp_ind  = TRANSPOSE(  $
                       CMREPLICATE( max_ind[0L:max_n_found_all-1L]  $
                                  , n_forbid_all ) )

      max_ind = 0L          ;- dealloc. to save memory
   endif

endif                                    ;* skip section if execute_section
                                         ;  is false (end)


; -------------------- Irreducible Forbidden Word Count ---------------------
;
; Algorithm:  For each forbidden word of length length_n (words_forbid_ln),
; determine whether it is irreducible.  To determine whether it is irredu-
; cible, for each forbidden word, see whether it contains a shorter for-
; bidden word (i.e. any shorter forbidden word as defined by the original
; input sequence).  If it does, the original forbidden word is reducible.  
; If it does not, it is irreducible.  A unique case is if there are no
; forbidden sub-words in the original input sequence.  In that case, any
; forbidden word of length length_n should be irreducible, since no for-
; bidden sub-words exist that can make up the length length_n word.  (I
; think this is true, but I'm not entirely sure, so a warning is printed
; if this condition exists.)
;
; Another way to think about it is that every element in forbid_cmp and
; found_cmp (at every word length) must be not equal in order to increment
; NIFLN.
;
; If there are no forbidden words of length length_n, then there are
; obviously also no irreducible forbidden words.
;
; Selected variable key (for these described variables, assume scope is 
; entire routine, unless the variable name contains "tmp"):
;   check_word       A forbidden word of length length_n being examined to 
;                    see if it is irreducible.
;   NIFLN            Number of irreducible forbidden words of length
;                    length_n (this is the key output variable from this
;                    section).
;   forbid_cmp       2-D matrix expansion of words_forbid_all, used for
;                    comparing with all possibilities of match with the
;                    values in words_found_cw.  Dimensioned (n_forbid_all,
;                    n_found).
;   found_cmp        2-D matrix expansion of words_found_cw, used for
;                    comparing with all possibilities of match with the
;                    values in words_forbid_all.  Dimensioned
;                    (n_forbid_all, n_found).
;   words_found_cw   Non-forbidden sub-words of the sequence defined by 
;                    check_word.  NB:  This is a 1-D array, each element
;                    of which is a word.  This differs from most other
;                    array of words, which are 2-D and in which each row
;                    is a word.  This is done b/c words_found_cw can
;                    contain words of differing lengths.
;   tmp_found        Non-forbidden words in the sequence defined by 
;                    check_word, for length ilw.

if (wfln_count eq 0L) then begin     ;* case 1 (begin):  no forbidd. words
                                     ;  => no irreduc. forbidd. words
   NIFLN = 0L

endif  $                             ;* case 1 (end)
else begin                           ;* case 2 (begin):  there are forbidd. 
                                     ;  words of length length_n
   NIFLN = 0L                        ;- initialize NIFLN

   for i=0L,wfln_count-1L do begin   ;- consid. each forbidden word (begin)

       PRINT, Format='("COMPLEXITY_2:  Analyzing word number:  "'  $
            + ', (g0), " of ", (g0))', i+1L, wfln_count
       check_word = REFORM(words_forbid_ln[*,i])

       for ilw=1L,(length_n-1L) do begin     ;+ consid. all sub-words (begin)
           tmp_found = FIND_WORDS(ilw, check_word)          ;> find all
           if (N_ELEMENTS(words_found_cw) eq 0L) then  $    ;  sub-words
              words_found_cw = STRJOIN(tmp_found)  $        ;  in check_word
           else  $
              words_found_cw = [ words_found_cw  $
                               , STRJOIN(tmp_found) ] 
       endfor                                ;+ consid. all sub-words (end)


;+ check if any sub-word of check_word is forbidden in the entire input 
;  sequence.  if n_found is greater than max_n_found_all, then the *_cmp
;  arrays are calculated using CMREPLICATE, which should not happen very
;  often (because if it does, the program slows down by a lot).

       n_found = N_ELEMENTS(words_found_cw)

       if ((n_forbid_all ne 0L) and (n_found ne 0L)) then begin

          if (n_found le max_n_found_all) then begin
             forbid_cmp =  $
                words_forbid_all[  $
                   forbid_cmp_ind[ 0L:n_forbid_all-1L, 0L:n_found-1L ] ]
             found_cmp  =  $
                words_found_cw[  $
                   found_cmp_ind[  0L:n_forbid_all-1L, 0L:n_found-1L ] ]
          endif else begin
             PRINT, 'COMPLEXITY_2:  Warning:  Preset '  $
                  + 'max_n_found_all too small.  Compensating.'
             forbid_cmp = CMREPLICATE( words_forbid_all, n_found )
             found_cmp  = TRANSPOSE(  $
                          CMREPLICATE( words_found_cw, n_forbid_all ) )
          endelse

          eqflag = ARRAY_EQUAL(found_cmp ne forbid_cmp, 1B)
          if (eqflag eq 1) then begin
             NIFLN = NIFLN + 1L
          endif
          tmp = TEMPORARY(eqflag)  ;> dealloc. as a safety check

          forbid_cmp = 0           ;> dealloc. to save memory
          found_cmp  = 0
       endif else begin
          PRINT, 'COMPLEXITY_2:  Warning:  No forbidden sub-words exist.'
          NIFLN = NIFLN + 1L
       endelse

       tmp = TEMPORARY(words_found_cw)     ;+ dealloc. to get ready for the
       tmp = 0                             ;  next check_word

   endfor                            ;- consid. each forbidden word (end)

endelse                              ;* case 2 (end)


; -------------------------- Calculate Complexity ---------------------------
;
; Notes:  complexity_value is the eqn. for second-order complexity, for
; length_n -> infinity.  The default (i.e. logbase undefined) uses base e.
; If logbase is defined, complexity_value is calculated using that as the
; logabase as the base.  If there are no irreducible forbidden words, the
; second-order complexity equals zero.

if (NIFLN eq 0L) then begin
   complexity_value = 0.0                   ;- for no irreduc. forbid. words
endif else begin
   if (N_ELEMENTS(logbase) eq 0) then  $            ;- 2nd-order cmplxty, if
      complexity_value = ALOG(NIFLN) / length_n  $  ;  length_n -> infinity
   else  $
      complexity_value = ALOG(NIFLN) / (ALOG(logbase)*length_n)
endelse


; ----------------------------- Prepare Output ------------------------------

out_nif = LONG(NIFLN)                       ;- output NIF (optional)

Result = TEMPORARY(complexity_value)        ;- return complexity_value

RETURN, Result


END     ;=== end of function ===
 
 
; ========== end file ==========