;----------------------------------------------------------------------------
; $Id: find_words.pro,v 1.11 2002/03/12 03:14:50 johnny Exp $
;+
; NAME:
;   FIND_WORDS
;
; PURPOSE:
;   Given a symbolic sequence of 0's and 1's, find the unique "words" in 
;   that sequence, for finite word length length_n.  See "Notes" for a 
;   useful reference.
;
; CATEGORY:
;   Statistics.
;
; CALLING SEQUENCE:
;   Result = FIND_WORDS(in_length_n, in_sequence [, COUNT=count])
;
; INPUTS:
;   in_length_n:   Word length for which to count the number of unique
;                  words in the input sequence.  Scalar long.
;
;   in_sequence:   Input sequence vector.  Each element is of value '0' 
;                  or '1'.  String vector.
;   
; KEYWORD PARAMETERS:
;   COUNT:         If set to a variable, the number of words found in
;                  Result is returned.  Scalar long.  Created.
;
; OUTPUTS:
;   Result:        Array of unique words in the sequence, dimensioned 
;                  (length_n, number of unique words found).  Each row
;                  of the array is a unique word found in the input
;                  sequence.  Each element is either '0' or '1'.  String 
;                  array.
;
; FILE DEVICE I/O:
;   None.
;
; COMMON BLOCKS:
;   None.
;
; EXAMPLE:
;   This is an example almost identical to the periodic test comparison 
;   case shown in Elsner and Tsonis's (1993) Fig. 2.
;   Create periodic data:
;       data = REPLICATE('0', 14)
;       data = [data, '1']
;       for i=0,14 do data = [data, data]
;       data = data[0L:344786L]
;       word_array = FIND_WORDS(5L, data)
;       PRINT, word_array
;   IDL prints:
;       0 0 0 0 0
;       0 0 0 0 1
;       0 0 0 1 0
;       0 0 1 0 0
;       0 1 0 0 0
;       1 0 0 0 0
;
; MODIFICATION HISTORY:
; - 25 Jan 2002:  Orig. ver. Johnny Lin, CIRES/University of Colorado.
;   Email:  air_jlin@yahoo.com.  Passed reasonably adequate tests.
; - 30 Jan 2002:  Additional exit point added in case one finds all
;   theoretically possible words early on.  Passed passably adequate
;   tests.
; - 27 Feb 2002:  Algorithm for finding words is revamped to increase
;   computational speed for large length_n.  Only one function exit point 
;   is kept.  Passed passably adequate tests.
; - 1 Mar 2002:  Add Count keyword.  Passed minimally passably adequate 
;   tests.
;
; NOTES:
; - Written for IDL 5.5.  May work with versions 5.2.1-5.4 (haven't
;   tested yet).
; - Reference:  Elsner, J. B. and A. A. Tsonis (1993), "Complexity and
;   predictability of hourly precipitation," J. Atmos. Sci., Vol. 50,
;   No. 3, pp. 400-405.
; - Because this function requires in_sequence to be at least as long as
;   in_length_n, at least one word will always be found by the routine.
;   Thus, there is no check for the case of no words found.
; - All keyword parameters are optional unless otherwise stated.
; - No procedures called with _Extra keyword invoked.
; - User-written procedures called:  BIN2DEC_JWL, DEC2BIN (both originally
;   from the SolarSoft library; the first has revisions, while the second
;   does not).
;-
; Copyright (c) 2002 Johnny Lin.  For licensing, distribution conditions,
; and contact information, see http://www.johnny-lin.com/lib.html.
;----------------------------------------------------------------------------

FUNCTION FIND_WORDS, in_length_n, in_sequence  $
                   , COUNT   = out_count  $
                   , _EXTRA  = extra

; Return the unique words of length in_length_n that occur in the '0'/'1'
; string vector in_sequence.
;
;    in_length_n   Word length; converted to long below.  Must satisfy
;                  1 <= in_length_n <= N_ELEMENTS(in_sequence) and must not
;                  exceed 32 (see the bit-width check below).
;    in_sequence   1-D string array whose elements are all '0' or '1'.
;    COUNT         Output keyword:  receives the number of unique words.
;    _EXTRA        Accepted but never referenced in this routine.
;
; Returns a string array of '0'/'1' elements with length_n columns, one
; unique word per row.
;
; Stops with MESSAGE on any invalid input.


; -------------------- Error Check and Parameter Setting --------------------

ON_ERROR, 0

MAX_WORD_BITS = 32L                          ;- bits in an IDL long

length_n = LONG(in_length_n)                 ;- protect small input param.(s)
NS = N_ELEMENTS(in_sequence)                 ;- no. of elem. in in_sequence


if (SIZE(in_sequence, /Type) ne 7) then  $   ;- must be of string type
   MESSAGE, 'error--bad input sequence type'
if (SIZE(in_sequence, /N_Dimensions) ne 1) then  $
   MESSAGE, 'error--input sequence not a vector'
if ((length_n le 0) or (length_n gt NS)) then  $
   MESSAGE, 'error--length_n wrong'

;- Each word is stored below as one element of a LONARR.  A word with more
;  symbols than a long has bits cannot be encoded uniquely, so reject it
;  here instead of silently merging distinct words (or failing later when
;  the leading zeros are truncated).
;  NOTE(review):  a word of exactly 32 symbols uses the sign bit; whether
;  BIN2DEC_JWL and DEC2BIN handle that case is not visible here -- confirm.
if (length_n gt MAX_WORD_BITS) then  $
   MESSAGE, 'error--length_n exceeds the 32 bits available in a long'

;- Every element must be exactly '0' or '1'.  The WHERE count uses its own
;  name so it is not confused with the COUNT keyword (out_count).
bad_idx = WHERE((in_sequence ne '0') and (in_sequence ne '1'), n_bad)
if (n_bad ne 0) then  $
   MESSAGE, 'error--bad values in input sequence'


; ------------------------------- Word Search -------------------------------
;
; Algorithm:  Find all words that possibly exist in the sequence and 
; translate these words to their integer value.  Find only the unique
; values of those words.  Translate back to binary.
;
; Selected key variables:
;    all_seq_long      Array of long value of all words in in_sequence.
;    uniq_long         Only the unique values of all_seq_long.
;    words_found_byte  Byte array returned by DEC2BIN for uniq_long.

NSW = NS-length_n+1L  ;- no. of word chunks of length length_n in in_sequence

all_seq_long = LONARR(NSW)     ;- declare array

;- Slide a window of length_n symbols along the sequence.  The window
;  bounds are computed in the loop, so no index arrays of length NSW are
;  needed.
for i=0L,NSW-1L do begin
    BIN2DEC_JWL, in_sequence[ i:(i+length_n-1L) ], tmpout  $
               , /Quiet
    all_seq_long[i] = tmpout
endfor

;- UNIQ with a SORT index returns one subscript per distinct value.
uniq_long = all_seq_long[ UNIQ(all_seq_long, SORT(all_seq_long)) ]

DEC2BIN, uniq_long, words_found_byte, /Quiet     ;- translate back to binary


; ------------------------ Convert Output to String -------------------------
;
; Note:  If words_found_byte contains only one word, then we add a degen-
; erate dimension.
;
; Selected key variables:
;    words_found   This variable becomes Result.

dim_wfb = SIZE(words_found_byte, /Dimensions)     ;- dim of words_found_byte
if (N_ELEMENTS(dim_wfb) ne 2) then begin
   if (N_ELEMENTS(dim_wfb) eq 1) then begin
      words_found_byte = REFORM( words_found_byte  $              ;+ reform
                               , N_ELEMENTS(words_found_byte)  $
                               , 1, /Overwrite )
      dim_wfb = SIZE(words_found_byte, /Dimensions)               ;+ redo dim
   endif else begin
      MESSAGE, 'error--bad words_found_byte array'
   endelse
endif

;- Start from an all-'0' string array of the same shape and mark every
;  position where the byte array holds 1.
words_found = REPLICATE('0', dim_wfb[0], dim_wfb[1])
one_idx = WHERE(words_found_byte eq 1B, n_one)
if (n_one gt 0L) then words_found[one_idx] = '1'

;- Keep only the last length_n columns of each row (trunc. lead. 0's).
words_found = words_found[(dim_wfb[0]-length_n):*, *]


; ----------------------------- Prepare Output ------------------------------

sd_wf  = SIZE(words_found)                    ;- count number of words
case sd_wf[0] of                              ;  (optional keyword output)
     1:  out_count = 1L                       ;- 1-D result:  a single word
     2:  out_count = sd_wf[2]                 ;- 2-D result:  one word per row
     else:  MESSAGE, 'error--bad out_count'
endcase

Result = TEMPORARY(words_found)

RETURN, Result


END     ;=== end of function ===
 
 
; ========== end file ==========