#ifndef OTC_TEXT_REGEX_HH
#define OTC_TEXT_REGEX_HH
/*
// ============================================================================
//
// = LIBRARY
//     OTC
//
// = FILENAME
//     text/regex.hh
//
// = AUTHOR(S)
//     Graham Dumpleton
// 
// = COPYRIGHT
//     Copyright 1991 1992 OTC LIMITED
//     Copyright 1994 1995 DUMPLETON SOFTWARE CONSULTING PTY LIMITED
//
// ============================================================================
*/

#include <OTC/text/pattern.hh>
#include <OTC/thread/nrmutex.hh>

#ifdef __GNUG__
#if (__GNUC__ >= 3 || __GNUC_MINOR__ >= 6) || defined(CXX_CYGNUS)
#pragma interface "OTC/text/regex.hh"
#else
#pragma interface
#endif
#endif

/* ------------------------------------------------------------------------- */

class OTC_Regex : public OTC_Pattern
    // = TITLE
    //	   Class for performing regular expression matches.
    //
    // = CLASS TYPE
    //	   Concrete
    //
    // = DESCRIPTION
    //	   This class can be used to determine if some string matches a
    //	   particular regular expression. In addition, information can be
    //	   obtained about the match, so that string substitutions can be
    //	   undertaken.
    //
    //	   The pattern style is like that for <ex(1)>.
    //
    // = NOTES
    //	   This uses the regex routines in v07i021 of volume 7
    //	   of comp.sources.misc.
    //
    // = REGULAR EXPRESSIONS
    //	   Regular expressions can be of the following forms:
    //
    //	   [1] char
    // = BEGIN<INDENT>
    //	   matches itself, unless it is a special character
    //	   (metachar): . \ [ ] * + ^ $
    // = END<INDENT>
    //
    //	   [2] .
    // = BEGIN<INDENT>
    //	   matches any character.
    // = END<INDENT>
    //
    //	   [3] \ (backslash)
    // = BEGIN<INDENT>
    //	   matches the character following it, except when followed by a left
    //	   or right round bracket, a digit 1 to 9, a left or right angle
    //	   bracket or one of the characters "bnfrt". (see [7], [8], [9] and
    //	   [12]) It is used as an escape character for all other
    //	   meta-characters, and itself. When used in a set ([4]), it is
    //	   treated as an ordinary character.
    // = END<INDENT>
    //
    //	   [4] [set]
    // = BEGIN<INDENT>
    //	   matches one of the characters in the set. If the first character
    //	   in the set is "^", it matches a character NOT in the set. A
    //	   shorthand S-E is used to specify a set of characters S up to E,
    //	   inclusive. The special characters "]" and "-" have no special
    //	   meaning if they appear as the first chars in the set.
    //
    //	   examples:
    // = BEGIN<INDENT>
    // = BEGIN<NOFILL>
    //	   [a-z] matches any lowercase alpha
    //	   [^]-] matches any char except ] and -
    //	   [^A-Z] matches any char except uppercase alpha
    //	   [a-zA-Z] matches any alpha
    // = END<NOFILL>
    // = END<INDENT>
    // = END<INDENT>
    //
    //	   [5] *
    // = BEGIN<INDENT>
    //	   any regular expression form [1] to [4], followed by closure char
    //	   (*) matches zero or more matches of that form.
    // = END<INDENT>
    //
    //	   [6] +
    // = BEGIN<INDENT>
    //	   same as [5], except it matches one or more.
    // = END<INDENT>
    //
    //	   [7]
    // = BEGIN<INDENT>
    //	   a regular expression in the form [1] to [10], enclosed as \(form\)
    //	   matches what form matches. The enclosure creates a set of tags,
    //	   used for [8] and for pattern substitution. The tagged forms are
    //	   numbered starting from 1.
    // = END<INDENT>
    //
    //	   [8]
    // = BEGIN<INDENT>
    //	   a \ followed by a digit 1 to 9 matches whatever a previously
    //	   tagged regular expression ([7]) matched.
    // = END<INDENT>
    //
    //	   [9] \\<\\>
    // = BEGIN<INDENT>
    //	   a regular expression starting with a \\< construct and/or ending
    //	   with a \\> construct, restricts the pattern matching to the
    //	   beginning of a word, and/or the end of a word. A word is defined
    //	   to be a character string beginning and/or ending with the
    //	   characters A-Z a-z 0-9 and _. It must also be preceded and/or
    //	   followed by any character outside those mentioned.
    // = END<INDENT>
    //
    //	   [10]
    // = BEGIN<INDENT>
    //	   a composite regular expression xy where x and y are in the form
    //	   [1] to [10] matches the longest match of x followed by a match for
    //	   y.
    // = END<INDENT>
    //
    //	   [11] ^ $
    // = BEGIN<INDENT>
    //	   a regular expression starting with a ^ character and/or ending
    //	   with a $ character, restricts the pattern matching to the
    //	   beginning of the line, or the end of line. [anchors] Elsewhere in
    //	   the pattern, ^ and $ are treated as ordinary characters.
    // = END<INDENT>
    //
    //	   [12] \b \n \f \r \t
    // = BEGIN<INDENT>
    //	   these are used in a regular expression to denote the special
    //	   characters backspace, newline, form feed, carriage return and tab.
    // = END<INDENT>
    //
    // = SEE ALSO
    //	   <ex(1)>, <OTC_Pattern>
{
  public:

    // The following two functions are only declared when <ENV_OSTORE> is
    // defined and <SCHEMA_GENERATION> is not. NOTE(review): presumably
    // these are the ObjectStore typespec hooks - confirm against the
    // build configuration.
#if defined(ENV_OSTORE) && !defined(SCHEMA_GENERATION)
    static os_typespec* typespec();
    static os_typespec* get_os_typespec() { return typespec(); }
#endif

			~OTC_Regex();

    // = INITIALISATION

			OTC_Regex(char const* thePattern);
				// Compiles <thePattern>.

  protected:

    // = PATTERN COMPILATION

    void		compile()
				{ myValid = re_comp(pattern()); }
				// Compiles the pattern returned by
				// <pattern()> using <re_comp()> and records
				// the result of that call in <myValid>.

  public:

    // = ERRORS

    OTC_Boolean		isValid() const;
				// Returns <OTCLIB_TRUE> if the pattern is
				// valid.

    // = PATTERN MATCHING

    OTC_Boolean		match(char const* theString);
				// Returns <OTCLIB_TRUE> if <theString> matches
				// the most recently compiled pattern.
				// If no pattern had been compiled then
				// <OTCLIB_FALSE> is returned.

    // The following functions return information about the area of
    // a string which a pattern matched with. A value of <0> for <theIndex>
    // indicates matches with the complete pattern. A value greater than
    // zero indicates matches with subpatterns as designated by the
    // <\(\)> notation.
    //
    // All functions return <0> if no match has occurred.

    u_int		start() const;
				// Returns the index into the string where the
				// matched portion started.

    u_int		start(u_int theIndex) const;
				// Returns the index into the string where
				// the match began for the tagged portion
				// indicated by <theIndex>.

    u_int		length() const;
				// Returns the length of the matched portion.

    u_int		length(u_int theIndex) const;
				// Returns the length of the match for the
				// tagged portion indicated by <theIndex>.

    OTC_Range		range() const
				{ return OTC_Pattern::range(); }
				// Returns a range object for the matched
				// portion of the string. Simply forwards
				// to <OTC_Pattern::range()>.

    OTC_Range		range(u_int theIndex) const;
				// Returns a range object for the matched
				// tagged portion of the string indicated by
				// <theIndex>.

    // = CUSTOMISATION

    void		modw(char const* theString)
				{ re_modw(theString); }
				// The characters in the null terminated
				// string <theString> are added to the set of
				// characters identified as being in a word.
				// If <theString> is null or is zero length,
				// then the set of characters is reset to
				// the default. The work is done by
				// <re_modw()>.

    // = ERRORS

    char const*		error() const
				{ return myErrMesg; }
				// Returns an error message describing why
				// the pattern compilation failed. This is
				// the value currently held in <myErrMesg>.

  protected:

    OTC_Boolean		re_comp(char const* pat);
				// Performs the actual compilation
				// of a pattern.

    int			re_exec(char const* lp);
				// Performs the actual matching of
				// a pattern against a string.

    void		re_modw(char const* s);
				// Performs the actual work of adding
				// characters to the set of characters
				// identified as being in a word.

    int			re_subs(char const* src, char* dst);
				// NOTE(review): not documented in this
				// header. Presumably performs substitution
				// of the most recently matched (tagged)
				// portions into <src>, writing the result
				// to <dst> - confirm against the
				// implementation.

    virtual void	re_fail(char const* msg, char op);
				// Invoked when an internal error occurs in
				// <re_exec()>. <msg> points to a description
				// of the error, and <op> indicates where in
				// the automaton the error occurred.

  private:

			OTC_Regex(OTC_Regex const&);
				// Private, so instances cannot be copy
				// constructed from outside the class.

    OTC_Regex&		operator=(OTC_Regex const&);
				// Private, so instances cannot be assigned
				// to from outside the class.

    int*		tagstk;
				// Sub-pattern tag stack.

    u_char*		dfa;
				// Automaton.

    OTC_Boolean		sta;
				// <OTCLIB_TRUE> if there is a pattern currently
				// compiled.

    u_char*		bittab;
				// Bit table for CCL.

    void		chset(u_char c);
				// Sets info in bit table.

    char const*		pmatch(char const* lp, u_char* ap);
				// Does the hard work of determining if
				// a pattern matches.

    static char const	defchrtyp[];
				// Default classification table for word
				// boundary operators. A copy of this is
				// placed in <chrtyp> when the class is
				// constructed.

    static u_char const	deftab[];
				// Purpose is not documented in this header.
				// NOTE(review): confirm its role against
				// the implementation.

    char*		chrtyp;
				// Holds the classification table for word
				// boundary operators. This can be modified
				// by using <re_modw()>.

    char const*		bol;
				// Points to the start of the string
				// the pattern is being matched against.

    char const**	bopat;
				// Array of pointers in to the string,
				// indicating where matched patterns
				// commenced.

    char const**	eopat;
				// Array of pointers in to the string,
				// indicating where matched patterns
				// ended.

    char const*		myErrMesg;
				// Points to a message describing why a
				// pattern compilation failed. Returned
				// by <error()>.

    void		badpat(char const* theErrMesg);
				// Logs <theErrMesg> as being the
				// reason that a pattern compilation failed.
				// Also set <dfa[0]> to <END> indicating
				// that there is no pattern.

    OTC_Boolean		myValid;
				// Result of the <re_comp()> call made by
				// <compile()>. NOTE(review): presumably the
				// value reported by <isValid()> - confirm.

  public:

    // = BUILTIN REGULAR EXPRESSIONS
    //	   Some commonly used regular expressions are provided in
    //	   precompiled form. When threads are being used, the only
    //	   safe thing to do with these shared instances is to obtain
    //	   the string representing the pattern, because two threads
    //	   using the same class instance to match strings will cause
    //	   problems.

    static OTC_Regex&	whiteSpace();
				// Matches white space.

    static OTC_Regex&	optWhiteSpace();
				// Optionally matches white space.

    static OTC_Regex&	nonWhiteSpace();
				// Matches non-white space.

    static OTC_Regex&	alphabetic();
				// Matches alpha characters.

    static OTC_Regex&	lowerCase();
				// Matches lower case characters.

    static OTC_Regex&	upperCase();
				// Matches upper case characters.

    static OTC_Regex&	alphaNumeric();
				// Matches alphanumeric characters.

    static OTC_Regex&	identifier();
				// Matches an identifier.

    static OTC_Regex&	matchingQuotes();
				// Matches string delineated with double
				// quotes. Note that this does not ignore
				// a quote preceded by a slosh, instead
				// that quote will be seen as the terminating
				// quote.

  private:

    static OTC_NRMutex	_mutex;
				// Lock for threads. NOTE(review): presumably
				// guards creation of the builtin instances
				// below - confirm against the implementation.

    static OTC_Regex*	myWhiteSpace;
				// Matches white space.

    static OTC_Regex*	myOptWhiteSpace;
				// Optionally matches white space.

    static OTC_Regex*	myNonWhiteSpace;
				// Matches non-white space.

    static OTC_Regex*	myAlphabetic;
				// Matches alpha characters.

    static OTC_Regex*	myLowerCase;
				// Matches lower case characters.

    static OTC_Regex*	myUpperCase;
				// Matches upper case characters.

    static OTC_Regex*	myAlphaNumeric;
				// Matches alphanumeric characters.

    static OTC_Regex*	myIdentifier;
				// Matches an identifier.

    static OTC_Regex*	myMatchingQuotes;
				// Matches string delineated with double
				// quotes.
};

/* ------------------------------------------------------------------------- */

#endif /* OTC_TEXT_REGEX_HH */
