SystemOrganization addCategory: #'VB-Regex'!
SystemOrganization addCategory: #'VB-Regex-Exceptions'!

!String methodsFor: '*vb-regex' stamp: ''!
allRegexMatches: rxString
	"Compile rxString as a regex and answer what its #matchesIn: answers for the receiver."

	| matcher |
	matcher := rxString asRegex.
	^matcher matchesIn: self! !

!String methodsFor: '*vb-regex' stamp: ''!
asRegex
	"Compile the receiver as a regex matcher. May raise RxParser>>syntaxErrorSignal or RxParser>>compilationErrorSignal.
	This is a part of the Regular Expression Matcher package, (c) 1996, 1999 Vassili Bykov.
	Refer to `documentation' protocol of RxParser class for details."

	| matcherClass |
	matcherClass := RxParser preferredMatcherClass.
	^matcherClass for: (RxParser new parse: self)! !

!String methodsFor: '*vb-regex' stamp: ''!
asRegexIgnoringCase
	"Compile the receiver as a regex matcher that is asked to ignore case. May raise RxParser>>syntaxErrorSignal or RxParser>>compilationErrorSignal.
	This is a part of the Regular Expression Matcher package, (c) 1996, 1999 Vassili Bykov.
	Refer to `documentation' protocol of RxParser class for details."

	| matcherClass |
	matcherClass := RxParser preferredMatcherClass.
	^matcherClass for: (RxParser new parse: self) ignoreCase: true! !

!String methodsFor: '*vb-regex' stamp: ''!
copyWithRegex: rxString matchesReplacedWith: aString
	"Answer what the matcher compiled from rxString answers to #copy:replacingMatchesWith: for the receiver and aString."

	| matcher |
	matcher := rxString asRegex.
	^matcher copy: self replacingMatchesWith: aString! !

!String methodsFor: '*vb-regex' stamp: ''!
copyWithRegex: rxString matchesTranslatedUsing: aBlock
	"Answer what the matcher compiled from rxString answers to #copy:translatingMatchesUsing: for the receiver and aBlock."

	| matcher |
	matcher := rxString asRegex.
	^matcher copy: self translatingMatchesUsing: aBlock! !

!String methodsFor: '*vb-regex' stamp: ''!
matchesRegex: regexString
	"Test if the receiver matches a regex. May raise RxParser>>regexErrorSignal or child signals.
	This is a part of the Regular Expression Matcher package, (c) 1996, 1999 Vassili Bykov.
	Refer to `documentation' protocol of RxParser class for details."

	| matcher |
	matcher := regexString asRegex.
	^matcher matches: self! !

!String methodsFor: '*vb-regex' stamp: ''!
matchesRegexIgnoringCase: regexString
	"Test if the receiver matches a regex compiled with #asRegexIgnoringCase. May raise RxParser>>regexErrorSignal or child signals.
	This is a part of the Regular Expression Matcher package, (c) 1996, 1999 Vassili Bykov.
	Refer to `documentation' protocol of RxParser class for details."

	| matcher |
	matcher := regexString asRegexIgnoringCase.
	^matcher matches: self! !

!String methodsFor: '*vb-regex' stamp: ''!
prefixMatchesRegex: regexString
	"Test if the receiver's prefix matches a regex. May raise RxParser class>>regexErrorSignal or child signals.
	This is a part of the Regular Expression Matcher package, (c) 1996, 1999 Vassili Bykov.
	Refer to `documentation' protocol of RxParser class for details."

	| matcher |
	matcher := regexString asRegex.
	^matcher matchesPrefix: self! !

!String methodsFor: '*vb-regex' stamp: ''!
prefixMatchesRegexIgnoringCase: regexString
	"Test if the receiver's prefix matches a regex compiled with #asRegexIgnoringCase. May raise RxParser class>>regexErrorSignal or child signals.
	This is a part of the Regular Expression Matcher package, (c) 1996, 1999 Vassili Bykov.
	Refer to `documentation' protocol of RxParser class for details."

	| matcher |
	matcher := regexString asRegexIgnoringCase.
	^matcher matchesPrefix: self! !

!String methodsFor: '*vb-regex' stamp: ''!
regex: rxString matchesCollect: aBlock
	"Answer what the matcher compiled from rxString answers to #matchesIn:collect: for the receiver and aBlock."

	| matcher |
	matcher := rxString asRegex.
	^matcher matchesIn: self collect: aBlock! !

!String methodsFor: '*vb-regex' stamp: ''!
regex: rxString matchesDo: aBlock
	"Answer what the matcher compiled from rxString answers to #matchesIn:do: for the receiver and aBlock."

	| matcher |
	matcher := rxString asRegex.
	^matcher matchesIn: self do: aBlock! !

Object subclass: #RxCharSetParser
	instanceVariableNames: 'source lookahead elements'
	classVariableNames: ''
	poolDictionaries: ''
	category: 'VB-Regex'!
!RxCharSetParser commentStamp: '' prior: 0!
-- Regular Expression Matcher v 1.1 (C) 1996, 1999 Vassili Bykov
-- See `documentation' protocol of RxParser class for user's guide.
--
I am a parser created to parse the insides of a character set ([...]) construct. I create and answer a collection of "elements", each being an instance of one of: RxsCharacter, RxsRange, or RxsPredicate.

Instance Variables:
	source	<Stream>	open on whatever is inside the square brackets we have to parse.
	lookahead	<Character | nil>	The current lookahead character; nil once the input is exhausted.
	elements	<OrderedCollection>	Parsing result (the elements described above).!

!RxCharSetParser class methodsFor: 'instance creation' stamp: ''!
on: aStream
	"Answer what a new instance answers to #initialize: with aStream."

	| parser |
	parser := self new.
	^parser initialize: aStream! !
!RxCharSetParser methodsFor: 'parsing' stamp: ''!
addChar: aChar
	"Append a single-character element for aChar to the parsing result."

	elements add: (RxsCharacter with: aChar)! !

!RxCharSetParser methodsFor: 'parsing' stamp: ''!
addRangeFrom: firstChar to: lastChar
	"Append a range element. A range whose first character sorts after its last one is reported as a syntax error first."

	firstChar asInteger > lastChar asInteger
		ifTrue: [RxParser signalSyntaxException: ' bad character range'].
	elements add: (RxsRange from: firstChar to: lastChar)! !

!RxCharSetParser methodsFor: 'initialize-release' stamp: ''!
initialize: aStream
	"Remember the stream, prime the lookahead with its first character (nil when the stream is empty) and start with no elements."

	source := aStream.
	"Guard with #atEnd exactly as #match: does, instead of relying on what #next answers at the end."
	lookahead := aStream atEnd ifTrue: [nil] ifFalse: [aStream next].
	elements := OrderedCollection new! !

!RxCharSetParser methodsFor: 'parsing' stamp: ''!
match: aCharacter
	"Check that the lookahead is aCharacter and advance to the next input character. Answer the new lookahead, which is nil at end of input.
	A mismatch is reported as a syntax error. The lookahead may be nil at that point (input ended early), so the message must not be built with String with: nil."

	aCharacter = lookahead
		ifFalse: 
			[RxParser signalSyntaxException: (lookahead isNil
				ifTrue: ['unexpected end of character set']
				ifFalse: ['unexpected character: ', (String with: lookahead)])].
	^source atEnd
		ifTrue: [lookahead := nil]
		ifFalse: [lookahead := source next]! !

!RxCharSetParser methodsFor: 'accessing' stamp: ''!
parse
	"Parse the whole input and answer the collection of elements. A leading $- is taken literally."

	lookahead = $-
		ifTrue: 
			[self addChar: $-.
			self match: $-].
	[lookahead isNil] whileFalse: [self parseStep].
	^elements! !

!RxCharSetParser methodsFor: 'parsing' stamp: ''!
parseCharOrRange
	"Parse either a single character or a first-last range. A $- that is the very last input character is taken literally."

	| firstChar |
	firstChar := lookahead.
	self match: firstChar.
	lookahead = $-
		ifTrue: 
			[self match: $-.
			lookahead isNil
				ifTrue: [^self addChar: firstChar; addChar: $-]
				ifFalse: 
					[self addRangeFrom: firstChar to: lookahead.
					^self match: lookahead]].
	self addChar: firstChar! !

!RxCharSetParser methodsFor: 'parsing' stamp: ''!
parseEscapeChar
	"Parse a backslash escape. An escaped $- is a literal dash; anything else is handed to RxsPredicate forEscapedLetter:."

	self match: $\.
	$- = lookahead
		ifTrue: [elements add: (RxsCharacter with: $-)]
		ifFalse: [elements add: (RxsPredicate forEscapedLetter: lookahead)].
	self match: lookahead! !

!RxCharSetParser methodsFor: 'parsing' stamp: ''!
parseNamedSet
	"Parse a named class of the form [:name:] and add the predicate RxsPredicate answers for that name.
	Input that ends right after the opening [: is reported as a syntax error instead of failing inside String with: nil."

	| name |
	self match: $[; match: $:.
	lookahead isNil
		ifTrue: [^RxParser signalSyntaxException: 'unterminated named character class'].
	name := (String with: lookahead), (source upTo: $:).
	"Guard with #atEnd exactly as #match: does; a nil lookahead makes the following #match: report the error."
	lookahead := source atEnd ifTrue: [nil] ifFalse: [source next].
	self match: $].
	elements add: (RxsPredicate forNamedClass: name)! !

!RxCharSetParser methodsFor: 'parsing' stamp: ''!
parseStep
	"Parse one element starting at the current lookahead. A $[ only starts a named set when followed by $:, otherwise it is an ordinary character."

	lookahead = $[
		ifTrue: 
			[source peek = $:
				ifTrue: [^self parseNamedSet]
				ifFalse: [^self parseCharOrRange]].
	lookahead = $\ ifTrue: [^self parseEscapeChar].
	lookahead = $- ifTrue: [RxParser signalSyntaxException: 'invalid range'].
	self parseCharOrRange! !

Object subclass: #RxMatchOptimizer
	instanceVariableNames: 'ignoreCase prefixes nonPrefixes conditions testBlock methodPredicates nonMethodPredicates predicates nonPredicates'
	classVariableNames: ''
	poolDictionaries: ''
	category: 'VB-Regex'!
!RxMatchOptimizer commentStamp: '' prior: 0!
-- Regular Expression Matcher v 1.1 (C) 1996, 1999 Vassili Bykov
-- See `documentation' protocol of RxParser class for user's guide.
--
A match start optimizer, handy for searching a string. Takes a regex syntax tree and sets itself up so that prefix characters or matcher states that cannot start a match are later recognized with #canStartMatch:in: method.

Used by RxMatcher, but can be used by other matchers (if implemented) as well.!

!RxMatchOptimizer methodsFor: 'accessing' stamp: ''!
canStartMatch: aCharacter in: aMatcher
	"Answer whether a match could commence at the given lookahead character, or in the current state of <aMatcher>. True answered by this method does not mean a match will definitely occur, while false answered by this method *does* guarantee a match will never occur.
	A nil lookahead (end of stream) and a missing test block both answer true."

	aCharacter isNil ifTrue: [^true].
	^testBlock == nil or: [testBlock value: aCharacter value: aMatcher]! !

!RxMatchOptimizer methodsFor: 'accessing' stamp: 'avi 11/30/2003 13:28'!
conditionTester
	"Answer a two-argument block (character, matcher) that is true when any collected condition holds for the matcher, or nil when there are no conditions.
	#any condition is filtered at the higher level; it cannot appear among the conditions here."

	| matchCondition |
	conditions isEmpty ifTrue: [^nil].
	conditions size = 1
		ifTrue: 
			[matchCondition := conditions detect: [:ignored | true].
			"Special case all of the possible conditions."
			#atBeginningOfLine = matchCondition ifTrue: [^[:c :matcher | matcher atBeginningOfLine]].
			#atEndOfLine = matchCondition ifTrue: [^[:c :matcher | matcher atEndOfLine]].
			#atBeginningOfWord = matchCondition ifTrue: [^[:c :matcher | matcher atBeginningOfWord]].
			#atEndOfWord = matchCondition ifTrue: [^[:c :matcher | matcher atEndOfWord]].
			#atWordBoundary = matchCondition ifTrue: [^[:c :matcher | matcher atWordBoundary]].
			#notAtWordBoundary = matchCondition ifTrue: [^[:c :matcher | matcher notAtWordBoundary]].
			RxParser signalCompilationException: 'invalid match condition'].
	"More than one condition. Capture them as an array in scope."
	matchCondition := conditions asArray.
	^[:c :matcher | 
		matchCondition anySatisfy: [:conditionSelector | matcher perform: conditionSelector]]! !

!RxMatchOptimizer methodsFor: 'private' stamp: 'avi 11/30/2003 13:27'!
determineTestMethod
	"Answer a block closure that will work as a can-match predicate. Answer nil if no viable optimization is possible (too many chars would be able to start a match)."

	| testers |
	(conditions includes: #any) ifTrue: [^nil].
	testers := OrderedCollection new: 5.
	#(#prefixTester #nonPrefixTester #conditionTester #methodPredicateTester #nonMethodPredicateTester #predicateTester #nonPredicateTester)
		do: 
			[:selector | 
			| tester |
			tester := self perform: selector.
			tester notNil ifTrue: [testers add: tester]].
	testers isEmpty ifTrue: [^nil].
	testers size = 1 ifTrue: [^testers first].
	testers := testers asArray.
	^[:char :matcher | testers anySatisfy: [:t | t value: char value: matcher]]! !

!RxMatchOptimizer methodsFor: 'initialize-release' stamp: ''!
initialize: aRegex ignoreCase: aBoolean
	"Set `testBlock' variable to a can-match predicate block: two-argument block which accepts a lookahead character and a matcher (presumably built from aRegex) and answers a boolean indicating whether a match could start at the given lookahead."

	ignoreCase := aBoolean.
	prefixes := Set new: 10.
	nonPrefixes := Set new: 10.
	conditions := Set new: 3.
	methodPredicates := Set new: 3.
	nonMethodPredicates := Set new: 3.
	predicates := Set new: 3.
	nonPredicates := Set new: 3.
	aRegex dispatchTo: self.
	"If the whole expression is nullable, end-of-line is an implicit can-match condition!!"
	aRegex isNullable ifTrue: [conditions add: #atEndOfLine].
	testBlock := self determineTestMethod! !

!RxMatchOptimizer methodsFor: 'accessing' stamp: ''!
methodPredicateTester
	"Answer a block that is true when the character answers true to any collected predicate selector, or nil when none were collected."

	| p selector |
	methodPredicates isEmpty ifTrue: [^nil].
	p := self optimizeSet: methodPredicates.	"also allows copying closures"
	^p size = 1
		ifTrue: 
			["might be a pretty common case"
			selector := p first.
			[:char :matcher | 
			RxParser doHandlingMessageNotUnderstood: [char perform: selector]]]
		ifFalse: 
			[[:char :m | 
			RxParser doHandlingMessageNotUnderstood: 
				[p anySatisfy: [:sel | char perform: sel]]]]! !

!RxMatchOptimizer methodsFor: 'accessing' stamp: ''!
nonMethodPredicateTester
	"Answer a block that is true when the character answers false to any collected negated predicate selector, or nil when none were collected."

	| p selector |
	nonMethodPredicates isEmpty ifTrue: [^nil].
	p := self optimizeSet: nonMethodPredicates.	"also allows copying closures"
	^p size = 1
		ifTrue: 
			[selector := p first.
			[:char :matcher | 
			RxParser doHandlingMessageNotUnderstood: [(char perform: selector) not]]]
		ifFalse: 
			[[:char :m | 
			RxParser doHandlingMessageNotUnderstood: 
				[p anySatisfy: [:sel | (char perform: sel) not]]]]! !

!RxMatchOptimizer methodsFor: 'private' stamp: ''!
nonPredicateTester
	"Answer a block that is true when any collected negated predicate block rejects the character, or nil when none were collected."

	| p pred |
	nonPredicates isEmpty ifTrue: [^nil].
	p := self optimizeSet: nonPredicates.	"also allows copying closures"
	^p size = 1
		ifTrue: 
			[pred := p first.
			[:char :matcher | (pred value: char) not]]
		ifFalse: [[:char :m | p anySatisfy: [:some | (some value: char) not]]]! !

!RxMatchOptimizer methodsFor: 'private' stamp: 'stephane.ducasse 4/13/2009 20:32'!
nonPrefixTester
	"Answer a block that is true when the character is not one of the collected excluded characters, or nil when none were collected."

	| np nonPrefixChar |
	nonPrefixes isEmpty ifTrue: [^nil].
	np := self optimizeSet: nonPrefixes.	"also allows copying closures"
	^np size = 1
		ifTrue: 
			["might be a pretty common case"
			nonPrefixChar := np first.
			[:char :matcher | char ~= nonPrefixChar]]
		ifFalse: [[:char :matcher | (np includes: char) not]]! !

!RxMatchOptimizer methodsFor: 'private' stamp: ''!
optimizeSet: aSet
	"If a set is small, convert it to array to speed up lookup (Array has no hashing overhead, beats Set on small number of elements)."

	^aSet size < 10
		ifTrue: [aSet asArray]
		ifFalse: [aSet]! !

!RxMatchOptimizer methodsFor: 'private' stamp: ''!
predicateTester
	"Answer a block that is true when any collected predicate block accepts the character, or nil when none were collected."

	| p pred |
	predicates isEmpty ifTrue: [^nil].
	p := self optimizeSet: predicates.	"also allows copying closures"
	^p size = 1
		ifTrue: 
			[pred := p first.
			[:char :matcher | pred value: char]]
		ifFalse: [[:char :m | p anySatisfy: [:some | some value: char]]]! !

!RxMatchOptimizer methodsFor: 'private' stamp: ''!
prefixTester
	"Answer a block that is true when the character is one of the collected prefix characters, or nil when none were collected. When ignoring case, prefixes are uppercased once here and the lookahead is compared case-insensitively."

	| p prefixChar |
	prefixes isEmpty ifTrue: [^nil].
	p := self optimizeSet: prefixes.	"also allows copying closures"
	ignoreCase ifTrue: [p := p collect: [:each | each asUppercase]].
	^p size = 1
		ifTrue: 
			["might be a pretty common case"
			prefixChar := p first.
			ignoreCase
				ifTrue: [[:char :matcher | char sameAs: prefixChar]]
				ifFalse: [[:char :matcher | char = prefixChar]]]
		ifFalse: 
			[ignoreCase
				ifTrue: [[:char :matcher | p includes: char asUppercase]]
				ifFalse: [[:char :matcher | p includes: char]]]! !

!RxMatchOptimizer methodsFor: 'double dispatch' stamp: ''!
syntaxAny
	"Any special char is among the prefixes."

	conditions add: #any! !

!RxMatchOptimizer methodsFor: 'double dispatch' stamp: ''!
syntaxBeginningOfLine
	"Beginning of line is among the prefixes."

	conditions add: #atBeginningOfLine! !

!RxMatchOptimizer methodsFor: 'double dispatch' stamp: ''!
syntaxBeginningOfWord
	"Beginning of word is among the prefixes."

	conditions add: #atBeginningOfWord! !

!RxMatchOptimizer methodsFor: 'double dispatch' stamp: ''!
syntaxBranch: branchNode
	"If the head piece of the branch is transparent (allows 0 matches), we must recurse down the branch. Otherwise, just the head atom is important."

	(branchNode piece isNullable and: [branchNode branch notNil])
		ifTrue: [branchNode branch dispatchTo: self].
	branchNode piece dispatchTo: self! !

!RxMatchOptimizer methodsFor: 'double dispatch' stamp: ''!
syntaxCharSet: charSetNode
	"All these (or none of these) characters is the prefix. Predicates of the set are collected on the matching (negated or not) side as well."

	charSetNode isNegated
		ifTrue: [nonPrefixes addAll: charSetNode enumerableSet]
		ifFalse: [prefixes addAll: charSetNode enumerableSet].
	charSetNode hasPredicates
		ifTrue: 
			[charSetNode isNegated
				ifTrue: [nonPredicates addAll: charSetNode predicates]
				ifFalse: [predicates addAll: charSetNode predicates]]! !

!RxMatchOptimizer methodsFor: 'double dispatch' stamp: ''!
syntaxCharacter: charNode
	"This character is the prefix, or one of them."

	prefixes add: charNode character! !

!RxMatchOptimizer methodsFor: 'double dispatch' stamp: ''!
syntaxEndOfLine
	"End of line is among the prefixes."

	conditions add: #atEndOfLine! !

!RxMatchOptimizer methodsFor: 'double dispatch' stamp: ''!
syntaxEndOfWord
	"End of word is among the prefixes."

	conditions add: #atEndOfWord! !

!RxMatchOptimizer methodsFor: 'double dispatch' stamp: ''!
syntaxEpsilon
	"Empty string, terminate the recursion (do nothing)."! !

!RxMatchOptimizer methodsFor: 'double dispatch' stamp: ''!
syntaxMessagePredicate: messagePredicateNode
	"Collect the node's selector among the negated or the plain method predicates."

	messagePredicateNode negated
		ifTrue: [nonMethodPredicates add: messagePredicateNode selector]
		ifFalse: [methodPredicates add: messagePredicateNode selector]! !

!RxMatchOptimizer methodsFor: 'double dispatch' stamp: ''!
syntaxNonWordBoundary
	"Not being at a word boundary is among the prefixes."

	conditions add: #notAtWordBoundary! !

!RxMatchOptimizer methodsFor: 'double dispatch' stamp: ''!
syntaxPiece: pieceNode
	"Pass on to the atom."

	pieceNode atom dispatchTo: self! !

!RxMatchOptimizer methodsFor: 'double dispatch' stamp: ''!
syntaxPredicate: predicateNode
	"Collect the node's predicate block among the predicates."

	predicates add: predicateNode predicate! !

!RxMatchOptimizer methodsFor: 'double dispatch' stamp: ''!
syntaxRegex: regexNode
	"All prefixes of the regex's branches should be combined. Therefore, just recurse."

	regexNode branch dispatchTo: self.
	regexNode regex notNil ifTrue: [regexNode regex dispatchTo: self]! !

!RxMatchOptimizer methodsFor: 'double dispatch' stamp: ''!
syntaxWordBoundary
	"Word boundary is among the prefixes."

	conditions add: #atWordBoundary! !
Object subclass: #RxMatcher
	instanceVariableNames: 'matcher ignoreCase startOptimizer stream markerPositions markerCount lastResult lastChar'
	classVariableNames: 'Cr Lf'
	poolDictionaries: ''
	category: 'VB-Regex'!
!RxMatcher commentStamp: '' prior: 0!
-- Regular Expression Matcher v 1.1 (C) 1996, 1999 Vassili Bykov
-- See `documentation' protocol of RxParser class for user's guide.
--
This is a recursive regex matcher. Not strikingly efficient, but simple. Also, keeps track of matched subexpressions. The life cycle goes as follows:

1. Initialization. Accepts a syntax tree (presumably produced by RxParser) and compiles it into a matcher built of other classes in this category.

2. Matching. Accepts a stream or a string and returns a boolean indicating whether the whole stream or its prefix -- depending on the message sent -- matches the regex.

3. Subexpression query. After a successful match, and before any other match, the matcher may be queried about the range of specific stream (string) positions that matched to certain parenthesized subexpressions of the original expression.

Any number of queries may follow a successful match, and any number of matches may follow a successful initialization.

Note that `matcher' is actually a sort of a misnomer. The actual matcher is a web of Rxm* instances built by RxMatcher during initialization. RxMatcher is just the interface facade of this network. It is also a builder of it, and also provides a stream-like protocol to easily access the stream being matched.

Instance variables:
	matcher	The entry point into the actual matcher (the web of Rxm* instances).
	ignoreCase	<Boolean> Whether the matcher was built to ignore case.
	startOptimizer	<RxMatchOptimizer> Consulted before each match attempt to rule out impossible start positions.
	stream	<Stream> The stream currently being matched against.
	markerPositions	<Array> Positions of markers' matches.
	markerCount	<Integer> Number of markers.
	lastResult	<Boolean> Whether the latest match attempt succeeded or not.
	lastChar	<Character | nil> character last seen in the matcher stream!

!RxMatcher class methodsFor: 'instance creation' stamp: ''!
for: aRegex
	"Answer a case-sensitive matcher for the regular expression whose syntax tree has `aRegex' as its root."

	^self for: aRegex ignoreCase: false! !

!RxMatcher class methodsFor: 'instance creation' stamp: ''!
for: aRegex ignoreCase: aBoolean
	"Answer what a new instance answers to #initialize:ignoreCase: for the syntax tree rooted at `aRegex'."

	| newMatcher |
	newMatcher := self new.
	^newMatcher initialize: aRegex ignoreCase: aBoolean! !

!RxMatcher class methodsFor: 'instance creation' stamp: ''!
forString: aString
	"Parse the regular expression `aString' and answer a matcher for it."

	| syntaxTree |
	syntaxTree := RxParser new parse: aString.
	^self for: syntaxTree! !

!RxMatcher class methodsFor: 'instance creation' stamp: ''!
forString: aString ignoreCase: aBoolean
	"Parse the regular expression `aString' and answer a matcher for it, with case sensitivity chosen by aBoolean."

	| syntaxTree |
	syntaxTree := RxParser new parse: aString.
	^self for: syntaxTree ignoreCase: aBoolean! !

!RxMatcher class methodsFor: 'class initialization' stamp: 'avi 11/30/2003 13:30'!
initialize
	"Cache the line-end characters in class variables."
	"RxMatcher initialize"

	Lf := Character lf.
	Cr := Character cr.! !

!RxMatcher methodsFor: 'private' stamp: ''!
allocateMarker
	"Bump the marker count and answer it as the index of the next marker."

	^markerCount := markerCount + 1! !

!RxMatcher methodsFor: 'testing' stamp: ''!
atBeginningOfLine
	"True at stream position 0 or right after a Cr."

	self position = 0 ifTrue: [^true].
	^lastChar = Cr! !

!RxMatcher methodsFor: 'testing' stamp: ''!
atBeginningOfWord
	"True when the last character seen is not a word character and the upcoming one is."

	(self isWordChar: lastChar) ifTrue: [^false].
	^self isWordChar: stream peek! !

!RxMatcher methodsFor: 'streaming' stamp: ''!
atEnd
	"Answer whether the stream being matched is at its end."

	^stream atEnd! !

!RxMatcher methodsFor: 'testing' stamp: ''!
atEndOfLine
	"True at the end of the stream or right before a Cr."

	self atEnd ifTrue: [^true].
	^stream peek = Cr! !

!RxMatcher methodsFor: 'testing' stamp: ''!
atEndOfWord
	"True when the last character seen is a word character and the upcoming one is not."

	(self isWordChar: lastChar) ifFalse: [^false].
	^(self isWordChar: stream peek) not! !

!RxMatcher methodsFor: 'testing' stamp: ''!
atWordBoundary
	"True when exactly one of the last character seen and the upcoming character is a word character."

	| wordBefore wordAfter |
	wordBefore := self isWordChar: lastChar.
	wordAfter := self isWordChar: stream peek.
	^wordBefore xor: wordAfter! !

!RxMatcher methodsFor: 'accessing' stamp: ''!
buildFrom: aSyntaxTreeRoot
	"Private - Entry point of matcher build process. Builds the matcher network by double dispatch on the syntax tree and terminates it."

	markerCount := 0.	"must go before #dispatchTo: !!"
	matcher := aSyntaxTreeRoot dispatchTo: self.
	matcher terminateWith: RxmTerminator new! !

!RxMatcher methodsFor: 'match enumeration' stamp: ''!
copy: aString replacingMatchesWith: replacementString
	"Copy <aString>, except for the matches. Replace each match with <replacementString>. Answer the resulting string."

	| answer |
	answer := (String new: 40) writeStream.
	self
		copyStream: aString readStream
		to: answer
		replacingMatchesWith: replacementString.
	^answer contents! !

!RxMatcher methodsFor: 'match enumeration' stamp: ''!
copy: aString translatingMatchesUsing: aBlock
	"Copy <aString>, except for the matches. For each match, evaluate <aBlock> passing the matched substring as the argument. Expect the block to answer a String, and replace the match with the answer. Answer the resulting string."

	| answer |
	answer := (String new: 40) writeStream.
	self
		copyStream: aString readStream
		to: answer
		translatingMatchesUsing: aBlock.
	^answer contents! !

!RxMatcher methodsFor: 'match enumeration' stamp: ''!
copyStream: aStream to: writeStream replacingMatchesWith: aString
	"Copy the contents of <aStream> on the <writeStream>, except for the matches. Replace each match with <aString>.
	A match that consumes no input (possible with expressions such as a*) would make the search restart at the same position forever. After such a match one character is copied through before searching again, or the copy ends if the stream is exhausted."

	| searchStart matchStart matchEnd |
	stream := aStream.
	lastChar := nil.
	
	[searchStart := aStream position.
	self proceedSearchingStream: aStream]
		whileTrue: 
			[matchStart := self subBeginning: 1.
			matchEnd := self subEnd: 1.
			aStream position: searchStart.
			searchStart to: matchStart - 1 do: [:ignoredPos | writeStream nextPut: aStream next].
			writeStream nextPutAll: aString.
			aStream position: matchEnd.
			matchEnd = matchStart
				ifTrue: 
					["Empty match: nothing is left to copy once at the end."
					aStream atEnd ifTrue: [^self].
					"Keep lastChar in step, as the search loop does when it skips a character."
					lastChar := aStream next.
					writeStream nextPut: lastChar]].
	aStream position: searchStart.
	[aStream atEnd] whileFalse: [writeStream nextPut: aStream next]! !

!RxMatcher methodsFor: 'match enumeration' stamp: ''!
copyStream: aStream to: writeStream translatingMatchesUsing: aBlock
	"Copy the contents of <aStream> on the <writeStream>, except for the matches. For each match, evaluate <aBlock> passing the matched substring as the argument. Expect the block to answer a String, and write the answer to <writeStream> in place of the match.
	A match that consumes no input would make the search restart at the same position forever. After such a match one character is copied through before searching again, or the copy ends if the stream is exhausted."

	| searchStart matchStart matchEnd match |
	stream := aStream.
	lastChar := nil.
	
	[searchStart := aStream position.
	self proceedSearchingStream: aStream]
		whileTrue: 
			[matchStart := self subBeginning: 1.
			matchEnd := self subEnd: 1.
			aStream position: searchStart.
			searchStart to: matchStart - 1 do: [:ignoredPos | writeStream nextPut: aStream next].
			match := (String new: matchEnd - matchStart + 1) writeStream.
			matchStart to: matchEnd - 1 do: [:ignoredPos | match nextPut: aStream next].
			writeStream nextPutAll: (aBlock value: match contents).
			"Reading the match above left the stream at matchEnd; make that explicit in case the block moved it."
			aStream position: matchEnd.
			matchEnd = matchStart
				ifTrue: 
					["Empty match: nothing is left to copy once at the end."
					aStream atEnd ifTrue: [^self].
					"Keep lastChar in step, as the search loop does when it skips a character."
					lastChar := aStream next.
					writeStream nextPut: lastChar]].
	aStream position: searchStart.
	[aStream atEnd] whileFalse: [writeStream nextPut: aStream next]! !

!RxMatcher methodsFor: 'privileged' stamp: ''!
currentState
	"Answer an opaque object that can later be used to restore the matcher's state (for backtracking). The object is a block that resets the stream position and lastChar to their values at the time of this call."

	| origPosition origLastChar |
	origPosition := stream position.
	origLastChar := lastChar.
	^	[stream position: origPosition.
		lastChar := origLastChar]! !

!RxMatcher methodsFor: 'private' stamp: ''!
hookBranchOf: regexNode onto: endMarker
	"Private - Recurse down the chain of regexes starting at regexNode, compiling their branches and hooking their tails to the endMarker node."

	| rest |
	rest := regexNode regex isNil
		ifTrue: [nil]
		ifFalse: [self hookBranchOf: regexNode regex onto: endMarker].
	^RxmBranch new
		next: ((regexNode branch dispatchTo: self)
					pointTailTo: endMarker;
					yourself);
		alternative: rest;
		yourself! !

!RxMatcher methodsFor: 'initialize-release' stamp: ''!
initialize: syntaxTreeRoot ignoreCase: aBoolean
	"Compile thyself for the regex with the specified syntax tree.
	See comment and `building' protocol in this class and #dispatchTo: methods in syntax tree components for details on double-dispatch building.
	The argument is supposedly a RxsRegex."

	ignoreCase := aBoolean.
	self buildFrom: syntaxTreeRoot.
	startOptimizer := RxMatchOptimizer new initialize: syntaxTreeRoot ignoreCase: aBoolean! !
!RxMatcher methodsFor: 'private' stamp: ''!
isWordChar: aCharacterOrNil
	"Answer whether the argument is a word constituent character. nil is not one; any other argument is one when it answers true to #isAlphaNumeric.
	NOTE(review): the previous comment read 'alphanumeric or :=', which looks like a mangled underscore. The code does not treat underscore specially -- confirm the intended behavior."

	^aCharacterOrNil ~~ nil and: [aCharacterOrNil isAlphaNumeric]! !

!RxMatcher methodsFor: 'accessing' stamp: ''!
lastResult
	"Answer whether the latest match attempt succeeded (as recorded by #tryMatch)."

	^lastResult! !

!RxMatcher methodsFor: 'private' stamp: ''!
makeOptional: aMatcher
	"Private - Wrap this matcher so that the result would match 0 or 1 occurrences of the matcher."

	| dummy branch |
	dummy := RxmLink new.
	branch := (RxmBranch new beLoopback)
		next: aMatcher;
		alternative: dummy.
	aMatcher pointTailTo: dummy.
	^branch! !

!RxMatcher methodsFor: 'private' stamp: ''!
makePlus: aMatcher
	"Private - Wrap this matcher so that the result would match 1 and more occurrences of the matcher."

	| loopback |
	loopback := (RxmBranch new beLoopback)
		next: aMatcher.
	aMatcher pointTailTo: loopback.
	^aMatcher! !

!RxMatcher methodsFor: 'private' stamp: ''!
makeStar: aMatcher
	"Private - Wrap this matcher so that the result would match 0 and more occurrences of the matcher."

	| dummy detour loopback |
	dummy := RxmLink new.
	detour := RxmBranch new
		next: aMatcher;
		alternative: dummy.
	loopback := (RxmBranch new beLoopback)
		next: aMatcher;
		alternative: dummy.
	aMatcher pointTailTo: loopback.
	^detour! !

!RxMatcher methodsFor: 'privileged' stamp: ''!
markerPositionAt: anIndex
	"Answer the recorded position of the marker with the given index (nil if not set in the current attempt)."

	^markerPositions at: anIndex! !

!RxMatcher methodsFor: 'privileged' stamp: ''!
markerPositionAt: anIndex maybePut: position
	"Set position of the given marker, if not already set."

	(markerPositions at: anIndex) == nil
		ifTrue: [markerPositions at: anIndex put: position]! !

!RxMatcher methodsFor: 'accessing' stamp: ''!
matches: aString
	"Match against a string. Answer whether the whole string matches."

	^self matchesStream: aString readStream! !

!RxMatcher methodsFor: 'match enumeration' stamp: ''!
matchesIn: aString
	"Search aString repeatedly for the matches of the receiver. Answer an OrderedCollection of all matches (substrings)."

	^self matchesOnStream: aString readStream! !

!RxMatcher methodsFor: 'match enumeration' stamp: 'damien.pollet 5/2/2009 23:59'!
matchesIn: aString collect: aBlock
	"Search aString repeatedly for the matches of the receiver. Evaluate aBlock for each match passing the matched substring as the argument, collect evaluation results in an OrderedCollection, and return it. The following example shows how to use this message to split a string into words."
	"'\w+' asRegex matchesIn: 'Now is the Time' collect: [:each | each asLowercase]"

	^self matchesOnStream: aString readStream collect: aBlock! !

!RxMatcher methodsFor: 'match enumeration' stamp: ''!
matchesIn: aString do: aBlock
	"Search aString repeatedly for the matches of the receiver. Evaluate aBlock for each match passing the matched substring as the argument."

	self matchesOnStream: aString readStream do: aBlock! !

!RxMatcher methodsFor: 'match enumeration' stamp: ''!
matchesOnStream: aStream
	"Search aStream repeatedly for the matches of the receiver. Answer an OrderedCollection of all matches (substrings)."

	| result |
	result := OrderedCollection new.
	self matchesOnStream: aStream do: [:match | result add: match].
	^result! !

!RxMatcher methodsFor: 'match enumeration' stamp: ''!
matchesOnStream: aStream collect: aBlock
	"Search aStream repeatedly for the matches of the receiver. Answer an OrderedCollection of the values of aBlock for each matched substring."

	| result |
	result := OrderedCollection new.
	self matchesOnStream: aStream do: [:match | result add: (aBlock value: match)].
	^result! !

!RxMatcher methodsFor: 'match enumeration' stamp: ''!
matchesOnStream: aStream do: aBlock
	"Search aStream repeatedly for the matches of the receiver. Evaluate aBlock for each match passing the matched substring as the argument.
	A match that consumes no input (possible with expressions such as a*) leaves the stream where the search began, so searching again would find the same empty match forever. After such a match, skip one character, or stop if the stream is exhausted."

	| emptyMatch |
	[self searchStream: aStream]
		whileTrue: 
			["Decide before running aBlock, which may use the receiver for other matches."
			emptyMatch := (self subBeginning: 1) = (self subEnd: 1).
			aBlock value: (self subexpression: 1).
			emptyMatch
				ifTrue: 
					[aStream atEnd ifTrue: [^self].
					aStream next]]! !

!RxMatcher methodsFor: 'accessing' stamp: ''!
matchesPrefix: aString
	"Match against a string. Answer whether some prefix of the string matches."

	^self matchesStreamPrefix: aString readStream! !

!RxMatcher methodsFor: 'accessing' stamp: ''!
matchesStream: theStream
	"Match thyself against a positionable stream. Answer true only when a prefix matches and the match leaves the stream at its end."

	^(self matchesStreamPrefix: theStream) and: [stream atEnd]! !

!RxMatcher methodsFor: 'accessing' stamp: ''!
matchesStreamPrefix: theStream
	"Make theStream the stream being matched, forget the last character seen, and answer whether a match starts at the stream's current position."

	lastChar := nil.
	stream := theStream.
	^self tryMatch! !

!RxMatcher methodsFor: 'streaming' stamp: ''!
next
	"Consume one character from the stream, remember it as the last character seen, and answer it."

	^lastChar := stream next! !

!RxMatcher methodsFor: 'testing' stamp: ''!
notAtWordBoundary
	"Answer the negation of #atWordBoundary."

	^self atWordBoundary not! !

!RxMatcher methodsFor: 'streaming' stamp: ''!
position
	"Answer the current position of the stream being matched."

	^stream position! !

!RxMatcher methodsFor: 'private' stamp: ''!
proceedSearchingStream: aStream
	"Try a match at each position from the current one to the end of aStream, inclusive. Answer true at the first success, leaving the stream where that match ended; answer false when no position matches. Unlike #searchStream:, neither the stream variable nor lastChar is reset first."

	| retryPosition |
	retryPosition := aStream position.
	[aStream atEnd]
		whileFalse: 
			[self tryMatch ifTrue: [^true].
			"Undo whatever the failed attempt consumed, then step over one character."
			aStream position: retryPosition.
			lastChar := aStream next.
			retryPosition := aStream position].
	"Try match at the very stream end too!!"
	^self tryMatch! !

!RxMatcher methodsFor: 'privileged' stamp: ''!
restoreState: aBlock
	"Restore a state previously answered by #currentState, by evaluating it."

	aBlock value! !

!RxMatcher methodsFor: 'accessing' stamp: ''!
search: aString
	"Search the string for occurrence of something matching myself. Answer a Boolean indicating success."

	| input |
	input := aString readStream.
	^self searchStream: input! !

!RxMatcher methodsFor: 'accessing' stamp: ''!
searchStream: aStream
	"Search the stream for occurrence of something matching myself. After the search has occurred, stop positioned after the end of the matched substring. Answer a Boolean indicating success."

	stream := aStream.
	lastChar := nil.
	"The scan itself is the same loop #proceedSearchingStream: runs."
	^self proceedSearchingStream: aStream! !

!RxMatcher methodsFor: 'accessing' stamp: ''!
subBeginning: subIndex
	"Answer the recorded start position of subexpression subIndex (odd-numbered marker)."

	| markerIndex |
	markerIndex := subIndex * 2 - 1.
	^markerPositions at: markerIndex! !

!RxMatcher methodsFor: 'accessing' stamp: ''!
subEnd: subIndex
	"Answer the recorded end position of subexpression subIndex (even-numbered marker)."

	| markerIndex |
	markerIndex := subIndex * 2.
	^markerPositions at: markerIndex! !

!RxMatcher methodsFor: 'accessing' stamp: ''!
subexpression: subIndex
	"Answer the text matched by subexpression subIndex, read from the stream between its recorded positions. Answer an empty String when either position is not recorded. The stream position is restored afterwards."

	| savedPosition from to text |
	savedPosition := stream position.
	from := self subBeginning: subIndex.
	to := self subEnd: subIndex.
	(from isNil or: [to isNil]) ifTrue: [^String new].
	text := (String new: to - from) writeStream.
	stream position: from.
	from to: to - 1 do: [:ignored | text nextPut: stream next].
	stream position: savedPosition.
	^text contents! !

!RxMatcher methodsFor: 'accessing' stamp: ''!
subexpressionCount
	"Answer the number of subexpressions: two markers were allocated per subexpression."

	^markerCount // 2! !

!RxMatcher methodsFor: 'accessing' stamp: 'damien.pollet 5/2/2009 23:51'!
subexpressions
	"Answer an Array holding the text of every subexpression, in index order."

	| texts |
	texts := Array new: self subexpressionCount.
	1 to: self subexpressionCount do: [:each | 
		texts at: each put: (self subexpression: each)].
	^texts! !

!RxMatcher methodsFor: 'match enumeration' stamp: 'damien.pollet 5/3/2009 00:03'!
submatchesIn: aString
	"Search aString repeatedly for the matches of the receiver. Answer an OrderedCollection with an array of subexpressions per match."

	| found |
	found := OrderedCollection new.
	self submatchesOnStream: aString readStream do: [:subexprs | found add: subexprs].
	^found! !

!RxMatcher methodsFor: 'match enumeration' stamp: 'damien.pollet 5/3/2009 00:04'!
submatchesIn: aString collect: aBlock
	"Search aString repeatedly for the matches of the receiver. Evaluate aBlock for each match passing the collection of matched subexpressions as the argument, collecting evaluation results in an OrderedCollection."

	| found |
	found := OrderedCollection new.
	self submatchesOnStream: aString readStream do: [:subexprs | found add: (aBlock value: subexprs)].
	^found! !

!RxMatcher methodsFor: 'match enumeration' stamp: 'damien.pollet 5/3/2009 00:04'!
submatchesIn: aString do: aBlock
	"Search aString repeatedly for the matches of the receiver. Evaluate aBlock for each match passing the collection of matched subexpressions as the argument."

	| input |
	input := aString readStream.
	self submatchesOnStream: input do: aBlock! !

!RxMatcher methodsFor: 'match enumeration' stamp: 'damien.pollet 5/2/2009 23:55'!
submatchesOnStream: aStream do: aBlock [self searchStream: aStream] whileTrue: [aBlock value: self subexpressions]! ! !RxMatcher methodsFor: 'testing' stamp: ''! supportsSubexpressions ^true! ! !RxMatcher methodsFor: 'double dispatch' stamp: ''! syntaxAny "Double dispatch from the syntax tree. Create a matcher for any non-whitespace character." ^RxmPredicate new predicate: [:char | (Cr = char or: [Lf = char]) not]! ! !RxMatcher methodsFor: 'double dispatch' stamp: ''! syntaxBeginningOfLine "Double dispatch from the syntax tree. Create a matcher for beginning-of-line condition." ^RxmSpecial new beBeginningOfLine! ! !RxMatcher methodsFor: 'double dispatch' stamp: ''! syntaxBeginningOfWord "Double dispatch from the syntax tree. Create a matcher for beginning-of-word condition." ^RxmSpecial new beBeginningOfWord! ! !RxMatcher methodsFor: 'double dispatch' stamp: 'PeterHugossonMiller 9/3/2009 11:08'! syntaxBranch: branchNode "Double dispatch from the syntax tree. Branch node is a link in a chain of concatenated pieces. First build the matcher for the rest of the chain, then make it for the current piece and hook the rest to it." | result next rest | branchNode branch isNil ifTrue: [^branchNode piece dispatchTo: self]. "Optimization: glue a sequence of individual characters into a single string to match." branchNode piece isAtomic ifTrue: [result := (String new: 40) writeStream. next := branchNode tryMergingInto: result. result := result contents. result size > 1 ifTrue: "worth merging" [rest := next notNil ifTrue: [next dispatchTo: self] ifFalse: [nil]. ^(RxmSubstring new substring: result ignoreCase: ignoreCase) pointTailTo: rest; yourself]]. "No optimization possible or worth it, just concatenate all. " ^(branchNode piece dispatchTo: self) pointTailTo: (branchNode branch dispatchTo: self); yourself! ! !RxMatcher methodsFor: 'double dispatch' stamp: ''! syntaxCharSet: charSetNode "Double dispatch from the syntax tree. 
A character set is a few characters, and we either match any of them, or match any that is not one of them." ^RxmPredicate with: charSetNode predicate! ! !RxMatcher methodsFor: 'double dispatch' stamp: ''! syntaxCharacter: charNode "Double dispatch from the syntax tree. We get here when no merging characters into strings was possible." | wanted | wanted := charNode character. ^RxmPredicate new predicate: (ignoreCase ifTrue: [[:char | char sameAs: wanted]] ifFalse: [[:char | char = wanted]])! ! !RxMatcher methodsFor: 'double dispatch' stamp: ''! syntaxEndOfLine "Double dispatch from the syntax tree. Create a matcher for end-of-line condition." ^RxmSpecial new beEndOfLine! ! !RxMatcher methodsFor: 'double dispatch' stamp: ''! syntaxEndOfWord "Double dispatch from the syntax tree. Create a matcher for end-of-word condition." ^RxmSpecial new beEndOfWord! ! !RxMatcher methodsFor: 'double dispatch' stamp: ''! syntaxEpsilon "Double dispatch from the syntax tree. Match empty string. This is unlikely to happen in sane expressions, so we'll live without special epsilon-nodes." ^RxmSubstring new substring: String new ignoreCase: ignoreCase! ! !RxMatcher methodsFor: 'double dispatch' stamp: ''! syntaxMessagePredicate: messagePredicateNode "Double dispatch from the syntax tree. Special link can handle predicates." ^messagePredicateNode negated ifTrue: [RxmPredicate new bePerformNot: messagePredicateNode selector] ifFalse: [RxmPredicate new bePerform: messagePredicateNode selector]! ! !RxMatcher methodsFor: 'double dispatch' stamp: ''! syntaxNonWordBoundary "Double dispatch from the syntax tree. Create a matcher for the not-at-a-word-boundary condition." ^RxmSpecial new beNotWordBoundary! ! !RxMatcher methodsFor: 'double dispatch' stamp: ''! syntaxPiece: pieceNode "Double dispatch from the syntax tree. Piece is an atom repeated a few times. Take care of a special case when the atom is repeated just once." | atom | atom := pieceNode atom dispatchTo: self.
^pieceNode isSingular ifTrue: [atom] ifFalse: [pieceNode isStar ifTrue: [self makeStar: atom] ifFalse: [pieceNode isPlus ifTrue: [self makePlus: atom] ifFalse: [pieceNode isOptional ifTrue: [self makeOptional: atom] ifFalse: [RxParser signalCompilationException: 'repetitions are not supported by RxMatcher']]]]! ! !RxMatcher methodsFor: 'double dispatch' stamp: ''! syntaxPredicate: predicateNode "Double dispatch from the syntax tree. Answer a predicate link built from the predicate carried by predicateNode." ^RxmPredicate with: predicateNode predicate! ! !RxMatcher methodsFor: 'double dispatch' stamp: ''! syntaxRegex: regexNode "Double dispatch from the syntax tree. Regex node is a chain of branches to be tried. Should compile this into a bundle of parallel branches, between two marker nodes." | startIndex endIndex endNode alternatives | startIndex := self allocateMarker. endIndex := self allocateMarker. endNode := RxmMarker new index: endIndex. alternatives := self hookBranchOf: regexNode onto: endNode. ^(RxmMarker new index: startIndex) pointTailTo: alternatives; yourself! ! !RxMatcher methodsFor: 'double dispatch' stamp: ''! syntaxWordBoundary "Double dispatch from the syntax tree. Create a matcher for the word boundary condition." ^RxmSpecial new beWordBoundary! ! !RxMatcher methodsFor: 'private' stamp: ''! tryMatch "Match thyself against the current stream. Reset markerPositions, run the compiled matcher (only after startOptimizer, when there is one, accepts the next stream character), remember the outcome in lastResult and answer it." markerPositions := Array new: markerCount. startOptimizer == nil ifTrue: [lastResult := matcher matchAgainst: self] ifFalse: [lastResult := (startOptimizer canStartMatch: stream peek in: self) and: [matcher matchAgainst: self]]. ^lastResult! ! Object subclass: #RxParser instanceVariableNames: 'input lookahead' classVariableNames: 'BackslashConstants BackslashSpecials ExceptionObjects' poolDictionaries: '' category: 'VB-Regex'! !RxParser commentStamp: '' prior: 0!
-- Regular Expression Matcher v 1.1 (C) 1996, 1999 Vassili Bykov -- See `documentation' protocol of RxParser class for user's guide. -- The regular expression parser. Translates a regular expression read from a stream into a parse tree. ('accessing' protocol). The tree can later be passed to a matcher initialization method. All other classes in this category implement the tree. Refer to their comments for any details. Instance variables: input A stream with the regular expression being parsed. lookahead ! !RxParser class methodsFor: 'DOCUMENTATION' stamp: ''! a:x introduction:xx " A regular expression is a template specifying a class of strings. A regular expression matcher is a tool that determines whether a string belongs to a class specified by a regular expression. This is a common task of user input validation code, and the use of regular expressions can GREATLY simplify and speed up development of such code. As an example, here is how to verify that a string is a valid hexadecimal number in Smalltalk notation, using this matcher package: aString matchesRegex: '16r[[:xdigit:]]+' (Coding the same ``the hard way'' is an exercise to a curious reader). This matcher is offered to the Smalltalk community in hope it will be useful. It is free in terms of money, and to a large extent--in terms of rights of use. Refer to `Boring Stuff' section for legalese. The 'What's new in this release' section describes the functionality introduced in 1.1 release. The `Syntax' section explains the recognized syntax of regular expressions. The `Usage' section explains matcher capabilities that go beyond what String>>matchesRegex: method offers. The `Implementation notes' section says a few words about what is under the hood. Happy hacking, --Vassili Bykov August 6, 1996 April 4, 1999 " self error: 'comment only'! ! !RxParser class methodsFor: 'DOCUMENTATION' stamp: ''!
b:x whatsNewInThisRelease: xx " VERSION 1.1 (October 1999) Regular expression syntax corrections and enhancements: 1. Backslash escapes similar to those in Perl are allowed in patterns: \w any word constituent character (equivalent to [a-zA-Z0-9:=]) \W any character but a word constituent (equivalent to [^a-zA-Z0-9:=]) \d a digit (same as [0-9]) \D anything but a digit \s a whitespace character \S anything but a whitespace character \b an empty string at a word boundary \B an empty string not at a word boundary \< an empty string at the beginning of a word \> an empty string at the end of a word For example, '\w+' is now a valid expression matching any word. 2. The following backslash escapes are also allowed in character sets (between square brackets): \w, \W, \d, \D, \s, and \S. 3. The following grep(1)-compatible named character classes are recognized in character sets as well: [:alnum:] [:alpha:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:] For example, the following patterns are equivalent: '[[:alnum:]]+' '\w+' '[\w]+' '[a-zA-Z0-9:=]+' 4. Some non-printable characters can be represented in regular expressions using a common backslash notation: \t tab (Character tab) \n newline (Character lf) \r carriage return (Character cr) \f form feed (Character newPage) \e escape (Character escape) 5. A dot is correctly interpreted as 'any character but a newline' instead of 'anything but whitespace'. 6. Case-insensitive matching. The easiest access to it is through the new messages CharacterArray understands: #asRegexIgnoringCase, #matchesRegexIgnoringCase:, #prefixMatchesRegexIgnoringCase:. 7. The matcher (an instance of RxMatcher, the result of String>>asRegex) now provides a collection-like interface to matches in a particular string or on a particular stream, as well as substitution protocol.
The interface includes the following messages: matchesIn: aString matchesIn: aString collect: aBlock matchesIn: aString do: aBlock matchesOnStream: aStream matchesOnStream: aStream collect: aBlock matchesOnStream: aStream do: aBlock copy: aString translatingMatchesUsing: aBlock copy: aString replacingMatchesWith: replacementString copyStream: aStream to: writeStream translatingMatchesUsing: aBlock copyStream: aStream to: writeStream replacingMatchesWith: aString Examples: '\w+' asRegex matchesIn: 'now is the time' returns an OrderedCollection containing four strings: 'now', 'is', 'the', and 'time'. '\= 32. [:lower:] any lowercase character [:print:] any printable character. In this version, this is the same as [:cntrl:] [:punct:] any punctuation character. [:space:] any whitespace character. [:upper:] any uppercase character. [:xdigit:] any hexadecimal character. Note that these elements are components of the character classes, i.e. they have to be enclosed in an extra set of square brackets to form a valid regular expression. For example, a non-empty string of digits would be represented as '[[:digit:]]+'. The above primitive expressions and operators are common to many implementations of regular expressions. The next primitive expression is unique to this Smalltalk implementation. A sequence of characters between colons is treated as a unary selector which is supposed to be understood by Characters. A character matches such an expression if it answers true to a message with that selector. This allows a more readable and efficient way of specifying character classes. For example, `[0-9]' is equivalent to `:isDigit:', but the latter is more efficient. Analogously to character sets, character classes can be negated: `:^isDigit:' matches a Character that answers false to #isDigit, and is therefore equivalent to `[^0-9]'. 
As an example, so far we have seen the following equivalent ways to write a regular expression that matches a non-empty string of digits: '[0-9]+' '\d+' '[\d]+' '[[:digit:]]+' ':isDigit:+' The last group of special primitive expressions includes: . matching any character except a newline; ^ matching an empty string at the beginning of a line; $ matching an empty string at the end of a line. \b an empty string at a word boundary \B an empty string not at a word boundary \< an empty string at the beginning of a word \> an empty string at the end of a word 'axyzb' matchesRegex: 'a.+b' -- true 'ax zb' matchesRegex: 'a.+b' -- true (a space is matched by `.'; only a newline is not) Again, all the above three characters are special and should be quoted to be matched literally. EXAMPLES As the introduction said, a great use for regular expressions is user input validation. Following are a few examples of regular expressions that might be handy in checking input entered by the user in an input field. Try them out by entering something between the quotes and print-iting. (Also, try to imagine Smalltalk code that each validation would require if coded by hand). Most example expressions could have been written in alternative ways. Checking if aString may represent a nonnegative integer number: '' matchesRegex: ':isDigit:+' or '' matchesRegex: '[0-9]+' or '' matchesRegex: '\d+' Checking if aString may represent an integer number with an optional sign in front: '' matchesRegex: '(\+|-)?\d+' Checking if aString is a fixed-point number, with at least one digit required after the dot: '' matchesRegex: '(\+|-)?\d+(\.\d+)?' The same, but allow notation like `123.': '' matchesRegex: '(\+|-)?\d+(\.\d*)?' Recognizer for a string that might be a name: one word with first capital letter, no blanks, no digits.
More traditional: '' matchesRegex: '[A-Z][A-Za-z]*' more Smalltalkish: '' matchesRegex: ':isUppercase::isAlphabetic:*' A date in format MMM DD, YYYY with any number of spaces in between, in XX century: '' matchesRegex: '(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[ ]+(\d\d?)[ ]*,[ ]*19(\d\d)' Note parentheses around some components of the expression above. As `Usage' section shows, they will allow us to obtain the actual strings that have matched them (i.e. month name, day number, and year number). For dessert, coming back to numbers: here is a recognizer for a general number format: anything like 999, or 999.999, or -999.999e+21. '' matchesRegex: '(\+|-)?\d+(\.\d*)?((e|E)(\+|-)?\d+)?' " self error: 'comment only'! ! !RxParser class methodsFor: 'test suite' stamp: ''! compileRegex: regexSource into: matcherClass "Parse regexSource with #safelyParse: and answer the matcher that matcherClass builds from the resulting syntax tree. Answer nil when #safelyParse: answers nil, which it does on a syntax error; errors raised while building the matcher are not trapped here." | syntaxTree | syntaxTree := self safelyParse: regexSource. syntaxTree == nil ifTrue: [^nil]. ^matcherClass for: syntaxTree! ! !RxParser class methodsFor: 'DOCUMENTATION' stamp: 'sd 1/14/2008 10:22'! d:x usage:xx " The preceding section covered the syntax of regular expressions. It used the simplest possible interface to the matcher: sending #matchesRegex: message to the sample string, with regular expression string as the argument. This section explains hairier ways of using the matcher. PREFIX MATCHING AND CASE-INSENSITIVE MATCHING A CharacterArray (an EsString in VA) also understands these messages: #prefixMatchesRegex: regexString #matchesRegexIgnoringCase: regexString #prefixMatchesRegexIgnoringCase: regexString #prefixMatchesRegex: is just like #matchesRegex:, except that the whole receiver is not expected to match the regular expression passed as the argument; matching just a prefix of it is enough. For example: 'abcde' matchesRegex: '(a|b)+' -- false 'abcde' prefixMatchesRegex: '(a|b)+' -- true The last two messages are case-insensitive versions of matching.
ENUMERATION INTERFACE An application can be interested in all matches of a certain regular expression within a String. The matches are accessible using a protocol modelled after the familiar Collection-like enumeration protocol: #regex: regexString matchesDo: aBlock Evaluates the one-argument aBlock for every match of the regular expression within the receiver string. #regex: regexString matchesCollect: aBlock Evaluates the one-argument aBlock for every match of the regular expression within the receiver string. Collects results of evaluations and answers them as a SequenceableCollection. #allRegexMatches: regexString Returns a collection of all matches (substrings of the receiver string) of the regular expression. It is equivalent to `regexString asRegex matchesIn: aString', where aString is the receiver. REPLACEMENT AND TRANSLATION It is possible to replace all matches of a regular expression with a certain string using the message: #copyWithRegex: regexString matchesReplacedWith: aString For example: 'ab cd ab' copyWithRegex: '(a|b)+' matchesReplacedWith: 'foo' A more general substitution is match translation: #copyWithRegex: regexString matchesTranslatedUsing: aBlock This message evaluates a block passing it each match of the regular expression in the receiver string and answers a copy of the receiver with the block results spliced into it in place of the respective matches. For example: 'ab cd ab' copyWithRegex: '(a|b)+' matchesTranslatedUsing: [:each | each asUppercase] All messages of enumeration and replacement protocols perform a case-sensitive match. Case-insensitive versions are not provided as part of a CharacterArray protocol. Instead, they are accessible using the lower-level matching interface. LOWER-LEVEL INTERFACE Internally, #matchesRegex: works as follows: 1. A fresh instance of RxParser is created, and the regular expression string is passed to it, yielding the expression's syntax tree. 2. The syntax tree is passed as an initialization parameter to an instance of RxMatcher.
The instance sets up some data structure that will work as a recognizer for the regular expression described by the tree. 3. The original string is passed to the matcher, and the matcher checks for a match. THE MATCHER If you repeatedly match a number of strings against the same regular expression using one of the messages defined in CharacterArray, the regular expression string is parsed and a matcher is created anew for every match. You can avoid this overhead by building a matcher for the regular expression, and then reusing the matcher over and over again. You can, for example, create a matcher at a class or instance initialization stage, and store it in a variable for future use. You can create a matcher using one of the following methods: - Sending #forString:ignoreCase: message to RxMatcher class, with the regular expression string and a Boolean indicating whether case is ignored as arguments. - Sending #forString: message. It is equivalent to <... forString: regexString ignoreCase: false>. A more convenient way is using one of the two matcher-creating messages understood by CharacterArray. - `regexString asRegex' is equivalent to `RxMatcher forString: regexString'. - `regexString asRegexIgnoringCase' is equivalent to `RxMatcher forString: regexString ignoreCase: true'. Here are four examples of creating a matcher: hexRecognizer := RxMatcher forString: '16r[0-9A-Fa-f]+' hexRecognizer := RxMatcher forString: '16r[0-9A-Fa-f]+' ignoreCase: false hexRecognizer := '16r[0-9A-Fa-f]+' asRegex hexRecognizer := '16r[0-9A-F]+' asRegexIgnoringCase MATCHING The matcher understands these messages (all of them return true to indicate successful match or search, and false otherwise): matches: aString True if the whole target string (aString) matches. matchesPrefix: aString True if some prefix of the string (not necessarily the whole string) matches. search: aString Search the string for the first occurrence of a matching substring. (Note that the first two methods only try matching from the very beginning of the string).
Using the above example with a matcher for `a+', this method would answer success given a string `baaa', while the previous two would fail. matchesStream: aStream matchesStreamPrefix: aStream searchStream: aStream Respective analogs of the first three methods, taking input from a stream instead of a string. The stream must be positionable and peekable. All these methods answer a boolean indicating success. The matcher also stores the outcome of the last match attempt and can report it: lastResult Answers a Boolean -- the outcome of the most recent match attempt. If no matches were attempted, the answer is unspecified. SUBEXPRESSION MATCHES After a successful match attempt, you can query the specifics of which part of the original string has matched which part of the whole expression. A subexpression is a parenthesized part of a regular expression, or the whole expression. When a regular expression is compiled, its subexpressions are assigned indices starting from 1, depth-first, left-to-right. For example, `((ab)+(c|d))?ef' includes the following subexpressions with these indices: 1: ((ab)+(c|d))?ef 2: (ab)+(c|d) 3: ab 4: c|d After a successful match, the matcher can report what part of the original string matched what subexpression. It understands these messages: subexpressionCount Answers the total number of subexpressions: the highest value that can be used as a subexpression index with this matcher. This value is available immediately after initialization and never changes. subexpression: anIndex An index must be a valid subexpression index, and this message must be sent only after a successful match attempt. The method answers a substring of the original string the corresponding subexpression has matched to. subBeginning: anIndex subEnd: anIndex Answer positions within the original string or stream where the match of a subexpression with the given index has started and ended, respectively.
This facility provides a convenient way of extracting parts of input strings of complex format. For example, the following piece of code uses the 'MMM DD, YYYY' date format recognizer example from the `Syntax' section to convert a date to a three-element array with year, month, and day strings (you can select and evaluate it right here): | matcher | matcher := RxMatcher forString: '(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[ ]+(:isDigit::isDigit:?)[ ]*,[ ]*19(:isDigit::isDigit:)'. (matcher matches: 'Aug 6, 1996') ifTrue: [Array with: (matcher subexpression: 4) with: (matcher subexpression: 2) with: (matcher subexpression: 3)] ifFalse: ['no match'] (should answer ` #('96' 'Aug' '6')'). ENUMERATION AND REPLACEMENT The enumeration and replacement protocols exposed in CharacterArray are actually implemented by the matcher. The following messages are understood: #matchesIn: aString #matchesIn: aString do: aBlock #matchesIn: aString collect: aBlock #copy: aString replacingMatchesWith: replacementString #copy: aString translatingMatchesUsing: aBlock #matchesOnStream: aStream #matchesOnStream: aStream do: aBlock #matchesOnStream: aStream collect: aBlock #copyStream: sourceStream to: targetStream replacingMatchesWith: replacementString #copyStream: sourceStream to: targetStream translatingMatchesUsing: aBlock ERROR HANDLING Exception signaling objects (Signals in VisualWorks, Exceptions in VisualAge) are accessible through RxParser class protocol. To handle possible errors, use the protocol described below to obtain the exception objects and use the protocol of the native Smalltalk implementation to handle them. If a syntax error is detected while parsing expression, RxParser>>syntaxErrorSignal is raised/signaled. If an error is detected while building a matcher, RxParser>>compilationErrorSignal is raised/signaled.
If an error is detected while matching (for example, if a bad selector was specified using `::' syntax, or because of the matcher's internal error), RxParser>>matchErrorSignal is raised. RxParser>>regexErrorSignal is the parent of all three. Since any of the three signals can be raised within a call to #matchesRegex:, it is handy if you want to catch them all. For example: VisualWorks: RxParser regexErrorSignal handle: [:ex | ex returnWith: nil] do: ['abc' matchesRegex: '))garbage['] VisualAge: ['abc' matchesRegex: '))garbage['] when: RxParser regexErrorSignal do: [:signal | signal exitWith: nil] " self error: 'comment only'! ! !RxParser class methodsFor: 'exception signaling' stamp: 'avi 11/30/2003 13:24'! doHandlingMessageNotUnderstood: aBlock "MNU should be trapped and resignaled as a match error in a few places in the matcher. This method factors out this dialect-dependent code to make porting easier. Evaluate aBlock and answer its value; if a MessageNotUnderstood is raised while it runs, signal a match error through the receiver's own #signalMatchException: instead." ^ aBlock on: MessageNotUnderstood do: [:ex | self signalMatchException: 'invalid predicate selector']! ! !RxParser class methodsFor: 'DOCUMENTATION' stamp: ''! e:x implementationNotes:xx " Version: 1.1 Released: October 1999 Mail to: Vassili Bykov , Flames to: /dev/null WHAT IS ADDED The matcher includes classes in two categories: VB-Regex-Syntax VB-Regex-Matcher and a few CharacterArray methods in `VB-regex' protocol. No system classes or methods are modified. WHAT TO LOOK AT FIRST String>>matchesRegex: -- in 90% cases this method is all you need to access the package. RxParser -- accepts a string or a stream of characters with a regular expression, and produces a syntax tree corresponding to the expression. The tree is made of instances of Rxs classes. RxMatcher -- accepts a syntax tree of a regular expression built by the parser and compiles it into a matcher: a structure made of instances of Rxm classes.
The RxMatcher instance can test whether a string or a positionable stream of characters matches the original regular expression, or search a string or a stream for substrings matching the expression. After a match is found, the matcher can report a specific string that matched the whole expression, or any parenthesized subexpression of it. All other classes support the above functionality and are used by RxParser, RxMatcher, or both. CAVEATS The matcher is similar in spirit, but NOT in the design--let alone the code--to the original Henry Spencer's regular expression implementation in C. The focus is on simplicity, not on efficiency. I didn't optimize or profile anything. I may in future--or I may not: I do this in my spare time and I don't promise anything. The matcher passes H. Spencer's test suite (see 'test suite' protocol), with quite a few extra tests added, so chances are good there are not too many bugs. But watch out anyway. EXTENSIONS, FUTURE, ETC. With the existing separation between the parser, the syntax tree, and the matcher, it is easy to extend the system with other matchers based on other algorithms. In fact, I have a DFA-based matcher right now, but I don't feel it is good enough to include it here. I might add automata-based matchers later, but again I don't promise anything. HOW TO REACH ME As of today (October 3, 1999), you can contact me at . If this doesn't work, look around comp.lang.smalltalk and comp.lang.lisp. " self error: 'comment only'! ! !RxParser class methodsFor: 'DOCUMENTATION' stamp: ''! f:x boringStuff: xx " The Regular Expression Matcher (``The Software'') is Copyright (C) 1996, 1999 Vassili Bykov. It is provided to the Smalltalk community in hope it will be useful. 1. This license applies to the package as a whole, as well as to any component of it. By performing any of the activities described below, you accept the terms of this agreement. 2. 
The software is provided free of charge, and ``as is'', in hope that it will be useful, with ABSOLUTELY NO WARRANTY. The entire risk and all responsibility for the use of the software is with you. Under no circumstances the author may be held responsible for loss of data, loss of profit, or any other damage resulting directly or indirectly from the use of the software, even if the damage is caused by defects in the software. 3. You may use this software in any applications you build. 4. You may distribute this software provided that the software documentation and copyright notices are included and intact. 5. You may create and distribute modified versions of the software, such as ports to other Smalltalk dialects or derived work, provided that: a. any modified version is expressly marked as such and is not misrepresented as the original software; b. credit is given to the original software in the source code and documentation of the derived work; c. the copyright notice at the top of this document accompanies copyright notices of any modified version. " self error: 'comment only'! ! !RxParser class methodsFor: 'class initialization' stamp: 'avi 11/30/2003 13:26'! initialize "Rebuild both backslash lookup tables (BackslashConstants and BackslashSpecials)." "self initialize" self initializeBackslashConstants; initializeBackslashSpecials! ! !RxParser class methodsFor: 'class initialization' stamp: 'avi 11/30/2003 13:27'! initializeBackslashConstants "Build the dictionary that maps the character following a backslash (e, n, r, f, t) to the literal character it stands for." "self initializeBackslashConstants" (BackslashConstants := Dictionary new) at: $e put: Character escape; at: $n put: Character lf; at: $r put: Character cr; at: $f put: Character newPage; at: $t put: Character tab! ! !RxParser class methodsFor: 'class initialization' stamp: ''! initializeBackslashSpecials "Keys are characters that normally follow a \, the values are associations of classes and initialization selectors on the instance side of the classes."
"self initializeBackslashSpecials" (BackslashSpecials := Dictionary new) at: $w put: (Association key: RxsPredicate value: #beWordConstituent); at: $W put: (Association key: RxsPredicate value: #beNotWordConstituent); at: $s put: (Association key: RxsPredicate value: #beSpace); at: $S put: (Association key: RxsPredicate value: #beNotSpace); at: $d put: (Association key: RxsPredicate value: #beDigit); at: $D put: (Association key: RxsPredicate value: #beNotDigit); at: $b put: (Association key: RxsContextCondition value: #beWordBoundary); at: $B put: (Association key: RxsContextCondition value: #beNonWordBoundary); at: $< put: (Association key: RxsContextCondition value: #beBeginningOfWord); at: $> put: (Association key: RxsContextCondition value: #beEndOfWord)! ! !RxParser class methodsFor: 'utilities' stamp: ''! parse: aString "Parse the argument and return the result (the parse tree). In case of a syntax error, the corresponding exception is signaled." ^self new parse: aString! ! !RxParser class methodsFor: 'preferences' stamp: ''! preferredMatcherClass "The matcher to use. For now just one is available, but in principle this determines the matchers built implicitly, such as by String>>asRegex, or String>>matchesRegex:. This might seem a bit strange place for this preference, but Parser is still more or less `central' thing in the whole package." ^RxMatcher! ! !RxParser class methodsFor: 'test suite' stamp: ''! runProtocolTestsForMatcher: matcherClass "Exercise #matchesIn:, #copy:translatingMatchesUsing: and #copy:replacingMatchesWith: on matchers built by matcherClass, raising an error at the first check that fails and logging progress on the Transcript." | matcher | Transcript show: 'Testing matcher protocol...'. matcher := matcherClass forString: '\w+'. (matcher matchesIn: 'now is the time') asArray = #('now' 'is' 'the' 'time') ifFalse: [self error: 'matchesIn: test failed']. (matcher copy: 'now is the time ' translatingMatchesUsing: [:s | s reverse]) = 'won si eht emit ' ifFalse: [self error: 'copy:translatingMatchesUsing: test failed'].
"See that the match context is preserved while copying stuff between matches:" ((matcherClass forString: '\<\d\D+') copy: '9aaa1bbb 8ccc' replacingMatchesWith: 'foo') = 'foo1bbb foo' ifFalse: [self error: 'test failed']. Transcript show: 'OK'; cr! ! !RxParser class methodsFor: 'test suite' stamp: ''! runRegexTestsForMatcher: matcherClass "Run the whole suite of tests for the given matcher class. May blow up if anything goes wrong with the matcher or parser. Since this is a developer's tool, who cares?" "self runRegexTestsForMatcher: RxMatcher" | failures | failures := 0. Transcript cr. self testSuite do: [:clause | | rxSource matcher isOK | rxSource := clause first. Transcript show: 'Testing regex: '; show: rxSource printString; cr. matcher := self compileRegex: rxSource into: matcherClass. "A nil second element marks a clause whose regex is expected not to compile." matcher == nil ifTrue: [(clause at: 2) isNil ifTrue: [Transcript tab; show: 'Compilation error as expected (ok)'; cr] ifFalse: [Transcript tab; show: 'Compilation error, UNEXPECTED -- FAILED'; cr. failures := failures + 1]] ifFalse: [(clause at: 2) == nil ifTrue: [Transcript tab; show: 'Compilation succeeded, should have failed -- FAILED!!'; cr. failures := failures + 1] ifFalse: ["The rest of the clause is a run of (test string, expected result, expected subexpressions) triples." 2 to: clause size by: 3 do: [:i | isOK := self test: matcher with: (clause at: i) expect: (clause at: i + 1) withSubexpressions: (clause at: i + 2). isOK ifFalse: [failures := failures + 1]. Transcript show: (isOK ifTrue: [' (ok).'] ifFalse: [' -- FAILED!!']); cr]]]]. failures = 0 ifTrue: [Transcript show: 'PASSED ALL TESTS.'; cr] ifFalse: [Transcript show: failures printString, ' TESTS FAILED!!'; cr]! ! !RxParser class methodsFor: 'test suite' stamp: ''! runTestsForMatcher: matcherClass "Run the whole suite of tests for the given matcher class. May blow up if something goes wrong with the matcher or the parser. Since this is a developer's tool, who cares?" "self runTestsForMatcher: RxMatcher" self runRegexTestsForMatcher: matcherClass; runProtocolTestsForMatcher: matcherClass! !
!RxParser class methodsFor: 'utilities' stamp: 'avi 11/30/2003 13:23'! safelyParse: aString "Parse the argument and return the result (the parse tree). In case of a syntax error, return nil. Exception handling here is dialect-dependent. Only RegexSyntaxError is trapped; any other exception propagates to the caller." ^ [self new parse: aString] on: RegexSyntaxError do: [:ex | nil]! ! !RxParser class methodsFor: 'exception signaling' stamp: 'avi 11/30/2003 13:25'! signalCompilationException: errorString "Signal a new RegexCompilationError with errorString as its message text." RegexCompilationError new signal: errorString! ! !RxParser class methodsFor: 'exception signaling' stamp: 'avi 11/30/2003 13:25'! signalMatchException: errorString "Signal a new RegexMatchingError with errorString as its message text." RegexMatchingError new signal: errorString! ! !RxParser class methodsFor: 'exception signaling' stamp: 'avi 11/30/2003 13:25'! signalSyntaxException: errorString "Signal a new RegexSyntaxError with errorString as its message text." RegexSyntaxError new signal: errorString! ! !RxParser class methodsFor: 'test suite' stamp: 'stephane.ducasse 4/13/2009 20:32'! test: aMatcher with: testString expect: expected withSubexpressions: subexpr "Search testString with aMatcher, log the outcome on the Transcript and answer whether it agrees with expected (the two are compared through asString). When subexpr is not nil and aMatcher supports subexpressions, subexpr is read as a flat sequence of (index, expected substring) pairs; each pair is compared with what #subexpression: answers, and the answer is false if any pair differs." | got | Transcript tab; show: 'Matching: '; show: testString printString; show: ' expected: '; show: expected printString; show: ' got: '. got := aMatcher search: testString. Transcript show: got printString. got asString ~= expected asString ifTrue: [^false]. (subexpr ~= nil and: [aMatcher supportsSubexpressions]) ifFalse: [^true] ifTrue: [ | isOK | isOK := true. 1 to: subexpr size by: 2 do: [:i | | sub subExpect subGot | sub := subexpr at: i. subExpect := subexpr at: i + 1. subGot := aMatcher subexpression: sub. Transcript cr; tab; tab; show: 'Subexpression: ', sub printString; show: ' expected: '; show: subExpect printString; show: ' got: '; show: subGot printString. subExpect ~= subGot ifTrue: [Transcript show: ' -- MISMATCH'. isOK := false]]. ^isOK]! ! !RxParser class methodsFor: 'test suite' stamp: ''! testSuite "Answer an array of test clauses. Each clause is an array with a regex source string followed by sequence of 3-tuples.
Each three-element group is one test to try against the regex, and includes: 1) test string; 2) expected result; 3) expected subexpression as an array of (index, substring), or nil. The test suite is based on the one in Henry Spencer's regexp.c package." ^#( ('abc' 'abc' true (1 'abc') 'xbc' false nil 'axc' false nil 'abx' false nil 'xabcy' true (1 'abc') 'ababc' true (1 'abc')) ('ab*c' 'abc' true (1 'abc')) ('ab*bc' 'abc' true (1 'abc') 'abbc' true (1 'abbc') 'abbbbc' true (1 'abbbbc')) ('ab+bc' 'abbc' true (1 'abbc') 'abc' false nil 'abq' false nil 'abbbbc' true (1 'abbbbc')) ('ab?bc' 'abbc' true (1 'abbc') 'abc' true (1 'abc') 'abbbbc' false nil 'abc' true (1 'abc')) ('^abc$' 'abc' true (1 'abc') 'abcc' false nil 'aabc' false nil) ('^abc' 'abcc' true (1 'abc')) ('abc$' 'aabc' true (1 'abc')) ('^' 'abc' true nil) ('$' 'abc' true nil) ('a.c' 'abc' true (1 'abc') 'axc' true (1 'axc')) ('a.*c' 'axyzc' true (1 'axyzc') 'axy zc' true (1 'axy zc') "testing that a dot matches a space" 'axy zc' false nil "testing that a dot does not match a newline" 'axyzd' false nil) ('.a.*' '1234abc' true (1 '4abc') 'abcd' false nil) ('a\w+c' ' abbbbc ' true (1 'abbbbc') 'abb bc' false nil) ('\w+' ' foobar quux' true (1 'foobar') ' ~!!@#$%^&*()-+=\|/?.>,<' false nil) ('a\W+c' 'a c' true (1 'a c') 'a bc' false nil) ('\W+' 'foo!!@#$bar' true (1 '!!@#$') 'foobar' false nil) ('a\s*c' 'a c' true (1 'a c') 'a bc' false nil) ('\s+' 'abc3457 sd' true (1 ' ') '1234$^*^&asdfb' false nil) ('a\S*c' 'aqwertyc' true (1 'aqwertyc') 'ab c' false nil) ('\S+' ' asdf ' true (1 'asdf') ' ' false nil) ('a\d+c' 'a0123456789c' true (1 'a0123456789c') 'a12b34c' false nil) ('\d+' 'foo@#$%123ASD #$$%^&' true (1 '123') 'foo!!@#$asdfl;' false nil) ('a\D+c' 'aqwertyc' true (1 'aqwertyc') 'aqw6ertc' false nil) ('\D+' '1234 abc 456' true (1 ' abc ') '1234567890' false nil) ('(f|o)+\b' 'foo' true (1 'foo') ' foo ' true (1 'foo')) ('\ba\w+' "a word beginning with an A" 'land ancient' true (1 'ancient') 'antique vase' 
true (1 'antique') 'goofy foobar' false nil) ('(f|o)+\B' 'quuxfoobar' true (1 'foo') 'quuxfoo ' true (1 'fo')) ('\Ba\w+' "a word with an A in the middle, match at A and further" 'land ancient' true (1 'and') 'antique vase' true (1 'ase') 'smalltalk shall overcome' true (1 'alltalk') 'foonix is better' false nil) ('fooa\>.*' 'fooa ' true nil 'fooa123' false nil 'fooa bar' true nil 'fooa' true nil 'fooargh' false nil) ('\>.+abc' ' abcde fg' false nil 'foo abcde' true (1 ' abc') 'abcde' false nil) ('\ ::= '(' ')' " self match: $(. atom := self regex. self match: $). ^atom]. lookahead = $[ ifTrue: ["<atom> ::= '[' <characterSet> ']' " self match: $[. atom := self characterSet. self match: $]. ^atom]. lookahead = $: ifTrue: ["<atom> ::= ':' <messagePredicate> ':' " self match: $:. atom := self messagePredicate. self match: $:. ^atom]. lookahead = $. ifTrue: ["any single character; the testSuite notes say a dot matches a space but not a newline" self next. ^RxsContextCondition new beAny]. lookahead = $^ ifTrue: ["beginning of line condition" self next. ^RxsContextCondition new beBeginningOfLine]. lookahead = $$ ifTrue: ["end of line condition" self next. ^RxsContextCondition new beEndOfLine]. lookahead = $\ ifTrue: ["<atom> ::= '\' <character> " self next. lookahead = #epsilon ifTrue: [self signalParseError: 'bad quotation']. (BackslashConstants includesKey: lookahead) ifTrue: [atom := RxsCharacter with: (BackslashConstants at: lookahead). self next. ^atom]. self ifSpecial: lookahead then: [:node | self next. ^node]]. "If passed through the above, the following is a regular character." atom := RxsCharacter with: lookahead. self next. ^atom! ! !RxParser methodsFor: 'recursive descent' stamp: ''! branch "<branch> ::= <piece> | <piece> <branch> -- one piece is always parsed; the recursion stops at end of input, at a vertical bar, or at a closing parenthesis, and the tail branch is then nil." | piece branch | piece := self piece. (lookahead = #epsilon or: [lookahead = $| or: [lookahead = $) ]]) ifTrue: [branch := nil] ifFalse: [branch := self branch]. ^RxsBranch new initializePiece: piece branch: branch! ! !RxParser methodsFor: 'recursive descent' stamp: ''! characterSet "Match a range of characters: something between `[' and `]'. 
The opening bracket has already been consumed, and the closing one must be left in the lookahead for the caller to match. Set spec is as usual for sets in regexes. A spec that comes back empty or as a lone ^ means the first ] was a literal member of the set, so scanning continues up to the next ]." | spec errorMessage | errorMessage := ' no terminating "]"'. spec := self inputUpTo: $] nestedOn: $[ errorMessage: errorMessage. (spec isEmpty or: [spec = '^']) ifTrue: "This ']' was literal." [self next. spec := spec, ']', (self inputUpTo: $] nestedOn: $[ errorMessage: errorMessage)]. ^self characterSetFrom: spec! ! !RxParser methodsFor: 'private' stamp: 'PeterHugossonMiller 9/2/2009 15:53'! characterSetFrom: setSpec "<setSpec> is what goes between the brackets in a charset regex (a String). Answer an RxsCharSet initialized with the elements that RxCharSetParser parses out of the spec; a leading ^ is consumed and marks the set as negated. Spec is never empty." | negated spec | spec := setSpec readStream. spec peek = $^ ifTrue: [negated := true. spec next] ifFalse: [negated := false]. ^RxsCharSet new initializeElements: (RxCharSetParser on: spec) parse negated: negated! ! !RxParser methodsFor: 'private' stamp: ''! ifSpecial: aCharacter then: aBlock "If the character is such that it defines a special node when it follows a $\, then create that node and evaluate aBlock with the node as the parameter. Otherwise just return. BackslashSpecials maps the character to an association of node class -> selector; the node is made by sending that selector to a new instance of the class." | classAndSelector | classAndSelector := BackslashSpecials at: aCharacter ifAbsent: [^self]. ^aBlock value: (classAndSelector key new perform: classAndSelector value)! ! !RxParser methodsFor: 'private' stamp: 'PeterHugossonMiller 9/3/2009 11:08'! inputUpTo: aCharacter errorMessage: aString "Accumulate input stream until <aCharacter> is encountered and answer the accumulated chars as String, not including <aCharacter>, which stays in the lookahead. Signal error if end of stream is encountered, passing <aString> as the error description." | accumulator | accumulator := (String new: 20) writeStream. [lookahead ~= aCharacter and: [lookahead ~= #epsilon]] whileTrue: [accumulator nextPut: lookahead. self next]. lookahead = #epsilon ifTrue: [self signalParseError: aString]. ^accumulator contents! ! !RxParser methodsFor: 'private' stamp: 'PeterHugossonMiller 9/3/2009 11:08'! 
inputUpTo: aCharacter nestedOn: anotherCharacter errorMessage: aString
	"Accumulate input stream until <aCharacter> is encountered at nesting level zero and answer the accumulated chars as String, not including that terminating <aCharacter>, which stays in the lookahead. Every <anotherCharacter> opens a nesting level and every nested <aCharacter> closes one; nested delimiters are kept in the result. Signal error if end of stream is encountered, passing <aString> as the error description."

	| accumulator nestLevel |
	accumulator := (String new: 20) writeStream.
	nestLevel := 0.
	[lookahead ~= aCharacter or: [nestLevel > 0]] whileTrue: [
		#epsilon = lookahead ifTrue: [self signalParseError: aString].
		accumulator nextPut: lookahead.
		lookahead = anotherCharacter ifTrue: [nestLevel := nestLevel + 1].
		lookahead = aCharacter ifTrue: [nestLevel := nestLevel - 1].
		self next].
	^accumulator contents! !

!RxParser methodsFor: 'private' stamp: ''!
match: aCharacter
	"<aCharacter> MUST match the current lookahead. If this is the case, advance the input. Otherwise, blow up with a syntax error."

	aCharacter ~= lookahead ifTrue: [^self signalParseError]. "does not return"
	self next! !

!RxParser methodsFor: 'recursive descent' stamp: ''!
messagePredicate
	"Match a message predicate specification: a selector (presumably understood by a Character) enclosed in :'s. A leading ^ negates the predicate. The closing : is left in the lookahead for the caller. A specification with no selector (nothing at all, or just ^) is reported as a syntax error, so that it is signalled as RegexSyntaxError like every other malformed regex instead of failing on `first' of an empty string or producing an empty selector."

	| spec negated |
	spec := self inputUpTo: $: errorMessage: ' no terminating ":"'.
	(spec isEmpty or: [spec = '^'])
		ifTrue: [^self signalParseError: ' empty message predicate'].
	negated := spec first = $^.
	negated ifTrue: [spec := spec copyFrom: 2 to: spec size].
	^RxsMessagePredicate new initializeSelector: spec asSymbol negated: negated! !

!RxParser methodsFor: 'private' stamp: ''!
next
	"Advance the input storing the just read character as the lookahead. At the end of the input the lookahead becomes #epsilon."

	lookahead := input atEnd
		ifTrue: [#epsilon]
		ifFalse: [input next]! !

!RxParser methodsFor: 'accessing' stamp: 'PeterHugossonMiller 9/2/2009 15:54'!
parse: aString
	"Parse input from a string <aString>. On success, answers an RxsRegex -- parse tree root. On error, signals RegexSyntaxError (see RxParser class>>signalSyntaxException:) with a description string."

	^self parseStream: aString readStream! !

!RxParser methodsFor: 'accessing' stamp: ''! parseStream: aStream "Parse an input from a character stream . 
On success, answers an RxsRegex -- parse tree root. On error, signals RegexSyntaxError (through signalParseError and RxParser class>>signalSyntaxException:) with a description string." | tree | input := aStream. lookahead := nil. "Prime the lookahead: matching nil against the nil lookahead succeeds and advances to the first input character." self match: nil. tree := self regex. "The whole input must have been consumed by now." self match: #epsilon. ^tree! ! !RxParser methodsFor: 'recursive descent' stamp: ''! piece "<piece> ::= <atom> | <atom>* | <atom>+ | <atom>? -- a nullable atom under any of the three closures is rejected with a parse error." | atom errorMessage | errorMessage := ' nullable closure'. atom := self atom. lookahead = $* ifTrue: [self next. atom isNullable ifTrue: [self signalParseError: errorMessage]. ^RxsPiece new initializeStarAtom: atom]. lookahead = $+ ifTrue: [self next. atom isNullable ifTrue: [self signalParseError: errorMessage]. ^RxsPiece new initializePlusAtom: atom]. lookahead = $? ifTrue: [self next. atom isNullable ifTrue: [self signalParseError: errorMessage]. ^RxsPiece new initializeOptionalAtom: atom]. ^RxsPiece new initializeAtom: atom! ! !RxParser methodsFor: 'recursive descent' stamp: ''! regex "<regex> ::= <branch> | <branch> `|' <regex> -- the list of alternatives ends at the end of input or at a closing parenthesis, and the tail regex is then nil." | branch regex | branch := self branch. (lookahead = #epsilon or: [lookahead = $)]) ifTrue: [regex := nil] ifFalse: [self match: $|. regex := self regex]. ^RxsRegex new initializeBranch: branch regex: regex! ! !RxParser methodsFor: 'private' stamp: ''! signalParseError "Signal a syntax error with the generic description." self class signalSyntaxException: 'Regex syntax error'! ! !RxParser methodsFor: 'private' stamp: ''! signalParseError: aString "Signal a syntax error described by aString." self class signalSyntaxException: aString! ! Object subclass: #RxmLink instanceVariableNames: 'next' classVariableNames: '' poolDictionaries: '' category: 'VB-Regex'! !RxmLink commentStamp: '' prior: 0! -- Regular Expression Matcher v 1.1 (C) 1996, 1999 Vassili Bykov -- See `documentation' protocol of RxParser class for user's guide. -- A matcher is built of a number of links interconnected into some intricate structure. Regardless of fancy stuff, any link (except for the terminator) has the next one. Any link can match against a stream of characters, recursively propagating the match to the next link. 
Any link supports a number of matcher-building messages. This superclass does all of the above. The class is not necessarily abstract. It may double as an empty string matcher: it recursively propagates the match to the next link, thus always matching nothing successfully. Principal method: matchAgainst: aMatcher Any subclass will reimplement this to test the state of the matcher, most probably reading one or more characters from the matcher's stream, and either decide it has matched and answer true, leaving matcher stream positioned at the end of match, or answer false and restore the matcher stream position to whatever it was before the matching attempt. Instance variables: next <RxmLink | RxmTerminator> The next link in the structure.! RxmLink subclass: #RxmBranch instanceVariableNames: 'loopback alternative' classVariableNames: '' poolDictionaries: '' category: 'VB-Regex'! !RxmBranch commentStamp: '' prior: 0! -- Regular Expression Matcher v 1.1 (C) 1996, 1999 Vassili Bykov -- See `documentation' protocol of RxParser class for user's guide. -- This is a branch of a matching process. Either `next' chain should match, or `alternative', if not nil, should match. Since this is also used to build loopbacks to match repetitions, `loopback' variable indicates whether the instance is a loopback: it affects the matcher-building operations (which of the paths through the branch is to be considered the primary one when we have to find the "tail" of a matcher construct). Instance variables: alternative <RxmLink, or a terminator> The link to match if `next' fails to match; may be nil. loopback <Boolean> True if the receiver is a loopback.! !RxmBranch methodsFor: 'initialize-release' stamp: ''! alternative: aBranch "Set the link that is tried when the `next' chain fails to match." alternative := aBranch! ! !RxmBranch methodsFor: 'initialize-release' stamp: ''! beLoopback "Mark the receiver as a loopback; pointTailTo: and terminateWith: then extend the `alternative' path instead of the `next' path." loopback := true! ! !RxmBranch methodsFor: 'initialize-release' stamp: 'alain.plantec 5/28/2009 10:18'! initialize "Start out as an ordinary (non-loopback) branch." super initialize. 
loopback := false! ! !RxmBranch methodsFor: 'matching' stamp: ''! matchAgainst: aMatcher "Answer true if the `next' chain matches; otherwise try `alternative'. Answer false when `next' fails and the alternative is nil." ^(next matchAgainst: aMatcher) or: [alternative notNil and: [alternative matchAgainst: aMatcher]]! ! !RxmBranch methodsFor: 'building' stamp: ''! pointTailTo: aNode "A loopback extends its `alternative' path, pointing it at aNode directly while it is still nil; any other branch defers to the superclass and extends the `next' path." loopback ifTrue: [alternative == nil ifTrue: [alternative := aNode] ifFalse: [alternative pointTailTo: aNode]] ifFalse: [super pointTailTo: aNode]! ! !RxmBranch methodsFor: 'building' stamp: ''! terminateWith: aNode "A loopback terminates its `alternative' path, setting it to aNode directly while it is still nil; any other branch defers to the superclass and terminates the `next' path." loopback ifTrue: [alternative == nil ifTrue: [alternative := aNode] ifFalse: [alternative terminateWith: aNode]] ifFalse: [super terminateWith: aNode]! ! !RxmLink class methodsFor: 'instance creation' stamp: ''! new "Answer a new instance after sending it initialize. NOTE(review): in dialects where the inherited new already sends initialize this initializes twice -- confirm for the target image." ^super new initialize! ! !RxmLink methodsFor: 'matching' stamp: ''! matchAgainst: aMatcher "This base implementation tests nothing itself and simply lets the next link in the chain match, answering its result. Subclasses answer false when they do not match the contents of the matcher's stream." ^next matchAgainst: aMatcher! ! !RxmLink methodsFor: 'matching' stamp: ''! next "Answer the next link in the chain." ^next! ! !RxmLink methodsFor: 'initialize-release' stamp: ''! next: aLink "Set the next link, either an RxmLink or an RxmTerminator." next := aLink! ! !RxmLink methodsFor: 'building' stamp: ''! pointTailTo: anRxmLink "Propagate this message along the chain of links. Point `next' reference of the last link to <anRxmLink>. If the chain is already terminated, blow up (presumably raised by the terminator, which is not visible in this class)." next == nil ifTrue: [next := anRxmLink] ifFalse: [next pointTailTo: anRxmLink]! ! !RxmLink methodsFor: 'building' stamp: ''! terminateWith: aTerminator "Propagate this message along the chain of links, and make aTerminator the `next' link of the last link in the chain. If the chain is already terminated with the same terminator, do not blow up." next == nil ifTrue: [next := aTerminator] ifFalse: [next terminateWith: aTerminator]! ! 
RxmLink subclass: #RxmMarker instanceVariableNames: 'index' classVariableNames: '' poolDictionaries: '' category: 'VB-Regex'! !RxmMarker commentStamp: '' prior: 0! -- Regular Expression Matcher v 1.1 (C) 1996, 1999 Vassili Bykov -- See `documentation' protocol of RxParser class for user's guide. -- A marker is used to remember positions of match of certain points of a regular expression. The marker receives an identifying key from the Matcher and uses that key to report positions of successful matches to the Matcher. Instance variables: index