This improved version also allows whitespace between an attribute name and its equals sign, so it passes the new test case 13.
Code: Select all
<?php // test.php Rev:2011-01-14_22:00
// Demonstration script: finds every <a>...</a> element whose HREF attribute
// value is exactly [MYPERSONALTAG] (bare, single-quoted or double-quoted),
// whatever other attributes come before or after it, and prints each
// complete match.
//
// Regular expression goodness:
// The pattern runs with the "x" (free-spacing) modifier, so the "#" comments
// inside the string literal belong to the regex source and are skipped by
// PCRE. Possessive quantifiers and atomic groups are used throughout, so the
// engine does not retry an attribute list with shorter sub-matches.
// Capture group 1 lives inside the negative lookahead (quote seen while
// checking that an attribute is NOT the magic one); capture group 2 is the
// optional quote around the real magic value.
$re = '% # Match <A>..</A> tag having magic attribute=value pair.
<a\b # Opening literal chars for <A> tag.
# Match non-magic attribute/value pairs before magic attribute:
(?> # Zero or more non-"MY_HREF" attributes
\s++ # Attributes separated by whitespace
(?! # Look ahead to make sure attribute not magic.
href\s*+=\s*+ # Magic attribute is named HREF.
(?:([\'"]))?+ # $1: Optional value open quote delimiter.
\[MYPERSONALTAG\] # Magic attribute value is "[MYPERSONALTAG]".
(?(1)\1) # If there was an open quote, match closing quote.
) # If this is not the magic attribute, proceed.
[\w\-.:]++ # Required non-magic attribute name (HTML).
(?> # Optional attribute value
\s*+ = \s*+ # Attribute value requires equals sign delimiter.
(?> # Value can be one of three quoted alternatives:
\'[^\']*+\' # Single quoted value
| "[^"]*+" # or double quoted value
| [\w\-.:]++ # or un-quoted value.
) # End group of attribute value alternatives.
)?+ # End optional attribute value.
)*+ # Zero or more non-magic attribute/values.
# Match the magic attribute/value pair:
\s++ # whitespace before magic attribute name
href\s*+=\s*+ # Magic attribute name.
(?>([\'"]))?+ # $2: Magic value optional open quote delimiter.
\[MYPERSONALTAG\] # Magic attribute value = "MYPERSONALTAG".
(?(2)\2) # If there was an open quote, match closing quote.
# Match attribute/value pairs after magic attribute:
(?> # Zero or more attributes.
\s++ # Attributes separated by whitespace
[\w\-.:]++ # Required non-magic attribute name.
(?: # Optional attribute value
\s*+ = \s*+ # Attribute value requires equals sign delimiter.
(?: # Value can be one of three quoted alternatives:
\'[^\']*+\' # Single quoted value
| "[^"]*+" # or double quoted value
| [\w\-.:]++ # or un-quoted value.
) # End group of attribute value alternatives.
)?+ # End optional attribute value.
)*+ # Zero or more non-magic attribute/values.
\s*> # Whitespace and closing delimiter of <A> open tag.
# Match <A> tag contents.
(?> # Zero or more of one of the following two alternatives:
[^<]++(?:(?!</a\s*+>)<[^<]*+)*+ # Either start on a non-"<", match one or more non-"<".
| (?:(?!</a\s*+>)<[^<]*+)++ # or start on a "<" (non-"</a>"), and zero or more non-"<".
)*+ #
</a\s*+> # Closing literal chars.
%ix'; // This regex uses: i="ignorecase" and x="free-spacing" modes.
// Test data: 13 links that must match, followed by 12 look-alikes whose HREF
// value is [NOTMYPERSONALTAG] and which must therefore be skipped.
$data = '
VALID LINKS TO BE MATCHED:
<a href=[MYPERSONALTAG]>test 001</a>
<a href="[MYPERSONALTAG]">test 002</a>
<a href=\'[MYPERSONALTAG]\'>test 003</a>
<a class=before href=[MYPERSONALTAG]>test 004</a>
<a class="before" href=[MYPERSONALTAG]>test 005</a>
<a class=\'before\' href=[MYPERSONALTAG]>test 006</a>
<a href=[MYPERSONALTAG] class=after>test 007</a>
<a href=[MYPERSONALTAG] class="after">test 008</a>
<a href=[MYPERSONALTAG] class=\'after\'>test 009</a>
<a class=before href=[MYPERSONALTAG] class=after>test 010</a>
<a class="before" href=[MYPERSONALTAG] class="after">test 011</a>
<a class=\'before\' href=[MYPERSONALTAG] class=\'after\'>test 012</a>
<a class ="before" href=[MYPERSONALTAG] class=\'after\'>test 013</a>
INVALID LINKS NOT TO BE MATCHED:
<a href=[NOTMYPERSONALTAG]>test 001</a>
<a href="[NOTMYPERSONALTAG]">test 002</a>
<a href=\'[NOTMYPERSONALTAG]\'>test 003</a>
<a class=before href=[NOTMYPERSONALTAG]>test 004</a>
<a class="before" href=[NOTMYPERSONALTAG]>test 005</a>
<a class=\'before\' href=[NOTMYPERSONALTAG]>test 006</a>
<a href=[NOTMYPERSONALTAG] class=after>test 007</a>
<a href=[NOTMYPERSONALTAG] class="after">test 008</a>
<a href=[NOTMYPERSONALTAG] class=\'after\'>test 009</a>
<a class=before href=[NOTMYPERSONALTAG] class=after>test 010</a>
<a class="before" href=[NOTMYPERSONALTAG] class="after">test 011</a>
<a class=\'before\' href=[NOTMYPERSONALTAG] class=\'after\'>test 012</a>
';
// Process the $data through the regex $re. Matches are placed in $matches.
// preg_match_all() yields the number of full matches, or false when the
// pattern could not be compiled or executed.
$count = preg_match_all($re, $data, $matches);
if ($count === false) {
    // Report the failure explicitly rather than disguising it as "no matches".
    printf("preg_match_all() failed with PCRE error code %d.\n", preg_last_error());
} elseif ($count > 0) {
    printf("There were %d matches:\n", $count);
    // $matches[0] is array of all complete matches (default pattern order).
    foreach ($matches[0] as $i => $match) {
        printf("Match %d of %d:\t\"%s\".\n", $i + 1, $count, $match);
    }
} else {
    echo("There were no matches.\n");
}
?>
And here is the new output, which includes the extra test case:
[text]There were 13 matches:
Match 1 of 13: "<a href=[MYPERSONALTAG]>test 001</a>".
Match 2 of 13: "<a href="[MYPERSONALTAG]">test 002</a>".
Match 3 of 13: "<a href='[MYPERSONALTAG]'>test 003</a>".
Match 4 of 13: "<a class=before href=[MYPERSONALTAG]>test 004</a>".
Match 5 of 13: "<a class="before" href=[MYPERSONALTAG]>test 005</a>".
Match 6 of 13: "<a class='before' href=[MYPERSONALTAG]>test 006</a>".
Match 7 of 13: "<a href=[MYPERSONALTAG] class=after>test 007</a>".
Match 8 of 13: "<a href=[MYPERSONALTAG] class="after">test 008</a>".
Match 9 of 13: "<a href=[MYPERSONALTAG] class='after'>test 009</a>".
Match 10 of 13: "<a class=before href=[MYPERSONALTAG] class=after>test 010</a>".
Match 11 of 13: "<a class="before" href=[MYPERSONALTAG] class="after">test 011</a>".
Match 12 of 13: "<a class='before' href=[MYPERSONALTAG] class='after'>test 012</a>".
Match 13 of 13: "<a class ="before" href=[MYPERSONALTAG] class='after'>test 013</a>".[/text]
