From 2ef97f83f10f5610f56cfb3aa254347f69e9d759 Mon Sep 17 00:00:00 2001 From: Starmind <31166558+Starmys@users.noreply.github.com> Date: Sat, 17 Jun 2023 02:33:53 +0800 Subject: [PATCH] Upgrade Wikimedia CSS parser (#2126) Fixes #2119 --- .../Wikimedia/CSS/Grammar/Alternative.php | 3 +- .../Wikimedia/CSS/Grammar/AnythingMatcher.php | 15 +- .../Wikimedia/CSS/Grammar/BlockMatcher.php | 7 +- .../Wikimedia/CSS/Grammar/CheckedMatcher.php | 5 +- .../Wikimedia/CSS/Grammar/DelimMatcher.php | 5 +- .../Wikimedia/CSS/Grammar/FunctionMatcher.php | 12 +- .../Grammar/{Match.php => GrammarMatch.php} | 35 +- .../Wikimedia/CSS/Grammar/Juxtaposition.php | 44 +- .../Wikimedia/CSS/Grammar/KeywordMatcher.php | 5 +- .../Wikimedia/CSS/Grammar/Matcher.php | 103 ++- .../Wikimedia/CSS/Grammar/MatcherFactory.php | 470 ++++++++---- .../Wikimedia/CSS/Grammar/NoWhitespace.php | 3 +- .../Wikimedia/CSS/Grammar/NonEmpty.php | 5 +- .../Wikimedia/CSS/Grammar/NothingMatcher.php | 5 +- .../Wikimedia/CSS/Grammar/Quantifier.php | 41 +- .../Wikimedia/CSS/Grammar/TokenMatcher.php | 5 +- .../Wikimedia/CSS/Grammar/UnorderedGroup.php | 40 +- .../Wikimedia/CSS/Grammar/UrangeMatcher.php | 98 +++ .../Wikimedia/CSS/Grammar/UrlMatcher.php | 14 +- .../CSS/Grammar/WhitespaceMatcher.php | 5 +- .../Wikimedia/CSS/Objects/AtRule.php | 14 +- .../Wikimedia/CSS/Objects/CSSFunction.php | 11 +- .../Wikimedia/CSS/Objects/CSSObjectList.php | 84 ++- .../Wikimedia/CSS/Objects/ComponentValue.php | 8 +- .../CSS/Objects/ComponentValueList.php | 11 +- .../Wikimedia/CSS/Objects/Declaration.php | 19 +- .../Wikimedia/CSS/Objects/DeclarationList.php | 8 +- .../CSS/Objects/DeclarationOrAtRuleList.php | 4 + .../Wikimedia/CSS/Objects/QualifiedRule.php | 15 +- .../Wikimedia/CSS/Objects/Rule.php | 11 +- .../Wikimedia/CSS/Objects/RuleList.php | 4 + .../Wikimedia/CSS/Objects/SimpleBlock.php | 17 +- .../Wikimedia/CSS/Objects/Stylesheet.php | 8 +- .../Wikimedia/CSS/Objects/Token.php | 312 ++++---- .../Wikimedia/CSS/Objects/TokenList.php | 16 +- 
.../Wikimedia/CSS/Parser/DataSource.php | 2 +- .../CSS/Parser/DataSourceTokenizer.php | 287 +++----- .../Wikimedia/CSS/Parser/Encoder.php | 59 +- .../Wikimedia/CSS/Parser/Parser.php | 272 ++++--- .../Wikimedia/CSS/Parser/StringDataSource.php | 32 +- .../CSS/Parser/TokenListTokenizer.php | 8 +- .../CSS/Sanitizer/FontFaceAtRuleSanitizer.php | 41 +- .../FontFeatureValueAtRuleSanitizer.php | 83 --- .../FontFeatureValuesAtRuleSanitizer.php | 84 --- .../CSS/Sanitizer/ImportAtRuleSanitizer.php | 27 +- .../Sanitizer/KeyframesAtRuleSanitizer.php | 17 +- .../CSS/Sanitizer/MarginAtRuleSanitizer.php | 8 +- .../CSS/Sanitizer/MediaAtRuleSanitizer.php | 9 +- .../Sanitizer/NamespaceAtRuleSanitizer.php | 11 +- .../CSS/Sanitizer/PageAtRuleSanitizer.php | 43 +- .../CSS/Sanitizer/PropertySanitizer.php | 14 +- .../Wikimedia/CSS/Sanitizer/RuleSanitizer.php | 11 +- .../Wikimedia/CSS/Sanitizer/Sanitizer.php | 21 +- .../CSS/Sanitizer/StyleAttributeSanitizer.php | 8 +- .../CSS/Sanitizer/StylePropertySanitizer.php | 667 +++++++++--------- .../CSS/Sanitizer/StyleRuleSanitizer.php | 92 ++- .../CSS/Sanitizer/StylesheetSanitizer.php | 13 +- .../CSS/Sanitizer/SupportsAtRuleSanitizer.php | 95 +-- lib/css-sanitizer/Wikimedia/CSS/Util.php | 44 +- 59 files changed, 1798 insertions(+), 1612 deletions(-) rename lib/css-sanitizer/Wikimedia/CSS/Grammar/{Match.php => GrammarMatch.php} (78%) create mode 100644 lib/css-sanitizer/Wikimedia/CSS/Grammar/UrangeMatcher.php delete mode 100644 lib/css-sanitizer/Wikimedia/CSS/Sanitizer/FontFeatureValueAtRuleSanitizer.php delete mode 100644 lib/css-sanitizer/Wikimedia/CSS/Sanitizer/FontFeatureValuesAtRuleSanitizer.php diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/Alternative.php b/lib/css-sanitizer/Wikimedia/CSS/Grammar/Alternative.php index bb683d3cd..2d103b2b7 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Grammar/Alternative.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/Alternative.php @@ -11,7 +11,7 @@ use Wikimedia\CSS\Util; /** * Matcher that matches 
one out of a set of Matchers ("|" combiner). - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#comb-one + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#comb-one */ class Alternative extends Matcher { /** @var Matcher[] */ @@ -25,6 +25,7 @@ class Alternative extends Matcher { $this->matchers = $matchers; } + /** @inheritDoc */ protected function generateMatches( ComponentValueList $values, $start, array $options ) { $used = []; foreach ( $this->matchers as $matcher ) { diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/AnythingMatcher.php b/lib/css-sanitizer/Wikimedia/CSS/Grammar/AnythingMatcher.php index 1e074a121..52c7b6959 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Grammar/AnythingMatcher.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/AnythingMatcher.php @@ -6,6 +6,8 @@ namespace Wikimedia\CSS\Grammar; +use InvalidArgumentException; +use UnexpectedValueException; use Wikimedia\CSS\Objects\ComponentValueList; use Wikimedia\CSS\Objects\CSSFunction; use Wikimedia\CSS\Objects\SimpleBlock; @@ -15,7 +17,7 @@ use Wikimedia\CSS\Objects\Token; * Matcher that matches anything except bad strings, bad urls, and unmatched * left-paren, left-brace, or left-bracket. * @warning Be very careful using this! - * @see https://drafts.csswg.org/css-syntax/#any-value for where this roughly comes from. + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#any-value */ class AnythingMatcher extends Matcher { @@ -42,9 +44,9 @@ class AnythingMatcher extends Matcher { */ public function __construct( array $options = [] ) { $this->toplevel = !empty( $options['toplevel'] ); - $this->quantifier = isset( $options['quantifier'] ) ? $options['quantifier'] : ''; + $this->quantifier = $options['quantifier'] ?? 
''; if ( !in_array( $this->quantifier, [ '', '+', '*' ], true ) ) { - throw new \InvalidArgumentException( 'Invalid quantifier' ); + throw new InvalidArgumentException( 'Invalid quantifier' ); } $recurse = !$this->toplevel && $this->quantifier === '*' @@ -55,12 +57,13 @@ class AnythingMatcher extends Matcher { } } + /** @inheritDoc */ protected function generateMatches( ComponentValueList $values, $start, array $options ) { $origStart = $start; $lastMatch = $this->quantifier === '*' ? $this->makeMatch( $values, $start, $start ) : null; do { $newMatch = null; - $cv = isset( $values[$start] ) ? $values[$start] : null; + $cv = $values[$start] ?? null; if ( $cv instanceof Token ) { switch ( $cv->type() ) { case Token::T_BAD_STRING: @@ -92,7 +95,7 @@ class AnythingMatcher extends Matcher { // If we encounter whitespace, assume it's significant. $newMatch = $this->makeMatch( $values, $origStart, $this->next( $values, $start, $options ), - new Match( $values, $start, 1, 'significantWhitespace' ), + new GrammarMatch( $values, $start, 1, 'significantWhitespace' ), [ [ $lastMatch ] ] ); break; @@ -103,7 +106,7 @@ class AnythingMatcher extends Matcher { case Token::T_LEFT_BRACKET: // Should never happen // @codeCoverageIgnoreStart - throw new \UnexpectedValueException( "How did a \"{$cv->type()}\" token get here?" ); + throw new UnexpectedValueException( "How did a \"{$cv->type()}\" token get here?" 
); // @codeCoverageIgnoreEnd default: diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/BlockMatcher.php b/lib/css-sanitizer/Wikimedia/CSS/Grammar/BlockMatcher.php index f217057bb..5ca25ac7f 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Grammar/BlockMatcher.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/BlockMatcher.php @@ -6,9 +6,9 @@ namespace Wikimedia\CSS\Grammar; +use InvalidArgumentException; use Wikimedia\CSS\Objects\ComponentValueList; use Wikimedia\CSS\Objects\SimpleBlock; -use Wikimedia\CSS\Objects\Token; /** * Matcher that matches a SimpleBlock @@ -34,7 +34,7 @@ class BlockMatcher extends Matcher { */ public function __construct( $blockType, Matcher $matcher ) { if ( SimpleBlock::matchingDelimiter( $blockType ) === null ) { - throw new \InvalidArgumentException( + throw new InvalidArgumentException( 'A block is delimited by either {}, [], or ().' ); } @@ -42,8 +42,9 @@ class BlockMatcher extends Matcher { $this->matcher = $matcher; } + /** @inheritDoc */ protected function generateMatches( ComponentValueList $values, $start, array $options ) { - $cv = isset( $values[$start] ) ? $values[$start] : null; + $cv = $values[$start] ?? null; if ( $cv instanceof SimpleBlock && $cv->getStartTokenType() === $this->blockType ) { // To successfully match, our sub-Matcher needs to match the whole // content of the block. 
diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/CheckedMatcher.php b/lib/css-sanitizer/Wikimedia/CSS/Grammar/CheckedMatcher.php index 2ab084c66..dc0e90920 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Grammar/CheckedMatcher.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/CheckedMatcher.php @@ -13,7 +13,7 @@ use Wikimedia\CSS\Objects\ComponentValueList; */ class CheckedMatcher extends Matcher { /** @var Matcher */ - private $matcher = null; + private $matcher; /** @var callable */ protected $check; @@ -21,13 +21,14 @@ class CheckedMatcher extends Matcher { /** * @param Matcher $matcher Base matcher * @param callable $check Function to check the match is really valid. - * Prototype is bool func( ComponentValueList $values, Match $match, array $options ) + * Prototype is bool func( ComponentValueList $values, GrammarMatch $match, array $options ) */ public function __construct( Matcher $matcher, callable $check ) { $this->matcher = $matcher; $this->check = $check; } + /** @inheritDoc */ protected function generateMatches( ComponentValueList $values, $start, array $options ) { foreach ( $this->matcher->generateMatches( $values, $start, $options ) as $match ) { if ( call_user_func( $this->check, $values, $match, $options ) ) { diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/DelimMatcher.php b/lib/css-sanitizer/Wikimedia/CSS/Grammar/DelimMatcher.php index 11061c86d..b6dbcf405 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Grammar/DelimMatcher.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/DelimMatcher.php @@ -16,7 +16,7 @@ use Wikimedia\CSS\Objects\Token; * other types (case-sensitively) too. For the more common case-insensitive * identifier matching, use KeywordMatcher. 
* - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#component-types + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#component-types */ class DelimMatcher extends Matcher { /** @var string One of the Token::T_* constants */ @@ -39,8 +39,9 @@ class DelimMatcher extends Matcher { $this->type = $options['type']; } + /** @inheritDoc */ protected function generateMatches( ComponentValueList $values, $start, array $options ) { - $cv = isset( $values[$start] ) ? $values[$start] : null; + $cv = $values[$start] ?? null; if ( $cv instanceof Token && $cv->type() === $this->type && in_array( $cv->value(), $this->values, true ) ) { diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/FunctionMatcher.php b/lib/css-sanitizer/Wikimedia/CSS/Grammar/FunctionMatcher.php index 7269ce337..5e8d3f380 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Grammar/FunctionMatcher.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/FunctionMatcher.php @@ -6,9 +6,10 @@ namespace Wikimedia\CSS\Grammar; +use Closure; +use InvalidArgumentException; use Wikimedia\CSS\Objects\ComponentValueList; use Wikimedia\CSS\Objects\CSSFunction; -use Wikimedia\CSS\Objects\Token; /** * Matcher that matches a CSSFunction @@ -29,25 +30,26 @@ class FunctionMatcher extends Matcher { protected $matcher; /** - * @param string|callable|null $name Function name, case-insensitive, or a + * @param string|Closure|null $name Function name, case-insensitive, or a * function to check the name. 
* @param Matcher $matcher Matcher for the contents of the function */ public function __construct( $name, Matcher $matcher ) { if ( is_string( $name ) ) { - $this->nameCheck = function ( $s ) use ( $name ) { + $this->nameCheck = static function ( $s ) use ( $name ) { return !strcasecmp( $s, $name ); }; } elseif ( is_callable( $name ) || $name === null ) { $this->nameCheck = $name; } else { - throw new \InvalidArgumentException( '$name must be a string, callable, or null' ); + throw new InvalidArgumentException( '$name must be a string, callable, or null' ); } $this->matcher = $matcher; } + /** @inheritDoc */ protected function generateMatches( ComponentValueList $values, $start, array $options ) { - $cv = isset( $values[$start] ) ? $values[$start] : null; + $cv = $values[$start] ?? null; if ( $cv instanceof CSSFunction && ( !$this->nameCheck || call_user_func( $this->nameCheck, $cv->getName() ) ) ) { diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/Match.php b/lib/css-sanitizer/Wikimedia/CSS/Grammar/GrammarMatch.php similarity index 78% rename from lib/css-sanitizer/Wikimedia/CSS/Grammar/Match.php rename to lib/css-sanitizer/Wikimedia/CSS/Grammar/GrammarMatch.php index 93e64a99b..bfdfd0bf2 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Grammar/Match.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/GrammarMatch.php @@ -8,18 +8,19 @@ namespace Wikimedia\CSS\Grammar; use Wikimedia\CSS\Objects\ComponentValue; use Wikimedia\CSS\Objects\ComponentValueList; -use Wikimedia\CSS\Objects\CSSFunction; -use Wikimedia\CSS\Objects\SimpleBlock; use Wikimedia\CSS\Objects\Token; use Wikimedia\CSS\Util; /** * Represent a match from a Matcher. 
*/ -class Match { +class GrammarMatch { /** @var int */ - protected $start, $length; + protected $start; + + /** @var int */ + protected $length; /** @var ComponentValue[] Matched ComponentValues */ protected $values; @@ -27,7 +28,7 @@ class Match { /** @var string|null */ protected $name = null; - /** @var Match[] Captured submatches */ + /** @var GrammarMatch[] Captured submatches */ protected $capturedMatches = []; /** @@ -35,12 +36,12 @@ class Match { * @param int $start Starting index of the match. * @param int $length Number of tokens in the match. * @param string|null $name Give a name to this match. - * @param Match[] $capturedMatches Captured submatches of this match. + * @param GrammarMatch[] $capturedMatches Captured submatches of this match. */ public function __construct( ComponentValueList $list, $start, $length, $name = null, array $capturedMatches = [] ) { - Util::assertAllInstanceOf( $capturedMatches, Match::class, '$capturedMatches' ); + Util::assertAllInstanceOf( $capturedMatches, self::class, '$capturedMatches' ); $this->values = $list->slice( $start, $length ); $this->start = $start; @@ -95,21 +96,21 @@ class Match { * This returns the matches from capturing submatchers (see * Matcher::capture()) that matched during the matching of the top-level * matcher that returned this match. If capturing submatchers were nested, - * the Match objects returned here will themselves have captured submatches to - * return. + * the GrammarMatch objects returned here will themselves have captured sub- + * matches to return. * * To borrow PCRE regular expression syntax, if the "pattern" described by * the Matchers resembled `www(?<A>xxx(?<B>yyy)xxx)(?<C>zzz)*` then the - * top-level Match's getCapturedMatches() would return a Match named "A" - * (containing the "xxxyyyxxx" bit) and zero or more matches named "C" (for - * each "zzz"), and that "A" Match's getCapturedMatches() would return a Match - * named "B" (containing just the "yyy").
+ * top-level GrammarMatch's getCapturedMatches() would return a GrammarMatch + * named "A" (containing the "xxxyyyxxx" bit) and zero or more matches named + * "C" (for each "zzz"), and that "A" GrammarMatch's getCapturedMatches() + * would return a GrammarMatch named "B" (containing just the "yyy"). * * Note that the start and end positions reported by captured matches may be * relative to a containing SimpleBlock or CSSFunction's value rather than * to the ComponentValueList passed to the top-level Matcher. * - * @return Match[] + * @return GrammarMatch[] */ public function getCapturedMatches() { return $this->capturedMatches; @@ -124,7 +125,7 @@ class Match { foreach ( $this->capturedMatches as $m ) { $data[] = $m->getUniqueId(); } - return md5( join( "\n", $data ) ); + return md5( implode( "\n", $data ) ); } /** @@ -143,4 +144,8 @@ class Match { $m->fixWhitespace( $old, $new ); } } + + public function __toString() { + return Util::stringify( $this->getValues() ); + } } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/Juxtaposition.php b/lib/css-sanitizer/Wikimedia/CSS/Grammar/Juxtaposition.php index 2f851985b..d09440128 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Grammar/Juxtaposition.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/Juxtaposition.php @@ -6,14 +6,15 @@ namespace Wikimedia\CSS\Grammar; +use Iterator; use Wikimedia\CSS\Objects\ComponentValueList; use Wikimedia\CSS\Objects\Token; use Wikimedia\CSS\Util; /** * Matcher that groups other matchers (juxtaposition) - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#component-combinators - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#comb-comma + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#component-combinators + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#comb-comma */ class Juxtaposition extends Matcher { /** @var Matcher[] */ @@ -32,25 +33,25 @@ class Juxtaposition extends Matcher { $this->commas = (bool)$commas; } + /** @inheritDoc */ 
protected function generateMatches( ComponentValueList $values, $start, array $options ) { $used = []; // Match each of our matchers in turn, pushing each one onto a stack as - // we process it and popping a match once its exhausted. + // we process it and popping a match once it's exhausted. $stack = [ [ - new Match( $values, $start, 0 ), + new GrammarMatch( $values, $start, 0 ), $start, $this->matchers[0]->generateMatches( $values, $start, $options ), false ] ]; do { - /** @var $lastMatch Match */ /** @var $lastEnd int */ - /** @var $iter \Iterator */ + /** @var $iter Iterator */ /** @var $needEmpty bool */ - list( $lastMatch, $lastEnd, $iter, $needEmpty ) = $stack[count( $stack ) - 1]; + [ , $lastEnd, $iter, $needEmpty ] = $stack[count( $stack ) - 1]; // If the top of the stack has no more matches, pop it and loop. if ( !$iter->valid() ) { @@ -72,30 +73,29 @@ class Juxtaposition extends Matcher { $thisEnd = $nextFrom = $match->getNext(); // Dealing with commas is a bit tricky. There are three cases: - // 1. If the current match is empty, don't look for a following - // comma now and reset $thisEnd to $lastEnd. - // 2. If there is a comma following, update $nextFrom to be after - // the comma. - // 3. If there's no comma following, every subsequent Matcher must - // be empty in order for the group as a whole to match, so set - // the flag. + // 1. If the current match is empty, don't look for a following + // comma now and reset $thisEnd to $lastEnd. + // 2. If there is a comma following, update $nextFrom to be after + // the comma. + // 3. If there's no comma following, every subsequent Matcher must + // be empty in order for the group as a whole to match, so set + // the flag. // Unlike '#', this doesn't specify skipping whitespace around the // commas if the production isn't already skipping whitespace. 
if ( $this->commas ) { if ( $match->getLength() === 0 ) { $thisEnd = $lastEnd; + } elseif ( isset( $values[$nextFrom] ) && $values[$nextFrom] instanceof Token && + // @phan-suppress-next-line PhanNonClassMethodCall False positive + $values[$nextFrom]->type() === Token::T_COMMA + ) { + $nextFrom = $this->next( $values, $nextFrom, $options ); } else { - if ( isset( $values[$nextFrom] ) && $values[$nextFrom] instanceof Token && - $values[$nextFrom]->type() === Token::T_COMMA - ) { - $nextFrom = $this->next( $values, $nextFrom, $options ); - } else { - $needEmpty = true; - } + $needEmpty = true; } } - // If we ran out of Matchers, yield the final position. Otherwise + // If we ran out of Matchers, yield the final position. Otherwise, // push the next matcher onto the stack. if ( count( $stack ) >= count( $this->matchers ) ) { $newMatch = $this->makeMatch( $values, $start, $thisEnd, $match, $stack ); diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/KeywordMatcher.php b/lib/css-sanitizer/Wikimedia/CSS/Grammar/KeywordMatcher.php index 5854765f4..56989cbe5 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Grammar/KeywordMatcher.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/KeywordMatcher.php @@ -16,7 +16,7 @@ use Wikimedia\CSS\Objects\Token; * other types (case-insensitively) too. For delimiter (or case-sensitive) * matching, use DelimMatcher. * - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#component-types + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#component-types */ class KeywordMatcher extends Matcher { /** @var string One of the Token::T_* constants */ @@ -39,8 +39,9 @@ class KeywordMatcher extends Matcher { $this->type = $options['type']; } + /** @inheritDoc */ protected function generateMatches( ComponentValueList $values, $start, array $options ) { - $cv = isset( $values[$start] ) ? $values[$start] : null; + $cv = $values[$start] ?? 
null; if ( $cv instanceof Token && $cv->type() === $this->type && isset( $this->values[strtolower( $cv->value() )] ) ) { diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/Matcher.php b/lib/css-sanitizer/Wikimedia/CSS/Grammar/Matcher.php index c314aa8fd..f120f04b8 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Grammar/Matcher.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/Matcher.php @@ -6,10 +6,11 @@ namespace Wikimedia\CSS\Grammar; +use Iterator; use Wikimedia\CSS\Objects\ComponentValueList; -use Wikimedia\CSS\Objects\Token; -use Wikimedia\CSS\Objects\SimpleBlock; use Wikimedia\CSS\Objects\CSSFunction; +use Wikimedia\CSS\Objects\SimpleBlock; +use Wikimedia\CSS\Objects\Token; /** * Base class for grammar matchers. @@ -20,16 +21,16 @@ use Wikimedia\CSS\Objects\CSSFunction; * object that will determine whether a ComponentValueList actually matches * this grammar. * - * [SYN3]: https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/ - * [VAL3]: https://www.w3.org/TR/2016/CR-css-values-3-20160929/ + * [SYN3]: https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/ + * [VAL3]: https://www.w3.org/TR/2019/CR-css-values-3-20190606/ */ abstract class Matcher { - /** @var string|null Name to set on Match objects */ + /** @var string|null Name to set on GrammarMatch objects */ protected $captureName = null; /** - * @var array Default options for self::match() + * @var array Default options for self::matchAgainst() * - skip-whitespace: (bool) Allow whitespace in between any two tokens * - nonterminal: (bool) Don't require the whole of $values is matched * - mark-significance: (bool) On a successful match, replace T_WHITESPACE @@ -43,49 +44,29 @@ abstract class Matcher { /** * Create an instance. - * @param mixed ... 
See static::__construct() + * @param mixed ...$args See static::__construct() * @return static */ - public static function create() { - // @todo Once we drop support for PHP 5.5, just do this: - // public static function create( ...$args ) { - // return new static( ...$args ); - // } - - $args = func_get_args(); - switch ( count( $args ) ) { - case 0: - return new static(); - case 1: - return new static( $args[0] ); - case 2: - return new static( $args[0], $args[1] ); - case 3: - return new static( $args[0], $args[1], $args[2] ); - case 4: - return new static( $args[0], $args[1], $args[2], $args[3] ); - default: - // Slow, but all the existing Matchers have a max of 4 args. - $rc = new \ReflectionClass( static::class ); - return $rc->newInstanceArgs( $args ); - } + public static function create( ...$args ) { + // @phan-suppress-next-line PhanParamTooManyUnpack,PhanTypeInstantiateAbstractStatic + return new static( ...$args ); } /** * Return a copy of this matcher that will capture its matches * - * A "capturing" Matcher will produce Matches that return a value from the - * Match::getName() method. The Match::getCapturedMatches() method may be - * used to retrieve them from the top-level Match. + * A "capturing" Matcher will produce GrammarMatches that return a value from + * the GrammarMatch::getName() method. The GrammarMatch::getCapturedMatches() + * method may be used to retrieve them from the top-level GrammarMatch. * * The concept is similar to capturing groups in PCRE and other regex * languages. 
* - * @param string|null $captureName Name to apply to captured Match objects + * @param string|null $captureName Name to apply to captured GrammarMatch objects * @return static */ public function capture( $captureName ) { - $ret = clone( $this ); + $ret = clone $this; $ret->captureName = $captureName; return $ret; } @@ -94,14 +75,14 @@ abstract class Matcher { * Match against a list of ComponentValues * @param ComponentValueList $values * @param array $options Matching options, see self::$defaultOptions - * @return Match|null + * @return GrammarMatch|null */ - public function match( ComponentValueList $values, array $options = [] ) { + public function matchAgainst( ComponentValueList $values, array $options = [] ) { $options += $this->getDefaultOptions(); $start = $this->next( $values, -1, $options ); $l = count( $values ); foreach ( $this->generateMatches( $values, $start, $options ) as $match ) { - if ( $match->getNext() === $l || $options['nonterminal'] ) { + if ( $options['nonterminal'] || $match->getNext() === $l ) { if ( $options['mark-significance'] ) { $significantWS = self::collectSignificantWhitespace( $match ); self::markSignificantWhitespace( $values, $match, $significantWS, $match->getNext() ); @@ -114,11 +95,11 @@ abstract class Matcher { /** * Collect any 'significantWhitespace' matches - * @param Match $match - * @param Token[]|null &$ret + * @param GrammarMatch $match + * @param Token[] &$ret * @return Token[] */ - private static function collectSignificantWhitespace( Match $match, &$ret = [] ) { + private static function collectSignificantWhitespace( GrammarMatch $match, &$ret = [] ) { if ( $match->getName() === 'significantWhitespace' ) { $ret = array_merge( $ret, $match->getValues() ); } @@ -131,7 +112,7 @@ abstract class Matcher { /** * Mark whitespace as significant or not * @param ComponentValueList $list - * @param Match $match + * @param GrammarMatch $match * @param Token[] $significantWS * @param int $end */ @@ -141,8 +122,9 @@ abstract 
class Matcher { if ( $cv instanceof Token && $cv->type() === Token::T_WHITESPACE ) { $significant = in_array( $cv, $significantWS, true ); if ( $significant !== $cv->significant() ) { - $list[$i] = $cv->copyWithSignificance( $significant ); - $match->fixWhitespace( $cv, $list[$i] ); + $newCv = $cv->copyWithSignificance( $significant ); + $match->fixWhitespace( $cv, $newCv ); + $list[$i] = $newCv; } } elseif ( $cv instanceof CSSFunction || $cv instanceof SimpleBlock ) { self::markSignificantWhitespace( @@ -186,27 +168,28 @@ abstract class Matcher { do { $i++; } while ( $skipWS && $i < $l && + // @phan-suppress-next-line PhanNonClassMethodCall False positive $values[$i] instanceof Token && $values[$i]->type() === Token::T_WHITESPACE ); return $i; } /** - * Create a Match + * Create a GrammarMatch * @param ComponentValueList $list * @param int $start * @param int $end First position after the match - * @param Match|null $submatch Submatch, for capturing. If $submatch itself - * named it will be kept as a capture in the returned Match, otherwise its - * captured matches (if any) as returned by getCapturedMatches() will be - * kept as captures in the returned Match. + * @param GrammarMatch|null $submatch Sub-match, for capturing. If $submatch + * itself named it will be kept as a capture in the returned GrammarMatch, + * otherwise its captured matches (if any) as returned by getCapturedMatches() + * will be kept as captures in the returned GrammarMatch. * @param array $stack Stack from which to fetch more submatches for * capturing (see $submatch). The stack is expected to be an array of - * arrays, with the first element of each subarray being a Match. - * @return Match + * arrays, with the first element of each subarray being a GrammarMatch. 
+ * @return GrammarMatch */ protected function makeMatch( - ComponentValueList $list, $start, $end, Match $submatch = null, array $stack = [] + ComponentValueList $list, $start, $end, GrammarMatch $submatch = null, array $stack = [] ) { $matches = array_column( $stack, 0 ); $matches[] = $submatch; @@ -214,7 +197,7 @@ abstract class Matcher { $keptMatches = []; while ( $matches ) { $m = array_shift( $matches ); - if ( !$m instanceof Match ) { + if ( !$m instanceof GrammarMatch ) { // skip it, probably null } elseif ( $m->getName() !== null ) { $keptMatches[] = $m; @@ -223,7 +206,7 @@ abstract class Matcher { } } - return new Match( $list, $start, $end - $start, $this->captureName, $keptMatches ); + return new GrammarMatch( $list, $start, $end - $start, $this->captureName, $keptMatches ); } /** @@ -231,18 +214,18 @@ abstract class Matcher { * * The job of a Matcher is to determine all the ways its particular grammar * fragment can consume ComponentValues starting at a particular location - * in the ComponentValueList, represented by returning Match objects. For - * example, a matcher implementing `IDENT*` at a starting position where + * in the ComponentValueList, represented by returning GrammarMatch objects. + * For example, a matcher implementing `IDENT*` at a starting position where * there are three IDENT tokens in a row would be able to match 0, 1, 2, or * all 3 of those IDENT tokens, and therefore should return an iterator - * over that set of Match objects. + * over that set of GrammarMatch objects. * * Some matchers take other matchers as input, for example `IDENT*` is * probably going to be implemented as a matcher for `*` that repeatedly * applies a matcher for `IDENT`. The `*` matcher would call the `IDENT` * matcher's generateMatches() method directly. 
* - * Most Matchers implement this method as a generator so as to not build up + * Most Matchers implement this method as a generator to not build up * the full set of results when it's reasonably likely the caller is going * to terminate early. * @@ -250,8 +233,8 @@ abstract class Matcher { * @param int $start Starting position in $values * @param array $options See self::$defaultOptions. * Always use the options passed in, don't use $this->defaultOptions yourself. - * @return \Iterator Iterates over the set of Match objects - * defining all the ways this matcher can match. + * @return Iterator Iterates over the set of GrammarMatch + * objects defining all the ways this matcher can match. */ abstract protected function generateMatches( ComponentValueList $values, $start, array $options ); } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/MatcherFactory.php b/lib/css-sanitizer/Wikimedia/CSS/Grammar/MatcherFactory.php index d4a1e5851..335d6a670 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Grammar/MatcherFactory.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/MatcherFactory.php @@ -6,7 +6,10 @@ namespace Wikimedia\CSS\Grammar; +use Wikimedia\CSS\Objects\ComponentValueList; use Wikimedia\CSS\Objects\Token; +use Wikimedia\CSS\Parser\Parser; +use Wikimedia\CSS\Sanitizer\PropertySanitizer; /** * Factory for predefined Grammar matchers @@ -16,12 +19,14 @@ class MatcherFactory { /** @var MatcherFactory|null */ private static $instance = null; - /** @var Matcher[] Cache of constructed matchers */ + /** @var (Matcher|Matcher[])[] Cache of constructed matchers */ protected $cache = []; /** @var string[] length units */ - protected static $lengthUnits = [ 'em', 'ex', 'ch', 'rem', 'vw', 'vh', - 'vmin', 'vmax', 'cm', 'mm', 'Q', 'in', 'pc', 'pt', 'px' ]; + protected static $lengthUnits = [ + 'em', 'ex', 'ch', 'rem', 'vw', 'vh', 'vmin', 'vmax', + 'cm', 'mm', 'Q', 'in', 'pc', 'pt', 'px' + ]; /** @var string[] angle units */ protected static $angleUnits = [ 'deg', 'grad', 
'rad', 'turn' ]; @@ -87,9 +92,32 @@ class MatcherFactory { return $this->cache[__METHOD__]; } + /** + * Matcher for a <custom-ident> + * + * Note this doesn't implement the semantic restriction about assigning + * meaning to various idents in a complex value, as CSS Sanitizer doesn't + * deal with semantics on that level. + * + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#identifier-value + * @param string[] $exclude Additional values to exclude, all-lowercase. + * @return Matcher + */ + public function customIdent( array $exclude = [] ) { + $exclude = array_merge( [ + // https://www.w3.org/TR/2019/CR-css-values-3-20190606/#common-keywords + 'initial', 'inherit', 'unset', 'default', + // https://www.w3.org/TR/2018/CR-css-cascade-4-20180828/#all-shorthand + 'revert' + ], $exclude ); + return new TokenMatcher( Token::T_IDENT, static function ( Token $t ) use ( $exclude ) { + return !in_array( strtolower( $t->value() ), $exclude, true ); + } ); + } + /** * Matcher for a string - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#strings + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#strings * @warning If the string will be used as a URL, use self::urlstring() instead. * @return Matcher */ @@ -112,7 +140,7 @@ class MatcherFactory { /** * Matcher for a URL - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#urls + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#urls * @param string $type Type of resource referenced, e.g. "image" or "audio". * Not used here, but might be used by a subclass to validate the URL more strictly.
* @return Matcher @@ -126,24 +154,28 @@ class MatcherFactory { /** * CSS-wide value keywords - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#common-keywords + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#common-keywords * @return Matcher */ public function cssWideKeywords() { if ( !isset( $this->cache[__METHOD__] ) ) { - $this->cache[__METHOD__] = new KeywordMatcher( [ 'initial', 'inherit', 'unset' ] ); + $this->cache[__METHOD__] = new KeywordMatcher( [ + // https://www.w3.org/TR/2019/CR-css-values-3-20190606/#common-keywords + 'initial', 'inherit', 'unset', + // added by https://www.w3.org/TR/2018/CR-css-cascade-4-20180828/#all-shorthand + 'revert' + ] ); } return $this->cache[__METHOD__]; } /** - * Add calc() support to a basic type matcher - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#calc-notation + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#calc-notation * @param Matcher $typeMatcher Matcher for the type * @param string $type Type being matched - * @return Matcher + * @return Matcher[] */ - public function calc( Matcher $typeMatcher, $type ) { + protected function calcInternal( Matcher $typeMatcher, $type ) { if ( $type === 'integer' ) { $num = $this->rawInteger(); } else { @@ -164,13 +196,23 @@ class MatcherFactory { &$calcValue, Quantifier::star( new Juxtaposition( [ $ows, new DelimMatcher( '*' ), $ows, &$calcValue ] ) ) ] ); - } else { + } elseif ( $typeMatcher === $this->rawNumber() ) { $calcProduct = new Juxtaposition( [ &$calcValue, - Quantifier::star( new Alternative( [ - new Juxtaposition( [ $ows, new DelimMatcher( '*' ), $ows, &$calcValue ] ), - new Juxtaposition( [ $ows, new DelimMatcher( '/' ), $ows, $this->rawNumber() ] ), - ] ) ), + Quantifier::star( + new Juxtaposition( [ $ows, new DelimMatcher( [ '*', '/' ] ), $ows, &$calcValue ] ) + ), + ] ); + } else { + $calcNumValue = $this->calcInternal( $this->rawNumber(), 'number' )[1]; + $calcProduct = new Juxtaposition( [ + &$calcValue, + 
Quantifier::star( + new Alternative( [ + new Juxtaposition( [ $ows, new DelimMatcher( '*' ), $ows, &$calcValue ] ), + new Juxtaposition( [ $ows, new DelimMatcher( '/' ), $ows, $calcNumValue, ] ), + ] ) + ), ] ); } @@ -200,17 +242,31 @@ class MatcherFactory { ] ); } - return new Alternative( [ $typeMatcher, $calcFunc ] ); + return [ + new Alternative( [ $typeMatcher, $calcFunc ] ), + $calcValue, + ]; + } + + /** + * Add calc() support to a basic type matcher + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#calc-notation + * @param Matcher $typeMatcher Matcher for the type + * @param string $type Type being matched + * @return Matcher + */ + public function calc( Matcher $typeMatcher, $type ) { + return $this->calcInternal( $typeMatcher, $type )[0]; } /** * Matcher for an integer value, without calc() - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#integers + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#integers * @return Matcher */ protected function rawInteger() { if ( !isset( $this->cache[__METHOD__] ) ) { - $this->cache[__METHOD__] = new TokenMatcher( Token::T_NUMBER, function ( Token $t ) { + $this->cache[__METHOD__] = new TokenMatcher( Token::T_NUMBER, static function ( Token $t ) { // The spec says it must match /^[+-]\d+$/, but the tokenizer // should have marked any other number token as a 'number' // anyway so let's not bother checking. 
@@ -222,7 +278,7 @@ class MatcherFactory { /** * Matcher for an integer value - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#integers + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#integers * @return Matcher */ public function integer() { @@ -234,7 +290,7 @@ class MatcherFactory { /** * Matcher for a real number, without calc() - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#numbers + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#numbers * @return Matcher */ public function rawNumber() { @@ -246,7 +302,7 @@ class MatcherFactory { /** * Matcher for a real number - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#numbers + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#numbers * @return Matcher */ public function number() { @@ -258,7 +314,7 @@ class MatcherFactory { /** * Matcher for a percentage value, without calc() - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#percentages + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#percentages * @return Matcher */ public function rawPercentage() { @@ -270,7 +326,7 @@ class MatcherFactory { /** * Matcher for a percentage value - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#percentages + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#percentages * @return Matcher */ public function percentage() { @@ -282,7 +338,7 @@ class MatcherFactory { /** * Matcher for a length-percentage value - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#typedef-length-percentage + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#typedef-length-percentage * @return Matcher */ public function lengthPercentage() { @@ -297,7 +353,7 @@ class MatcherFactory { /** * Matcher for a frequency-percentage value - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#typedef-frequency-percentage + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#typedef-frequency-percentage * @return 
Matcher */ public function frequencyPercentage() { @@ -311,8 +367,8 @@ class MatcherFactory { } /** - * Matcher for a angle-percentage value - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#typedef-angle-percentage + * Matcher for an angle-percentage value + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#typedef-angle-percentage * @return Matcher */ public function anglePercentage() { @@ -327,7 +383,7 @@ class MatcherFactory { /** * Matcher for a time-percentage value - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#typedef-time-percentage + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#typedef-time-percentage * @return Matcher */ public function timePercentage() { @@ -342,7 +398,7 @@ class MatcherFactory { /** * Matcher for a number-percentage value - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#typedef-number-percentage + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#typedef-number-percentage * @return Matcher */ public function numberPercentage() { @@ -357,7 +413,7 @@ class MatcherFactory { /** * Matcher for a dimension value - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#dimensions + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#dimensions * @return Matcher */ public function dimension() { @@ -371,9 +427,9 @@ class MatcherFactory { * Matches the number 0 * @return Matcher */ - protected function zero() { + public function zero() { if ( !isset( $this->cache[__METHOD__] ) ) { - $this->cache[__METHOD__] = new TokenMatcher( Token::T_NUMBER, function ( Token $t ) { + $this->cache[__METHOD__] = new TokenMatcher( Token::T_NUMBER, static function ( Token $t ) { return $t->value() === 0 || $t->value() === 0.0; } ); } @@ -382,16 +438,16 @@ class MatcherFactory { /** * Matcher for a length value, without calc() - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#lengths + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#lengths * @return 
Matcher */ protected function rawLength() { if ( !isset( $this->cache[__METHOD__] ) ) { - $unitsRe = '/^(' . join( '|', self::$lengthUnits ) . ')$/i'; + $unitsRe = '/^(' . implode( '|', self::$lengthUnits ) . ')$/i'; $this->cache[__METHOD__] = new Alternative( [ $this->zero(), - new TokenMatcher( Token::T_DIMENSION, function ( Token $t ) use ( $unitsRe ) { + new TokenMatcher( Token::T_DIMENSION, static function ( Token $t ) use ( $unitsRe ) { return preg_match( $unitsRe, $t->unit() ); } ), ] ); @@ -401,7 +457,7 @@ class MatcherFactory { /** * Matcher for a length value - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#lengths + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#lengths * @return Matcher */ public function length() { @@ -413,26 +469,25 @@ class MatcherFactory { /** * Matcher for an angle value, without calc() - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#angles + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#angles * @return Matcher */ protected function rawAngle() { if ( !isset( $this->cache[__METHOD__] ) ) { - $unitsRe = '/^(' . join( '|', self::$angleUnits ) . ')$/i'; + $unitsRe = '/^(' . implode( '|', self::$angleUnits ) . 
')$/i'; - $this->cache[__METHOD__] = new Alternative( [ - $this->zero(), - new TokenMatcher( Token::T_DIMENSION, function ( Token $t ) use ( $unitsRe ) { + $this->cache[__METHOD__] = new TokenMatcher( Token::T_DIMENSION, + static function ( Token $t ) use ( $unitsRe ) { return preg_match( $unitsRe, $t->unit() ); - } ), - ] ); + } + ); } return $this->cache[__METHOD__]; } /** * Matcher for an angle value - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#angles + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#angles * @return Matcher */ public function angle() { @@ -444,15 +499,15 @@ class MatcherFactory { /** * Matcher for a duration (time) value, without calc() - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#time + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#time * @return Matcher */ protected function rawTime() { if ( !isset( $this->cache[__METHOD__] ) ) { - $unitsRe = '/^(' . join( '|', self::$timeUnits ) . ')$/i'; + $unitsRe = '/^(' . implode( '|', self::$timeUnits ) . ')$/i'; $this->cache[__METHOD__] = new TokenMatcher( Token::T_DIMENSION, - function ( Token $t ) use ( $unitsRe ) { + static function ( Token $t ) use ( $unitsRe ) { return preg_match( $unitsRe, $t->unit() ); } ); @@ -462,7 +517,7 @@ class MatcherFactory { /** * Matcher for a duration (time) value - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#time + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#time * @return Matcher */ public function time() { @@ -474,15 +529,15 @@ class MatcherFactory { /** * Matcher for a frequency value, without calc() - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#frequency + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#frequency * @return Matcher */ protected function rawFrequency() { if ( !isset( $this->cache[__METHOD__] ) ) { - $unitsRe = '/^(' . join( '|', self::$frequencyUnits ) . ')$/i'; + $unitsRe = '/^(' . implode( '|', self::$frequencyUnits ) . 
')$/i'; $this->cache[__METHOD__] = new TokenMatcher( Token::T_DIMENSION, - function ( Token $t ) use ( $unitsRe ) { + static function ( Token $t ) use ( $unitsRe ) { return preg_match( $unitsRe, $t->unit() ); } ); @@ -492,7 +547,7 @@ class MatcherFactory { /** * Matcher for a frequency value - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#frequency + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#frequency * @return Matcher */ public function frequency() { @@ -504,12 +559,12 @@ class MatcherFactory { /** * Matcher for a resolution value - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#resolution + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#resolution * @return Matcher */ public function resolution() { if ( !isset( $this->cache[__METHOD__] ) ) { - $this->cache[__METHOD__] = new TokenMatcher( Token::T_DIMENSION, function ( Token $t ) { + $this->cache[__METHOD__] = new TokenMatcher( Token::T_DIMENSION, static function ( Token $t ) { return preg_match( '/^(dpi|dpcm|dppx)$/i', $t->unit() ); } ); } @@ -543,7 +598,7 @@ class MatcherFactory { /** * Matcher for a color value - * @see https://www.w3.org/TR/2011/REC-css3-color-20110607/#colorunits + * @see https://www.w3.org/TR/2018/REC-css-color-3-20180619/#colorunits * @return Matcher */ public function color() { @@ -592,7 +647,7 @@ class MatcherFactory { // Other keywords. Intentionally omitting the deprecated system colors. 
'transparent', 'currentColor', ] ), - new TokenMatcher( Token::T_HASH, function ( Token $t ) { + new TokenMatcher( Token::T_HASH, static function ( Token $t ) { return preg_match( '/^([0-9a-f]{3}|[0-9a-f]{6})$/i', $t->value() ); } ), ], $this->colorFuncs() ) ); @@ -602,31 +657,33 @@ class MatcherFactory { /** * Matcher for an image value - * @see https://www.w3.org/TR/2012/CR-css3-images-20120417/#image-values + * @see https://www.w3.org/TR/2019/CR-css-images-3-20191010/#image-values * @return Matcher */ public function image() { if ( !isset( $this->cache[__METHOD__] ) ) { - // https://www.w3.org/TR/2012/CR-css3-images-20120417/#image-list-type - // Note the undefined production has been dropped from the Editor's Draft. - $imageDecl = new Alternative( [ - $this->url( 'image' ), - $this->urlstring( 'image' ), - ] ); - - // https://www.w3.org/TR/2012/CR-css3-images-20120417/#gradients + // https://www.w3.org/TR/2019/CR-css-images-3-20191010/#gradients $c = $this->comma(); - $colorStops = Quantifier::hash( new Juxtaposition( [ + $colorStop = UnorderedGroup::allOf( [ $this->color(), - // Not really , but grammatically the same Quantifier::optional( $this->lengthPercentage() ), - ] ), 2, INF ); + ] ); + $colorStopList = new Juxtaposition( [ + $colorStop, + Quantifier::hash( new Juxtaposition( [ + Quantifier::optional( $this->lengthPercentage() ), + $colorStop + ], true ) ), + ], true ); $atPosition = new Juxtaposition( [ new KeywordMatcher( 'at' ), $this->position() ] ); $linearGradient = new Juxtaposition( [ Quantifier::optional( new Juxtaposition( [ new Alternative( [ - $this->angle(), + new Alternative( [ + $this->zero(), + $this->angle(), + ] ), new Juxtaposition( [ new KeywordMatcher( 'to' ), UnorderedGroup::someOf( [ new KeywordMatcher( [ 'left', 'right' ] ), new KeywordMatcher( [ 'top', 'bottom' ] ), @@ -634,7 +691,7 @@ class MatcherFactory { ] ), $c ] ) ), - $colorStops, + $colorStopList, ] ); $radialGradient = new Juxtaposition( [ Quantifier::optional( new 
Juxtaposition( [ @@ -644,13 +701,12 @@ class MatcherFactory { UnorderedGroup::someOf( [ new KeywordMatcher( 'circle' ), $this->length() ] ), UnorderedGroup::someOf( [ new KeywordMatcher( 'ellipse' ), - // Not really , but grammatically the same Quantifier::count( $this->lengthPercentage(), 2, 2 ) ] ), UnorderedGroup::someOf( [ new KeywordMatcher( [ 'circle', 'ellipse' ] ), new KeywordMatcher( [ - 'closest-side', 'farthest-side', 'closest-corner', 'farthest-corner' + 'closest-corner', 'closest-side', 'farthest-corner', 'farthest-side', ] ), ] ), ] ), @@ -660,16 +716,12 @@ class MatcherFactory { ] ), $c ] ) ), - $colorStops, + $colorStopList, ] ); // Putting it all together $this->cache[__METHOD__] = new Alternative( [ $this->url( 'image' ), - new FunctionMatcher( 'image', new Juxtaposition( [ - Quantifier::star( new Juxtaposition( [ $imageDecl, $c ] ) ), - new Alternative( [ $imageDecl, $this->color() ] ), - ] ) ), new FunctionMatcher( 'linear-gradient', $linearGradient ), new FunctionMatcher( 'radial-gradient', $radialGradient ), new FunctionMatcher( 'repeating-linear-gradient', $linearGradient ), @@ -681,10 +733,41 @@ class MatcherFactory { /** * Matcher for a position value - * @see https://www.w3.org/TR/2014/CR-css3-background-20140909/#ltpositiongt + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#typedef-position * @return Matcher */ public function position() { + if ( !isset( $this->cache[__METHOD__] ) ) { + $lp = $this->lengthPercentage(); + $center = new KeywordMatcher( 'center' ); + $leftRight = new KeywordMatcher( [ 'left', 'right' ] ); + $topBottom = new KeywordMatcher( [ 'top', 'bottom' ] ); + + $this->cache[__METHOD__] = new Alternative( [ + UnorderedGroup::someOf( [ + new Alternative( [ $center, $leftRight ] ), + new Alternative( [ $center, $topBottom ] ), + ] ), + new Juxtaposition( [ + new Alternative( [ $center, $leftRight, $lp ] ), + Quantifier::optional( new Alternative( [ $center, $topBottom, $lp ] ) ), + ] ), + + 
UnorderedGroup::allOf( [ + new Juxtaposition( [ $leftRight, $lp ] ), + new Juxtaposition( [ $topBottom, $lp ] ), + ] ), + ] ); + } + return $this->cache[__METHOD__]; + } + + /** + * Matcher for a bg-position value + * @see https://www.w3.org/TR/2017/CR-css-backgrounds-3-20171017/#typedef-bg-position + * @return Matcher + */ + public function bgPosition() { if ( !isset( $this->cache[__METHOD__] ) ) { $lp = $this->lengthPercentage(); $olp = Quantifier::optional( $lp ); @@ -709,7 +792,7 @@ class MatcherFactory { /** * Matcher for a CSS media query - * @see https://www.w3.org/TR/2016/WD-mediaqueries-4-20160706/#mq-syntax + * @see https://www.w3.org/TR/2017/CR-mediaqueries-4-20170905/#mq-syntax * @param bool $strict Only allow defined query types * @return Matcher */ @@ -736,10 +819,10 @@ class MatcherFactory { ]; $mfName = new KeywordMatcher( array_merge( $rangeFeatures, - array_map( function ( $f ) { + array_map( static function ( $f ) { return "min-$f"; }, $rangeFeatures ), - array_map( function ( $f ) { + array_map( static function ( $f ) { return "max-$f"; }, $rangeFeatures ), $discreteFeatures @@ -757,7 +840,7 @@ class MatcherFactory { } $posInt = $this->calc( - new TokenMatcher( Token::T_NUMBER, function ( Token $t ) { + new TokenMatcher( Token::T_NUMBER, static function ( Token $t ) { return $t->typeFlag() === 'integer' && preg_match( '/^\+?\d+$/', $t->representation() ); } ), 'integer' @@ -777,25 +860,38 @@ class MatcherFactory { new Juxtaposition( [ $posInt, new DelimMatcher( '/' ), $posInt ] ), ] ); - $mediaInParens = new NothingMatcher(); // temporary + // temporary + $mediaInParens = new NothingMatcher(); $mediaNot = new Juxtaposition( [ new KeywordMatcher( 'not' ), &$mediaInParens ] ); - $mediaAnd = new Juxtaposition( [ - &$mediaInParens, - Quantifier::plus( new Juxtaposition( [ new KeywordMatcher( 'and' ), &$mediaInParens ] ) ) + $mediaAnd = new Juxtaposition( [ new KeywordMatcher( 'and' ), &$mediaInParens ] ); + $mediaOr = new Juxtaposition( [ new 
KeywordMatcher( 'or' ), &$mediaInParens ] ); + $mediaCondition = new Alternative( [ + $mediaNot, + new Juxtaposition( [ + &$mediaInParens, + new Alternative( [ + Quantifier::star( $mediaAnd ), + Quantifier::star( $mediaOr ), + ] ) + ] ), ] ); - $mediaOr = new Juxtaposition( [ - &$mediaInParens, - Quantifier::plus( new Juxtaposition( [ new KeywordMatcher( 'or' ), &$mediaInParens ] ) ) + $mediaConditionWithoutOr = new Alternative( [ + $mediaNot, + new Juxtaposition( [ &$mediaInParens, Quantifier::star( $mediaAnd ) ] ), ] ); - $mediaCondition = new Alternative( [ $mediaNot, $mediaAnd, $mediaOr, &$mediaInParens ] ); - $mediaConditionWithoutOr = new Alternative( [ $mediaNot, $mediaAnd, &$mediaInParens ] ); $mediaFeature = new BlockMatcher( Token::T_LEFT_PAREN, new Alternative( [ - new Juxtaposition( [ $mfName, new TokenMatcher( Token::T_COLON ), $mfValue ] ), // - $mfName, // - new Juxtaposition( [ $mfName, $ltgteq, $mfValue ] ), // , 1st alternative - new Juxtaposition( [ $mfValue, $ltgteq, $mfName ] ), // , 2nd alternative - new Juxtaposition( [ $mfValue, $lteq, $mfName, $lteq, $mfValue ] ), // , 3rd alt - new Juxtaposition( [ $mfValue, $gteq, $mfName, $gteq, $mfValue ] ), // , 4th alt + // + new Juxtaposition( [ $mfName, new TokenMatcher( Token::T_COLON ), $mfValue ] ), + // + $mfName, + // , 1st alternative + new Juxtaposition( [ $mfName, $ltgteq, $mfValue ] ), + // , 2nd alternative + new Juxtaposition( [ $mfValue, $ltgteq, $mfName ] ), + // , 3rd alt + new Juxtaposition( [ $mfValue, $lteq, $mfName, $lteq, $mfValue ] ), + // , 4th alt + new Juxtaposition( [ $mfValue, $gteq, $mfName, $gteq, $mfValue ] ), ] ) ); $mediaInParens = new Alternative( [ new BlockMatcher( Token::T_LEFT_PAREN, $mediaCondition ), @@ -821,7 +917,7 @@ class MatcherFactory { /** * Matcher for a CSS media query list - * @see https://www.w3.org/TR/2016/WD-mediaqueries-4-20160706/#mq-syntax + * @see https://www.w3.org/TR/2017/CR-mediaqueries-4-20170905/#mq-syntax * @param bool $strict Only allow 
defined query types * @return Matcher */ @@ -834,15 +930,114 @@ class MatcherFactory { return $this->cache[$key]; } - /************************************************************************//** + /** + * Matcher for a "supports-condition" + * @see https://www.w3.org/TR/2013/CR-css3-conditional-20130404/#supports_condition + * @param PropertySanitizer|null $declarationSanitizer Check declarations against this Sanitizer + * @param bool $strict Only accept defined syntax. Default true. + * @return Matcher + */ + public function cssSupportsCondition( + PropertySanitizer $declarationSanitizer = null, $strict = true + ) { + $ws = $this->significantWhitespace(); + $anythingPlus = new AnythingMatcher( [ 'quantifier' => '+' ] ); + + if ( $strict ) { + $generalEnclosed = new NothingMatcher(); + } else { + $generalEnclosed = new Alternative( [ + new FunctionMatcher( null, $anythingPlus ), + new BlockMatcher( Token::T_LEFT_PAREN, new Juxtaposition( [ $this->ident(), $anythingPlus ] ) ), + ] ); + } + + // temp + $supportsConditionBlock = new NothingMatcher(); + $supportsConditionInParens = new Alternative( [ + &$supportsConditionBlock, + new BlockMatcher( Token::T_LEFT_PAREN, $this->cssDeclaration( $declarationSanitizer ) ), + $generalEnclosed, + ] ); + $supportsCondition = new Alternative( [ + new Juxtaposition( [ new KeywordMatcher( 'not' ), $ws, $supportsConditionInParens ] ), + new Juxtaposition( [ $supportsConditionInParens, Quantifier::plus( new Juxtaposition( [ + $ws, new KeywordMatcher( 'and' ), $ws, $supportsConditionInParens + ] ) ) ] ), + new Juxtaposition( [ $supportsConditionInParens, Quantifier::plus( new Juxtaposition( [ + $ws, new KeywordMatcher( 'or' ), $ws, $supportsConditionInParens + ] ) ) ] ), + $supportsConditionInParens, + ] ); + $supportsConditionBlock = new BlockMatcher( Token::T_LEFT_PAREN, $supportsCondition ); + + return $supportsCondition; + } + + /** + * Matcher for a declaration + * @param PropertySanitizer|null $declarationSanitizer Check 
declarations against this Sanitizer + * @return Matcher + */ + public function cssDeclaration( PropertySanitizer $declarationSanitizer = null ) { + $anythingPlus = new AnythingMatcher( [ 'quantifier' => '+' ] ); + + return new CheckedMatcher( + $anythingPlus, + static function ( ComponentValueList $list, GrammarMatch $match, array $options ) + use ( $declarationSanitizer ) + { + $cvlist = new ComponentValueList( $match->getValues() ); + $parser = Parser::newFromTokens( $cvlist->toTokenArray() ); + $declaration = $parser->parseDeclaration(); + if ( !$declaration || $parser->getParseErrors() ) { + return false; + } + if ( !$declarationSanitizer ) { + return true; + } + $reset = $declarationSanitizer->stashSanitizationErrors(); + $ret = $declarationSanitizer->sanitize( $declaration ); + $errors = $declarationSanitizer->getSanitizationErrors(); + unset( $reset ); + return $ret === $declaration && !$errors; + } + ); + } + + /** + * Matcher for single easing functions from CSS Easing Functions Level 1 + * @see https://www.w3.org/TR/2019/CR-css-easing-1-20190430/#typedef-easing-function + * @return Matcher + */ + public function cssSingleEasingFunction() { + if ( !isset( $this->cache[__METHOD__] ) ) { + $this->cache[__METHOD__] = new Alternative( [ + new KeywordMatcher( [ + 'ease', 'linear', 'ease-in', 'ease-out', 'ease-in-out', 'step-start', 'step-end' + ] ), + new FunctionMatcher( 'steps', new Juxtaposition( [ + $this->integer(), + Quantifier::optional( new KeywordMatcher( [ + 'jump-start', 'jump-end', 'jump-none', 'jump-both', 'start', 'end' + ] ) ), + ], true ) ), + new FunctionMatcher( 'cubic-bezier', Quantifier::hash( $this->number(), 4, 4 ) ), + ] ); + } + + return $this->cache[__METHOD__]; + } + + /** * @name CSS Selectors Level 3 * @{ * - * https://www.w3.org/TR/2011/REC-css3-selectors-20110929/#w3cselgrammar + * https://www.w3.org/TR/2018/REC-selectors-3-20181106/#w3cselgrammar */ /** - * List of selectors + * List of selectors (selectors_group) * * selector [ 
COMMA S* selector ]* * @@ -862,7 +1057,7 @@ class MatcherFactory { } /** - * A single selector + * A single selector (selector) * * simple_selector_sequence [ combinator simple_selector_sequence ]* * @@ -886,7 +1081,7 @@ class MatcherFactory { } /** - * A CSS combinator + * A CSS combinator (combinator) * * PLUS S* | GREATER S* | TILDE S* | S+ * @@ -910,7 +1105,7 @@ class MatcherFactory { } /** - * A simple selector sequence + * A simple selector sequence (simple_selector_sequence) * * [ type_selector | universal ] * [ HASH | class | attrib | pseudo | negation ]* @@ -952,7 +1147,7 @@ class MatcherFactory { } /** - * A type selector (i.e. a tag name) + * A type selector, i.e. a tag name (type_selector) * * [ namespace_prefix ] ? element_name * @@ -974,7 +1169,7 @@ class MatcherFactory { } /** - * A namespace prefix + * A namespace prefix (namespace_prefix) * * [ IDENT | '*' ]? '|' * @@ -1010,7 +1205,7 @@ class MatcherFactory { } /** - * The universal selector + * The universal selector (universal) * * [ namespace_prefix ]? '*' * @@ -1036,7 +1231,7 @@ class MatcherFactory { */ public function cssID() { if ( !isset( $this->cache[__METHOD__] ) ) { - $this->cache[__METHOD__] = new TokenMatcher( Token::T_HASH, function ( Token $t ) { + $this->cache[__METHOD__] = new TokenMatcher( Token::T_HASH, static function ( Token $t ) { return $t->typeFlag() === 'id'; } ); $this->cache[__METHOD__]->setDefaultOptions( [ 'skip-whitespace' => false ] ); @@ -1045,7 +1240,7 @@ class MatcherFactory { } /** - * A class selector + * A class selector (class) * * '.' IDENT * @@ -1063,7 +1258,7 @@ class MatcherFactory { } /** - * An attribute selector + * An attribute selector (attrib) * * '[' S* [ namespace_prefix ]? 
IDENT S* * [ [ PREFIXMATCH | @@ -1094,14 +1289,12 @@ class MatcherFactory { ] )->capture( 'attribute' ), $this->optionalWhitespace(), Quantifier::optional( new Juxtaposition( [ - Alternative::create( [ - new TokenMatcher( Token::T_PREFIX_MATCH ), - new TokenMatcher( Token::T_SUFFIX_MATCH ), - new TokenMatcher( Token::T_SUBSTRING_MATCH ), + // Sigh. They removed various tokens from CSS Syntax 3, but didn't update the grammar + // in CSS Selectors 3. Wing it with a hint from CSS Selectors 4's + ( new Juxtaposition( [ + Quantifier::optional( new DelimMatcher( [ '^', '$', '*', '~', '|' ] ) ), new DelimMatcher( [ '=' ] ), - new TokenMatcher( Token::T_INCLUDE_MATCH ), - new TokenMatcher( Token::T_DASH_MATCH ), - ] )->capture( 'test' ), + ] ) )->capture( 'test' ), $this->optionalWhitespace(), Alternative::create( [ $this->ident(), @@ -1117,14 +1310,18 @@ class MatcherFactory { } /** - * A pseudo-class or pseudo-element + * A pseudo-class or pseudo-element (pseudo) * * ':' ':'? [ IDENT | functional_pseudo ] * + * Where functional_pseudo is + * + * FUNCTION S* expression ')' + * * Although this actually only matches the pseudo-selectors defined in the * following sources: - * - https://www.w3.org/TR/2011/REC-css3-selectors-20110929/#pseudo-classes - * - https://www.w3.org/TR/2016/WD-css-pseudo-4-20160607/ + * - https://www.w3.org/TR/2018/REC-selectors-3-20181106/#pseudo-classes + * - https://www.w3.org/TR/2019/WD-css-pseudo-4-20190225/ * * @return Matcher */ @@ -1156,7 +1353,7 @@ class MatcherFactory { $colon, new KeywordMatcher( [ 'first-line', 'first-letter', 'before', 'after', 'selection', 'inactive-selection', - 'spelling-error', 'grammar-error', 'placeholder' + 'spelling-error', 'grammar-error', 'marker', 'placeholder' ] ), ] ), ] ); @@ -1168,44 +1365,43 @@ class MatcherFactory { /** * An "AN+B" form * - * https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#anb + * https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#anb-microsyntax * * @return Matcher */ public 
function cssANplusB() { if ( !isset( $this->cache[__METHOD__] ) ) { // Quoth the spec: - // > The An+B notation was originally defined using a slightly - // > different tokenizer than the rest of CSS, resulting in a - // > somewhat odd definition when expressed in terms of CSS tokens. + // > The An+B notation was originally defined using a slightly + // > different tokenizer than the rest of CSS, resulting in a + // > somewhat odd definition when expressed in terms of CSS tokens. // That's a bit of an understatement - $plus = new DelimMatcher( [ '+' ] ); $plusQ = Quantifier::optional( new DelimMatcher( [ '+' ] ) ); $n = new KeywordMatcher( [ 'n' ] ); $dashN = new KeywordMatcher( [ '-n' ] ); $nDash = new KeywordMatcher( [ 'n-' ] ); $plusQN = new Juxtaposition( [ $plusQ, $n ] ); $plusQNDash = new Juxtaposition( [ $plusQ, $nDash ] ); - $nDimension = new TokenMatcher( Token::T_DIMENSION, function ( Token $t ) { + $nDimension = new TokenMatcher( Token::T_DIMENSION, static function ( Token $t ) { return $t->typeFlag() === 'integer' && !strcasecmp( $t->unit(), 'n' ); } ); - $nDashDimension = new TokenMatcher( Token::T_DIMENSION, function ( Token $t ) { + $nDashDimension = new TokenMatcher( Token::T_DIMENSION, static function ( Token $t ) { return $t->typeFlag() === 'integer' && !strcasecmp( $t->unit(), 'n-' ); } ); - $nDashDigitDimension = new TokenMatcher( Token::T_DIMENSION, function ( Token $t ) { + $nDashDigitDimension = new TokenMatcher( Token::T_DIMENSION, static function ( Token $t ) { return $t->typeFlag() === 'integer' && preg_match( '/^n-\d+$/i', $t->unit() ); } ); - $nDashDigitIdent = new TokenMatcher( Token::T_IDENT, function ( Token $t ) { + $nDashDigitIdent = new TokenMatcher( Token::T_IDENT, static function ( Token $t ) { return preg_match( '/^n-\d+$/i', $t->value() ); } ); - $dashNDashDigitIdent = new TokenMatcher( Token::T_IDENT, function ( Token $t ) { + $dashNDashDigitIdent = new TokenMatcher( Token::T_IDENT, static function ( Token $t ) { return 
preg_match( '/^-n-\d+$/i', $t->value() ); } ); - $signedInt = new TokenMatcher( Token::T_NUMBER, function ( Token $t ) { + $signedInt = new TokenMatcher( Token::T_NUMBER, static function ( Token $t ) { return $t->typeFlag() === 'integer' && preg_match( '/^[+-]/', $t->representation() ); } ); - $signlessInt = new TokenMatcher( Token::T_NUMBER, function ( Token $t ) { + $signlessInt = new TokenMatcher( Token::T_NUMBER, static function ( Token $t ) { return $t->typeFlag() === 'integer' && preg_match( '/^\d/', $t->representation() ); } ); $plusOrMinus = new DelimMatcher( [ '+', '-' ] ); @@ -1213,7 +1409,7 @@ class MatcherFactory { $this->cache[__METHOD__] = new Alternative( [ new KeywordMatcher( [ 'odd', 'even' ] ), - new TokenMatcher( Token::T_NUMBER, function ( Token $t ) { + new TokenMatcher( Token::T_NUMBER, static function ( Token $t ) { return $t->typeFlag() === 'integer'; } ), $nDimension, @@ -1238,7 +1434,7 @@ class MatcherFactory { } /** - * A negation + * A negation (negation) * * ':' not( S* [ type_selector | universal | HASH | class | attrib | pseudo ] S* ')' * @@ -1272,7 +1468,7 @@ class MatcherFactory { return $this->cache[__METHOD__]; } - /**@}*/ + /** @} */ } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/NoWhitespace.php b/lib/css-sanitizer/Wikimedia/CSS/Grammar/NoWhitespace.php index 92f90c7f5..1e3699427 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Grammar/NoWhitespace.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/NoWhitespace.php @@ -14,8 +14,9 @@ use Wikimedia\CSS\Objects\Token; */ class NoWhitespace extends Matcher { + /** @inheritDoc */ protected function generateMatches( ComponentValueList $values, $start, array $options ) { - $cv = isset( $values[$start-1] ) ? $values[$start-1] : null; + $cv = $values[$start - 1] ?? 
null; if ( !$cv instanceof Token || $cv->type() !== Token::T_WHITESPACE ) { yield $this->makeMatch( $values, $start, $start ); } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/NonEmpty.php b/lib/css-sanitizer/Wikimedia/CSS/Grammar/NonEmpty.php index b8810b350..6b7b17600 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Grammar/NonEmpty.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/NonEmpty.php @@ -9,8 +9,8 @@ namespace Wikimedia\CSS\Grammar; use Wikimedia\CSS\Objects\ComponentValueList; /** - * Matcher that requires its sub-Matcher has only non-empty matches ("!" multipier) - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#mult-req + * Matcher that requires its sub-Matcher has only non-empty matches ("!" multiplier) + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#mult-req */ class NonEmpty extends Matcher { /** @var Matcher */ @@ -23,6 +23,7 @@ class NonEmpty extends Matcher { $this->matcher = $matcher; } + /** @inheritDoc */ protected function generateMatches( ComponentValueList $values, $start, array $options ) { foreach ( $this->matcher->generateMatches( $values, $start, $options ) as $match ) { if ( $match->getLength() !== 0 ) { diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/NothingMatcher.php b/lib/css-sanitizer/Wikimedia/CSS/Grammar/NothingMatcher.php index 246e1ed04..100a8c868 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Grammar/NothingMatcher.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/NothingMatcher.php @@ -6,14 +6,15 @@ namespace Wikimedia\CSS\Grammar; -use Wikimedia\CSS\Objects\ComponentValue; +use EmptyIterator; use Wikimedia\CSS\Objects\ComponentValueList; /** * Matcher that matches nothing */ class NothingMatcher extends Matcher { + /** @inheritDoc */ protected function generateMatches( ComponentValueList $values, $start, array $options ) { - return new \EmptyIterator; + return new EmptyIterator; } } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/Quantifier.php 
b/lib/css-sanitizer/Wikimedia/CSS/Grammar/Quantifier.php index 06c874014..d0bb5a6e0 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Grammar/Quantifier.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/Quantifier.php @@ -6,20 +6,25 @@ namespace Wikimedia\CSS\Grammar; +use Iterator; +use UnexpectedValueException; use Wikimedia\CSS\Objects\ComponentValueList; use Wikimedia\CSS\Objects\Token; /** * Matcher that matches a sub-Matcher a certain number of times * ("?", "*", "+", "#", "{A,B}" multipliers) - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#component-multipliers + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#component-multipliers */ class Quantifier extends Matcher { /** @var Matcher */ protected $matcher; /** @var int */ - protected $min, $max; + protected $min; + + /** @var int */ + protected $max; /** @var bool Whether matches are comma-separated */ protected $commas; @@ -39,7 +44,7 @@ class Quantifier extends Matcher { /** * Implements "?": 0 or 1 matches - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#mult-opt + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#mult-opt * @param Matcher $matcher * @return static */ @@ -49,7 +54,7 @@ class Quantifier extends Matcher { /** * Implements "*": 0 or more matches - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#mult-zero-plus + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#mult-zero-plus * @param Matcher $matcher * @return static */ @@ -59,7 +64,7 @@ class Quantifier extends Matcher { /** * Implements "+": 1 or more matches - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#mult-one-plus + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#mult-one-plus * @param Matcher $matcher * @return static */ @@ -69,7 +74,7 @@ class Quantifier extends Matcher { /** * Implements "{A,B}": Between A and B matches - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#mult-num-range + * @see 
https://www.w3.org/TR/2019/CR-css-values-3-20190606/#mult-num-range * @param Matcher $matcher * @param int|float $min Minimum number of matches * @param int|float $max Maximum number of matches @@ -81,7 +86,7 @@ class Quantifier extends Matcher { /** * Implements "#" and "#{A,B}": Between A and B matches, comma-separated - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#mult-comma + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#mult-comma * @param Matcher $matcher * @param int|float $min Minimum number of matches * @param int|float $max Maximum number of matches @@ -91,17 +96,21 @@ class Quantifier extends Matcher { return new static( $matcher, $min, $max, true ); } + /** @inheritDoc */ protected function generateMatches( ComponentValueList $values, $start, array $options ) { $used = []; // Maintain a stack of matches for backtracking purposes. $stack = [ - [ new Match( $values, $start, 0 ), $this->matcher->generateMatches( $values, $start, $options ) ] + [ + new GrammarMatch( $values, $start, 0 ), + $this->matcher->generateMatches( $values, $start, $options ) + ] ]; do { - /** @var $lastMatch Match */ - /** @var $iter \Iterator */ - list( $lastMatch, $iter ) = $stack[count( $stack ) - 1]; + /** @var $lastMatch GrammarMatch */ + /** @var $iter Iterator */ + [ $lastMatch, $iter ] = $stack[count( $stack ) - 1]; // If the top of the stack has no more matches, pop it, maybe // yield the last matched position, and loop. @@ -126,7 +135,7 @@ class Quantifier extends Matcher { // Quantifiers don't work well when the quantified thing can be empty. if ( $match->getLength() === 0 ) { - throw new \UnexpectedValueException( 'Empty match in quantifier!' ); + throw new UnexpectedValueException( 'Empty match in quantifier!' ); } $nextFrom = $match->getNext(); @@ -136,17 +145,19 @@ class Quantifier extends Matcher { $canBeMore = count( $stack ) < $this->max; // Commas are slightly tricky: - // 1. 
If there is a following comma, start the next Matcher after it. - // 2. If not, there can't be any more Matchers following. + // 1. If there is a following comma, start the next Matcher after it. + // 2. If not, there can't be any more Matchers following. // And in either case optional whitespace is always allowed. if ( $this->commas ) { $n = $nextFrom; if ( isset( $values[$n] ) && $values[$n] instanceof Token && + // @phan-suppress-next-line PhanNonClassMethodCall False positive $values[$n]->type() === Token::T_WHITESPACE ) { $n = $this->next( $values, $n, [ 'skip-whitespace' => true ] + $options ); } if ( isset( $values[$n] ) && $values[$n] instanceof Token && + // @phan-suppress-next-line PhanNonClassMethodCall False positive $values[$n]->type() === Token::T_COMMA ) { $nextFrom = $this->next( $values, $n, [ 'skip-whitespace' => true ] + $options ); @@ -156,7 +167,7 @@ class Quantifier extends Matcher { } // If there can be more matches, push another one onto the stack - // and try it. Otherwise yield and continue with the current match. + // and try it. Otherwise, yield and continue with the current match. 
if ( $canBeMore ) { $stack[] = [ $match, $this->matcher->generateMatches( $values, $nextFrom, $options ) ]; } else { diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/TokenMatcher.php b/lib/css-sanitizer/Wikimedia/CSS/Grammar/TokenMatcher.php index 092221875..45a04d729 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Grammar/TokenMatcher.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/TokenMatcher.php @@ -11,7 +11,7 @@ use Wikimedia\CSS\Objects\Token; /** * Matcher that matches a token of a particular type - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#component-types + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#component-types */ class TokenMatcher extends Matcher { /** @var string One of the Token::T_* constants */ @@ -30,8 +30,9 @@ class TokenMatcher extends Matcher { $this->callback = $callback; } + /** @inheritDoc */ protected function generateMatches( ComponentValueList $values, $start, array $options ) { - $cv = isset( $values[$start] ) ? $values[$start] : null; + $cv = $values[$start] ?? 
null; if ( $cv instanceof Token && $cv->type() === $this->type && ( !$this->callback || call_user_func( $this->callback, $cv ) ) ) { diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/UnorderedGroup.php b/lib/css-sanitizer/Wikimedia/CSS/Grammar/UnorderedGroup.php index 9152e906d..98386bb6a 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Grammar/UnorderedGroup.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/UnorderedGroup.php @@ -6,12 +6,15 @@ namespace Wikimedia\CSS\Grammar; +use ArrayIterator; +use EmptyIterator; +use Iterator; use Wikimedia\CSS\Objects\ComponentValueList; use Wikimedia\CSS\Util; /** * Matcher that groups other matchers without ordering ("&&" and "||" combiners) - * @see https://www.w3.org/TR/2016/CR-css-values-3-20160929/#component-combinators + * @see https://www.w3.org/TR/2019/CR-css-values-3-20190606/#component-combinators */ class UnorderedGroup extends Matcher { /** @var Matcher[] */ @@ -48,6 +51,7 @@ class UnorderedGroup extends Matcher { return new static( $matchers, false ); } + /** @inheritDoc */ protected function generateMatches( ComponentValueList $values, $start, array $options ) { $used = []; @@ -55,20 +59,20 @@ class UnorderedGroup extends Matcher { // of remaining matchers. 
$stack = [ [ - new Match( $values, $start, 0 ), + new GrammarMatch( $values, $start, 0 ), $this->matchers, - new \ArrayIterator( $this->matchers ), + new ArrayIterator( $this->matchers ), null, - new \EmptyIterator + new EmptyIterator ] ]; do { - /** @var $lastMatch Match */ + /** @var $lastMatch GrammarMatch */ /** @var $matchers Matcher[] */ - /** @var $matcherIter \Iterator */ + /** @var $matcherIter Iterator */ /** @var $curMatcher Matcher|null */ - /** @var $iter \Iterator */ - list( $lastMatch, $matchers, $matcherIter, $curMatcher, $iter ) = $stack[count( $stack ) - 1]; + /** @var $iter Iterator */ + [ $lastMatch, $matchers, $matcherIter, $curMatcher, $iter ] = $stack[count( $stack ) - 1]; // If the top of the stack has more matches, process the next one. if ( $iter->valid() ) { @@ -76,9 +80,9 @@ class UnorderedGroup extends Matcher { $iter->next(); // If we have unused matchers to try after this one, do so. - // Otherwise yield and continue with the current one. + // Otherwise, yield and continue with the current one. if ( $matchers ) { - $stack[] = [ $match, $matchers, new \ArrayIterator( $matchers ), null, new \EmptyIterator ]; + $stack[] = [ $match, $matchers, new ArrayIterator( $matchers ), null, new EmptyIterator ]; } else { $newMatch = $this->makeMatch( $values, $start, $match->getNext(), $match, $stack ); $mid = $newMatch->getUniqueID(); @@ -91,7 +95,7 @@ class UnorderedGroup extends Matcher { } // We ran out of matches for the current top of the stack. Pop it, - // and put $curMatcher back into $matchers so it can be tried again + // and put $curMatcher back into $matchers, so it can be tried again // at a later position. 
array_pop( $stack ); if ( $curMatcher ) { @@ -109,14 +113,12 @@ class UnorderedGroup extends Matcher { unset( $matchers[$matcherIter->key()] ); $iter = $curMatcher->generateMatches( $values, $fromPos, $options ); $stack[] = [ $lastMatch, $matchers, $matcherIter, $curMatcher, $iter ]; - } else { - if ( $stack && !$this->all ) { - $newMatch = $this->makeMatch( $values, $start, $fromPos, $lastMatch, $stack ); - $mid = $newMatch->getUniqueID(); - if ( !isset( $used[$mid] ) ) { - $used[$mid] = 1; - yield $newMatch; - } + } elseif ( $stack && !$this->all ) { + $newMatch = $this->makeMatch( $values, $start, $fromPos, $lastMatch, $stack ); + $mid = $newMatch->getUniqueID(); + if ( !isset( $used[$mid] ) ) { + $used[$mid] = 1; + yield $newMatch; } } } while ( $stack ); diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/UrangeMatcher.php b/lib/css-sanitizer/Wikimedia/CSS/Grammar/UrangeMatcher.php new file mode 100644 index 000000000..ce6754ea5 --- /dev/null +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/UrangeMatcher.php @@ -0,0 +1,98 @@ +" notation + * + * If this matcher is marked for capturing, its matches will have submatches + * "start" and "end" holding T_NUMBER tokens representing the starting and + * ending codepoints in the range. + * + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#urange + */ +class UrangeMatcher extends Matcher { + /** @var Matcher Syntax matcher */ + private $matcher; + + public function __construct() { + $u = new KeywordMatcher( [ 'u' ] ); + $plus = new DelimMatcher( [ '+' ] ); + $ident = new TokenMatcher( Token::T_IDENT ); + $number = new TokenMatcher( Token::T_NUMBER ); + $dimension = new TokenMatcher( Token::T_DIMENSION ); + $q = new DelimMatcher( [ '?' ] ); + $qs = Quantifier::count( $q, 0, 6 ); + + // This matches a lot of things; we post-process in generateMatches() to limit it to + // only what's actually supposed to be accepted. 
+ $this->matcher = new Alternative( [ + new Juxtaposition( [ $u, $plus, $ident, $qs ] ), + new Juxtaposition( [ $u, $number, $dimension ] ), + new Juxtaposition( [ $u, $number, $number ] ), + new Juxtaposition( [ $u, $dimension, $qs ] ), + new Juxtaposition( [ $u, $number, $qs ] ), + new Juxtaposition( [ $u, $plus, Quantifier::count( $q, 1, 6 ) ] ), + ] ); + } + + /** @inheritDoc */ + protected function generateMatches( ComponentValueList $values, $start, array $options ) { + foreach ( $this->matcher->generateMatches( $values, $start, $options ) as $match ) { + // is basically defined as a series of tokens that happens to have a certain string + // representation. So stringify and regex it to see if it actually matches. + $v = trim( $match->__toString(), "\n\t " ); + // Strip interpolated comments + $v = strtr( $v, [ '/**/' => '' ] ); + $l = strlen( $v ); + if ( preg_match( '/^u\+([0-9a-f]{1,6})-([0-9a-f]{1,6})$/iD', $v, $m ) ) { + $ustart = intval( $m[1], 16 ); + $uend = intval( $m[2], 16 ); + } elseif ( $l > 2 && $l <= 8 && preg_match( '/^u\+([0-9a-f]*\?*)$/iD', $v, $m ) ) { + $ustart = intval( strtr( $m[1], [ '?' => '0' ] ), 16 ); + $uend = intval( strtr( $m[1], [ '?' => 'f' ] ), 16 ); + } else { + continue; + } + if ( $ustart >= 0 && $ustart <= $uend && $uend <= 0x10ffff ) { + $len = $match->getNext() - $start; + $matches = []; + if ( $this->captureName !== null ) { + $tstart = new Token( Token::T_NUMBER, [ 'value' => $ustart, 'typeFlag' => 'integer' ] ); + $tend = new Token( Token::T_NUMBER, [ 'value' => $uend, 'typeFlag' => 'integer' ] ); + $matches = [ + new GrammarMatch( + new ComponentValueList( $tstart->toComponentValueArray() ), + 0, + 1, + 'start', + [] + ), + new GrammarMatch( + new ComponentValueList( $tend->toComponentValueArray() ), + 0, + 1, + 'end', + [] + ), + ]; + } + + // Mark the 'U' T_IDENT beginning a , to later avoid + // serializing it with extraneous comments. 
+ // @see Wikimedia\CSS\Util::stringify() + // @phan-suppress-next-line PhanNonClassMethodCall False positive + $values[$start]->urangeHack( $len ); + + yield new GrammarMatch( $values, $start, $len, $this->captureName, $matches ); + } + } + } +} diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/UrlMatcher.php b/lib/css-sanitizer/Wikimedia/CSS/Grammar/UrlMatcher.php index d427ef500..5a515dc9c 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Grammar/UrlMatcher.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/UrlMatcher.php @@ -6,12 +6,13 @@ namespace Wikimedia\CSS\Grammar; +use InvalidArgumentException; use Wikimedia\CSS\Objects\ComponentValueList; use Wikimedia\CSS\Objects\CSSFunction; use Wikimedia\CSS\Objects\Token; /** - * Matcher that matches a CSSFunction for a url or a T_URL token + * Matcher that matches a CSSFunction for a URL or a T_URL token */ class UrlMatcher extends FunctionMatcher { /** @var callable|null */ @@ -28,7 +29,7 @@ class UrlMatcher extends FunctionMatcher { if ( isset( $options['modifierMatcher'] ) ) { $modifierMatcher = $options['modifierMatcher']; if ( !$modifierMatcher instanceof Matcher ) { - throw new \InvalidArgumentException( 'modifierMatcher must be a Matcher' ); + throw new InvalidArgumentException( 'modifierMatcher must be a Matcher' ); } } else { $modifierMatcher = new NothingMatcher; @@ -54,13 +55,14 @@ class UrlMatcher extends FunctionMatcher { ] ); } + /** @inheritDoc */ protected function generateMatches( ComponentValueList $values, $start, array $options ) { // First, is it a URL token? - $cv = isset( $values[$start] ) ? $values[$start] : null; + $cv = $values[$start] ?? 
null; if ( $cv instanceof Token && $cv->type() === Token::T_URL ) { $url = $cv->value(); if ( !$this->urlCheck || call_user_func( $this->urlCheck, $url, [] ) ) { - $match = new Match( $values, $start, 1, 'url' ); + $match = new GrammarMatch( $values, $start, 1, 'url' ); yield $this->makeMatch( $values, $start, $this->next( $values, $start, $options ), $match ); } return; @@ -73,12 +75,12 @@ class UrlMatcher extends FunctionMatcher { $modifiers = []; foreach ( $match->getCapturedMatches() as $submatch ) { $cvs = $submatch->getValues(); - if ( $submatch->getName() === 'url' ) { + if ( $cvs[0] instanceof Token && $submatch->getName() === 'url' ) { $url = $cvs[0]->value(); } elseif ( $submatch->getName() === 'modifier' ) { if ( $cvs[0] instanceof CSSFunction ) { $modifiers[] = $cvs[0]; - } elseif ( $cvs[0]->type() === Token::T_IDENT ) { + } elseif ( $cvs[0] instanceof Token && $cvs[0]->type() === Token::T_IDENT ) { $modifiers[] = $cvs[0]; } } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Grammar/WhitespaceMatcher.php b/lib/css-sanitizer/Wikimedia/CSS/Grammar/WhitespaceMatcher.php index 9aa074a4d..90b5f6955 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Grammar/WhitespaceMatcher.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Grammar/WhitespaceMatcher.php @@ -26,9 +26,11 @@ class WhitespaceMatcher extends Matcher { $this->significant = !empty( $options['significant'] ); } + /** @inheritDoc */ protected function generateMatches( ComponentValueList $values, $start, array $options ) { $end = $start; while ( isset( $values[$end] ) && + // @phan-suppress-next-line PhanNonClassMethodCall False positive $values[$end] instanceof Token && $values[$end]->type() === Token::T_WHITESPACE ) { $end++; @@ -46,6 +48,7 @@ class WhitespaceMatcher extends Matcher { if ( $end === $start ) { $start--; if ( !$options['skip-whitespace'] || !isset( $values[$start] ) || + // @phan-suppress-next-line PhanNonClassMethodCall False positive !$values[$start] instanceof Token || $values[$start]->type() !== 
Token::T_WHITESPACE ) { return; @@ -54,7 +57,7 @@ class WhitespaceMatcher extends Matcher { // Return the match. Include a 'significantWhitespace' capture. yield $this->makeMatch( $values, $start, $end, - new Match( $values, $start, 1, 'significantWhitespace' ) + new GrammarMatch( $values, $start, 1, 'significantWhitespace' ) ); } } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Objects/AtRule.php b/lib/css-sanitizer/Wikimedia/CSS/Objects/AtRule.php index 1dfc26e7c..cdb4a2b83 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Objects/AtRule.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Objects/AtRule.php @@ -6,6 +6,7 @@ namespace Wikimedia\CSS\Objects; +use InvalidArgumentException; use Wikimedia\CSS\Util; /** @@ -27,7 +28,7 @@ class AtRule extends Rule implements DeclarationOrAtRule { */ public function __construct( Token $token ) { if ( $token->type() !== Token::T_AT_KEYWORD ) { - throw new \InvalidArgumentException( + throw new InvalidArgumentException( "At rule must begin with an at-keyword token, got {$token->type()}" ); } @@ -38,9 +39,9 @@ class AtRule extends Rule implements DeclarationOrAtRule { } public function __clone() { - $this->prelude = clone( $this->prelude ); + $this->prelude = clone $this->prelude; if ( $this->block ) { - $this->block = clone( $this->block ); + $this->block = clone $this->block; } } @@ -83,13 +84,14 @@ class AtRule extends Rule implements DeclarationOrAtRule { */ public function setBlock( SimpleBlock $block = null ) { if ( $block->getStartTokenType() !== Token::T_LEFT_BRACE ) { - throw new \InvalidArgumentException( 'At-rule block must be delimited by {}' ); + throw new InvalidArgumentException( 'At-rule block must be delimited by {}' ); } $this->block = $block; } /** * @param string $function Function to call, toTokenArray() or toComponentValueArray() + * @return Token[]|ComponentValue[] */ private function toTokenOrCVArray( $function ) { $ret = []; @@ -97,7 +99,7 @@ class AtRule extends Rule implements DeclarationOrAtRule { $ret[] = new 
Token( Token::T_AT_KEYWORD, [ 'value' => $this->name, 'position' => [ $this->line, $this->pos ] ] ); - // Manually looping and appending turns out to be noticably faster than array_merge. + // Manually looping and appending turns out to be noticeably faster than array_merge. foreach ( $this->prelude->$function() as $v ) { $ret[] = $v; } @@ -112,10 +114,12 @@ class AtRule extends Rule implements DeclarationOrAtRule { return $ret; } + /** @inheritDoc */ public function toTokenArray() { return $this->toTokenOrCVArray( __FUNCTION__ ); } + /** @inheritDoc */ public function toComponentValueArray() { return $this->toTokenOrCVArray( __FUNCTION__ ); } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Objects/CSSFunction.php b/lib/css-sanitizer/Wikimedia/CSS/Objects/CSSFunction.php index ee03edf8c..a16d2c866 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Objects/CSSFunction.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Objects/CSSFunction.php @@ -6,6 +6,7 @@ namespace Wikimedia\CSS\Objects; +use InvalidArgumentException; use Wikimedia\CSS\Util; /** @@ -24,18 +25,18 @@ class CSSFunction extends ComponentValue { */ public function __construct( Token $token ) { if ( $token->type() !== Token::T_FUNCTION ) { - throw new \InvalidArgumentException( + throw new InvalidArgumentException( "CSS function must begin with a function token, got {$token->type()}" ); } - list( $this->line, $this->pos ) = $token->getPosition(); + [ $this->line, $this->pos ] = $token->getPosition(); $this->name = $token->value(); $this->value = new ComponentValueList(); } public function __clone() { - $this->value = clone( $this->value ); + $this->value = clone $this->value; } /** @@ -48,7 +49,7 @@ class CSSFunction extends ComponentValue { } /** - * Return the functions's name + * Return the function's name * @return string */ public function getName() { @@ -74,7 +75,7 @@ class CSSFunction extends ComponentValue { Token::T_FUNCTION, [ 'value' => $this->name, 'position' => [ $this->line, $this->pos ] ] ); - // Manually 
looping and appending turns out to be noticably faster than array_merge. + // Manually looping and appending turns out to be noticeably faster than array_merge. foreach ( $this->value->toTokenArray() as $v ) { $ret[] = $v; } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Objects/CSSObjectList.php b/lib/css-sanitizer/Wikimedia/CSS/Objects/CSSObjectList.php index 42ecac53b..534f308b2 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Objects/CSSObjectList.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Objects/CSSObjectList.php @@ -6,12 +6,17 @@ namespace Wikimedia\CSS\Objects; +use ArrayAccess; +use Countable; +use InvalidArgumentException; +use OutOfBoundsException; +use SeekableIterator; use Wikimedia\CSS\Util; /** * Represent a list of CSS objects */ -class CSSObjectList implements \Countable, \SeekableIterator, \ArrayAccess, CSSObject { +class CSSObjectList implements Countable, SeekableIterator, ArrayAccess, CSSObject { /** @var string The specific class of object contained */ protected static $objectType; @@ -41,7 +46,7 @@ class CSSObjectList implements \Countable, \SeekableIterator, \ArrayAccess, CSSO /** * Insert one or more objects into the list * @param CSSObject|CSSObject[]|CSSObjectList $objects An object to add, or an array of objects. - * @param int $index Insert the objects at this index. If omitted, the + * @param int|null $index Insert the objects at this index. If omitted, the * objects are added at the end. */ public function add( $objects, $index = null ) { @@ -53,7 +58,7 @@ class CSSObjectList implements \Countable, \SeekableIterator, \ArrayAccess, CSSO static::testObjects( $objects ); } else { if ( !$objects instanceof static::$objectType ) { - throw new \InvalidArgumentException( + throw new InvalidArgumentException( static::class . ' may only contain instances of ' . static::$objectType . '.' 
); } @@ -64,7 +69,7 @@ class CSSObjectList implements \Countable, \SeekableIterator, \ArrayAccess, CSSO if ( $index === null ) { $index = count( $this->objects ); } elseif ( $index < 0 || $index > count( $this->objects ) ) { - throw new \OutOfBoundsException( 'Index is out of range.' ); + throw new OutOfBoundsException( 'Index is out of range.' ); } array_splice( $this->objects, $index, 0, $objects ); @@ -80,7 +85,7 @@ class CSSObjectList implements \Countable, \SeekableIterator, \ArrayAccess, CSSO */ public function remove( $index ) { if ( $index < 0 || $index >= count( $this->objects ) ) { - throw new \OutOfBoundsException( 'Index is out of range.' ); + throw new OutOfBoundsException( 'Index is out of range.' ); } $ret = $this->objects[$index]; array_splice( $this->objects, $index, 1 ); @@ -111,88 +116,101 @@ class CSSObjectList implements \Countable, \SeekableIterator, \ArrayAccess, CSSO $this->offset = 0; } - // \Countable interface + // Countable interface - public function count() { + /** @inheritDoc */ + public function count(): int { return count( $this->objects ); } - // \SeekableIterator interface + // SeekableIterator interface - public function seek( $offset ) { + /** @inheritDoc */ + public function seek( int $offset ): void { if ( $offset < 0 || $offset >= count( $this->objects ) ) { - throw new \OutOfBoundsException( 'Offset is out of range.' ); + throw new OutOfBoundsException( 'Offset is out of range.' ); } $this->offset = $offset; } + /** @inheritDoc */ + #[\ReturnTypeWillChange] public function current() { - return isset( $this->objects[$this->offset] ) ? $this->objects[$this->offset] : null; + return $this->objects[$this->offset] ?? 
null; } - public function key() { + /** @inheritDoc */ + public function key(): int { return $this->offset; } - public function next() { + /** @inheritDoc */ + public function next(): void { $this->offset++; } - public function rewind() { + /** @inheritDoc */ + public function rewind(): void { $this->offset = 0; } - public function valid() { + /** @inheritDoc */ + public function valid(): bool { return isset( $this->objects[$this->offset] ); } - // \ArrayAccess interface + // ArrayAccess interface - public function offsetExists( $offset ) { + /** @inheritDoc */ + public function offsetExists( $offset ): bool { return isset( $this->objects[$offset] ); } - public function offsetGet( $offset ) { + /** @inheritDoc */ + public function offsetGet( $offset ): CSSObject { if ( !is_numeric( $offset ) || (float)(int)$offset !== (float)$offset ) { - throw new \InvalidArgumentException( 'Offset must be an integer.' ); + throw new InvalidArgumentException( 'Offset must be an integer.' ); } if ( $offset < 0 || $offset > count( $this->objects ) ) { - throw new \OutOfBoundsException( 'Offset is out of range.' ); + throw new OutOfBoundsException( 'Offset is out of range.' ); } return $this->objects[$offset]; } - public function offsetSet( $offset, $value ) { + /** @inheritDoc */ + public function offsetSet( $offset, $value ): void { if ( !$value instanceof static::$objectType ) { - throw new \InvalidArgumentException( + throw new InvalidArgumentException( static::class . ' may only contain instances of ' . static::$objectType . '.' ); } static::testObjects( [ $value ] ); if ( !is_numeric( $offset ) || (float)(int)$offset !== (float)$offset ) { - throw new \InvalidArgumentException( 'Offset must be an integer.' ); + throw new InvalidArgumentException( 'Offset must be an integer.' ); } if ( $offset < 0 || $offset > count( $this->objects ) ) { - throw new \OutOfBoundsException( 'Offset is out of range.' ); + throw new OutOfBoundsException( 'Offset is out of range.' 
); } $this->objects[$offset] = $value; } - public function offsetUnset( $offset ) { + /** @inheritDoc */ + public function offsetUnset( $offset ): void { if ( isset( $this->objects[$offset] ) && $offset !== count( $this->objects ) - 1 ) { - throw new \OutOfBoundsException( 'Cannot leave holes in the list.' ); + throw new OutOfBoundsException( 'Cannot leave holes in the list.' ); } unset( $this->objects[$offset] ); } // CSSObject interface + /** @inheritDoc */ public function getPosition() { $ret = null; foreach ( $this->objects as $obj ) { $pos = $obj->getPosition(); if ( $pos[0] >= 0 && ( - !$ret || $pos[0] < $ret[0] || $pos[0] === $ret[0] && $pos[1] < $ret[1] + !$ret || $pos[0] < $ret[0] || ( $pos[0] === $ret[0] && $pos[1] < $ret[1] ) ) ) { $ret = $pos; } @@ -212,27 +230,31 @@ class CSSObjectList implements \Countable, \SeekableIterator, \ArrayAccess, CSSO /** * @param string $function Function to call, toTokenArray() or toComponentValueArray() + * @return Token[]|ComponentValue[] */ private function toTokenOrCVArray( $function ) { $ret = []; $l = count( $this->objects ); - for ( $i = 0; $i < $l; $i++ ) { - // Manually looping and appending turns out to be noticably faster than array_merge. - foreach ( $this->objects[$i]->$function() as $v ) { + foreach ( $this->objects as $i => $iValue ) { + // Manually looping and appending turns out to be noticeably faster than array_merge. + foreach ( $iValue->$function() as $v ) { $ret[] = $v; } - $sep = $this->getSeparator( $this->objects[$i], $i + 1 < $l ? $this->objects[$i + 1] : null ); + $sep = $this->getSeparator( $iValue, $i + 1 < $l ? 
$this->objects[$i + 1] : null ); foreach ( $sep as $v ) { $ret[] = $v; } } + return $ret; } + /** @inheritDoc */ public function toTokenArray() { return $this->toTokenOrCVArray( __FUNCTION__ ); } + /** @inheritDoc */ public function toComponentValueArray() { return $this->toTokenOrCVArray( __FUNCTION__ ); } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Objects/ComponentValue.php b/lib/css-sanitizer/Wikimedia/CSS/Objects/ComponentValue.php index d22f520a1..9563c9c03 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Objects/ComponentValue.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Objects/ComponentValue.php @@ -11,8 +11,11 @@ namespace Wikimedia\CSS\Objects; */ abstract class ComponentValue implements CSSObject { - /** @var int Line and position in the input where this component value starts */ - protected $line = -1, $pos = -1; + /** @var int Line in the input where this component value starts */ + protected $line = -1; + + /** @var int Position in the input where this component value starts */ + protected $pos = -1; /** * Get the position of this ComponentValue in the input stream @@ -22,6 +25,7 @@ abstract class ComponentValue implements CSSObject { return [ $this->line, $this->pos ]; } + /** @inheritDoc */ public function toComponentValueArray() { return [ $this ]; } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Objects/ComponentValueList.php b/lib/css-sanitizer/Wikimedia/CSS/Objects/ComponentValueList.php index b149eafaa..540c399f3 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Objects/ComponentValueList.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Objects/ComponentValueList.php @@ -6,12 +6,18 @@ namespace Wikimedia\CSS\Objects; +use InvalidArgumentException; + /** * Represent a list of CSS declarations */ class ComponentValueList extends CSSObjectList { + /** + * @var string + */ protected static $objectType = ComponentValue::class; + /** @inheritDoc */ protected static function testObjects( array $objects ) { foreach ( $objects as $object ) { $type = $object instanceof 
Token ? $object->type() : 'n/a'; @@ -20,15 +26,16 @@ class ComponentValueList extends CSSObjectList { case Token::T_LEFT_BRACKET: case Token::T_LEFT_PAREN: case Token::T_LEFT_BRACE: - throw new \InvalidArgumentException( + throw new InvalidArgumentException( static::class . " may not contain tokens of type \"$type\"." ); } } } - // Much simpler + /** @inheritDoc */ public function toComponentValueArray() { + // Much simpler return $this->objects; } } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Objects/Declaration.php b/lib/css-sanitizer/Wikimedia/CSS/Objects/Declaration.php index bdee3036f..f7d2c9d2f 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Objects/Declaration.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Objects/Declaration.php @@ -6,6 +6,7 @@ namespace Wikimedia\CSS\Objects; +use InvalidArgumentException; use Wikimedia\CSS\Util; /** @@ -13,8 +14,11 @@ use Wikimedia\CSS\Util; */ class Declaration implements DeclarationOrAtRule { - /** @var int Line and position in the input where this declaration starts */ - protected $line = -1, $pos = -1; + /** @var int Line in the input where this declaration starts */ + protected $line = -1; + + /** @var int Position in the input where this declaration starts */ + protected $pos = -1; /** @var string */ protected $name; @@ -30,18 +34,18 @@ class Declaration implements DeclarationOrAtRule { */ public function __construct( Token $token ) { if ( $token->type() !== Token::T_IDENT ) { - throw new \InvalidArgumentException( + throw new InvalidArgumentException( "Declaration must begin with an ident token, got {$token->type()}" ); } - list( $this->line, $this->pos ) = $token->getPosition(); + [ $this->line, $this->pos ] = $token->getPosition(); $this->name = $token->value(); $this->value = new ComponentValueList(); } public function __clone() { - $this->value = clone( $this->value ); + $this->value = clone $this->value; } /** @@ -86,6 +90,7 @@ class Declaration implements DeclarationOrAtRule { /** * @param string $function Function 
to call, toTokenArray() or toComponentValueArray() + * @return Token[]|ComponentValue[] */ private function toTokenOrCVArray( $function ) { $ret = []; @@ -95,7 +100,7 @@ class Declaration implements DeclarationOrAtRule { [ 'value' => $this->name, 'position' => [ $this->line, $this->pos ] ] ); $ret[] = $v = new Token( Token::T_COLON ); - // Manually looping and appending turns out to be noticably faster than array_merge. + // Manually looping and appending turns out to be noticeably faster than array_merge. foreach ( $this->value->$function() as $v ) { $ret[] = $v; } @@ -109,10 +114,12 @@ class Declaration implements DeclarationOrAtRule { return $ret; } + /** @inheritDoc */ public function toTokenArray() { return $this->toTokenOrCVArray( __FUNCTION__ ); } + /** @inheritDoc */ public function toComponentValueArray() { return $this->toTokenOrCVArray( __FUNCTION__ ); } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Objects/DeclarationList.php b/lib/css-sanitizer/Wikimedia/CSS/Objects/DeclarationList.php index 123809286..15211f934 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Objects/DeclarationList.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Objects/DeclarationList.php @@ -10,16 +10,20 @@ namespace Wikimedia\CSS\Objects; * Represent a list of declarations */ class DeclarationList extends CSSObjectList { + /** + * @var string + */ protected static $objectType = Declaration::class; + /** @inheritDoc */ protected function getSeparator( CSSObject $left, CSSObject $right = null ) { if ( $right ) { return [ new Token( Token::T_SEMICOLON ), new Token( Token::T_WHITESPACE, [ 'significant' => false ] ), ]; - } else { - return [ new Token( Token::T_SEMICOLON, [ 'significant' => false ] ) ]; } + + return [ new Token( Token::T_SEMICOLON, [ 'significant' => false ] ) ]; } } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Objects/DeclarationOrAtRuleList.php b/lib/css-sanitizer/Wikimedia/CSS/Objects/DeclarationOrAtRuleList.php index 51b503ca2..214727cf9 100644 --- 
a/lib/css-sanitizer/Wikimedia/CSS/Objects/DeclarationOrAtRuleList.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Objects/DeclarationOrAtRuleList.php @@ -10,8 +10,12 @@ namespace Wikimedia\CSS\Objects; * Represent a list of CSS declarations and at-rules */ class DeclarationOrAtRuleList extends CSSObjectList { + /** + * @var string + */ protected static $objectType = DeclarationOrAtRule::class; + /** @inheritDoc */ protected function getSeparator( CSSObject $left, CSSObject $right = null ) { $ret = []; if ( $left instanceof Declaration ) { diff --git a/lib/css-sanitizer/Wikimedia/CSS/Objects/QualifiedRule.php b/lib/css-sanitizer/Wikimedia/CSS/Objects/QualifiedRule.php index d6e708aef..7e2a1a555 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Objects/QualifiedRule.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Objects/QualifiedRule.php @@ -6,6 +6,7 @@ namespace Wikimedia\CSS\Objects; +use InvalidArgumentException; use Wikimedia\CSS\Util; /** @@ -19,6 +20,7 @@ class QualifiedRule extends Rule { /** @var SimpleBlock */ protected $block; + /** @inheritDoc */ public function __construct( Token $token = null ) { parent::__construct( $token ?: new Token( Token::T_EOF ) ); $this->prelude = new ComponentValueList(); @@ -26,8 +28,8 @@ class QualifiedRule extends Rule { } public function __clone() { - $this->prelude = clone( $this->prelude ); - $this->block = clone( $this->block ); + $this->prelude = clone $this->prelude; + $this->block = clone $this->block; } /** @@ -48,22 +50,23 @@ class QualifiedRule extends Rule { /** * Set the block - * @param SimpleBlock $block + * @param SimpleBlock|null $block */ public function setBlock( SimpleBlock $block = null ) { if ( $block->getStartTokenType() !== Token::T_LEFT_BRACE ) { - throw new \InvalidArgumentException( 'Qualified rule block must be delimited by {}' ); + throw new InvalidArgumentException( 'Qualified rule block must be delimited by {}' ); } $this->block = $block; } /** * @param string $function Function to call, toTokenArray() or 
toComponentValueArray() + * @return Token[]|ComponentValue[] */ private function toTokenOrCVArray( $function ) { $ret = []; - // Manually looping and appending turns out to be noticably faster than array_merge. + // Manually looping and appending turns out to be noticeably faster than array_merge. foreach ( $this->prelude->$function() as $v ) { $ret[] = $v; } @@ -73,10 +76,12 @@ class QualifiedRule extends Rule { return $ret; } + /** @inheritDoc */ public function toTokenArray() { return $this->toTokenOrCVArray( __FUNCTION__ ); } + /** @inheritDoc */ public function toComponentValueArray() { return $this->toTokenOrCVArray( __FUNCTION__ ); } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Objects/Rule.php b/lib/css-sanitizer/Wikimedia/CSS/Objects/Rule.php index cffe8cc0f..7502b988c 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Objects/Rule.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Objects/Rule.php @@ -6,21 +6,22 @@ namespace Wikimedia\CSS\Objects; -use Wikimedia\CSS\Util; - /** * Represent an abstract CSS rule */ abstract class Rule implements CSSObject { - /** @var int Line and position in the input where this rule starts */ - protected $line = -1, $pos = -1; + /** @var int Line in the input where this rule starts */ + protected $line = -1; + + /** @var int Position in the input where this rule starts */ + protected $pos = -1; /** * @param Token $token Token starting the rule */ public function __construct( Token $token ) { - list( $this->line, $this->pos ) = $token->getPosition(); + [ $this->line, $this->pos ] = $token->getPosition(); } /** diff --git a/lib/css-sanitizer/Wikimedia/CSS/Objects/RuleList.php b/lib/css-sanitizer/Wikimedia/CSS/Objects/RuleList.php index 6e2fa2ce4..9ae078cfb 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Objects/RuleList.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Objects/RuleList.php @@ -10,8 +10,12 @@ namespace Wikimedia\CSS\Objects; * Represent a list of CSS rules */ class RuleList extends CSSObjectList { + /** + * @var string + */ protected 
static $objectType = Rule::class; + /** @inheritDoc */ protected function getSeparator( CSSObject $left, CSSObject $right = null ) { return $right ? [ new Token( Token::T_WHITESPACE, [ 'significant' => false ] ) ] : []; } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Objects/SimpleBlock.php b/lib/css-sanitizer/Wikimedia/CSS/Objects/SimpleBlock.php index c5eda60ac..6c02b5216 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Objects/SimpleBlock.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Objects/SimpleBlock.php @@ -6,6 +6,7 @@ namespace Wikimedia\CSS\Objects; +use InvalidArgumentException; use Wikimedia\CSS\Util; /** @@ -14,7 +15,10 @@ use Wikimedia\CSS\Util; class SimpleBlock extends ComponentValue { /** @var string */ - protected $startTokenType, $endTokenType; + protected $startTokenType; + + /** @var string */ + protected $endTokenType; /** @var ComponentValueList */ protected $value; @@ -25,18 +29,18 @@ class SimpleBlock extends ComponentValue { public function __construct( Token $token ) { $this->endTokenType = static::matchingDelimiter( $token->type() ); if ( $this->endTokenType === null ) { - throw new \InvalidArgumentException( + throw new InvalidArgumentException( 'A SimpleBlock is delimited by either {}, [], or ().' 
); } - list( $this->line, $this->pos ) = $token->getPosition(); + [ $this->line, $this->pos ] = $token->getPosition(); $this->startTokenType = $token->type(); $this->value = new ComponentValueList(); } public function __clone() { - $this->value = clone( $this->value ); + $this->value = clone $this->value; } /** @@ -51,7 +55,7 @@ class SimpleBlock extends ComponentValue { /** * Return the ending delimiter for a starting delimiter - * @param string Token::T_* constant + * @param string $delim Token::T_* constant * @return string|null Matching Token::T_* constant, if any */ public static function matchingDelimiter( $delim ) { @@ -91,12 +95,13 @@ class SimpleBlock extends ComponentValue { return $this->value; } + /** @inheritDoc */ public function toTokenArray() { $ret = [ new Token( $this->startTokenType, [ 'position' => [ $this->line, $this->pos ] ] ), ]; - // Manually looping and appending turns out to be noticably faster than array_merge. + // Manually looping and appending turns out to be noticeably faster than array_merge. 
$tokens = $this->value->toTokenArray(); if ( $tokens && $this->startTokenType === Token::T_LEFT_BRACE ) { if ( $tokens[0]->type() !== Token::T_WHITESPACE ) { diff --git a/lib/css-sanitizer/Wikimedia/CSS/Objects/Stylesheet.php b/lib/css-sanitizer/Wikimedia/CSS/Objects/Stylesheet.php index e7b048fcb..c20b23f86 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Objects/Stylesheet.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Objects/Stylesheet.php @@ -7,7 +7,6 @@ namespace Wikimedia\CSS\Objects; use Wikimedia\CSS\Util; -use Wikimedia\CSS\Sanitizer\Sanitizer; /** * Represent a stylesheet @@ -23,14 +22,14 @@ class Stylesheet implements CSSObject { protected $ruleList; /** - * @param RuleList $rules + * @param RuleList|null $rules */ public function __construct( RuleList $rules = null ) { $this->ruleList = $rules ?: new RuleList(); } public function __clone() { - $this->ruleList = clone( $this->ruleList ); + $this->ruleList = clone $this->ruleList; } /** @@ -40,15 +39,18 @@ class Stylesheet implements CSSObject { return $this->ruleList; } + /** @inheritDoc */ public function getPosition() { // Stylesheets don't really have a position return [ 0, 0 ]; } + /** @inheritDoc */ public function toTokenArray() { return $this->ruleList->toTokenArray(); } + /** @inheritDoc */ public function toComponentValueArray() { return $this->ruleList->toComponentValueArray(); } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Objects/Token.php b/lib/css-sanitizer/Wikimedia/CSS/Objects/Token.php index afbb02832..30672ddd5 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Objects/Token.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Objects/Token.php @@ -6,42 +6,38 @@ namespace Wikimedia\CSS\Objects; +use InvalidArgumentException; +use UnexpectedValueException; + /** * Represent a CSS token */ class Token extends ComponentValue { - const T_IDENT = "ident"; - const T_FUNCTION = "function"; - const T_AT_KEYWORD = "at-keyword"; - const T_HASH = "hash"; - const T_STRING = "string"; - const T_BAD_STRING = "bad-string"; - 
const T_URL = "url"; - const T_BAD_URL = "bad-url"; - const T_DELIM = "delim"; - const T_NUMBER = "number"; - const T_PERCENTAGE = "percentage"; - const T_DIMENSION = "dimension"; - const T_UNICODE_RANGE = "unicode-range"; - const T_INCLUDE_MATCH = "include-match"; - const T_DASH_MATCH = "dash-match"; - const T_PREFIX_MATCH = "prefix-match"; - const T_SUFFIX_MATCH = "suffix-match"; - const T_SUBSTRING_MATCH = "substring-match"; - const T_COLUMN = "column"; - const T_WHITESPACE = "whitespace"; - const T_CDO = "CDO"; - const T_CDC = "CDC"; - const T_COLON = "colon"; - const T_SEMICOLON = "semicolon"; - const T_COMMA = "comma"; - const T_LEFT_BRACKET = "["; - const T_RIGHT_BRACKET = "]"; - const T_LEFT_PAREN = "("; - const T_RIGHT_PAREN = ")"; - const T_LEFT_BRACE = "{"; - const T_RIGHT_BRACE = "}"; - const T_EOF = "EOF"; + public const T_IDENT = "ident"; + public const T_FUNCTION = "function"; + public const T_AT_KEYWORD = "at-keyword"; + public const T_HASH = "hash"; + public const T_STRING = "string"; + public const T_BAD_STRING = "bad-string"; + public const T_URL = "url"; + public const T_BAD_URL = "bad-url"; + public const T_DELIM = "delim"; + public const T_NUMBER = "number"; + public const T_PERCENTAGE = "percentage"; + public const T_DIMENSION = "dimension"; + public const T_WHITESPACE = "whitespace"; + public const T_CDO = "CDO"; + public const T_CDC = "CDC"; + public const T_COLON = "colon"; + public const T_SEMICOLON = "semicolon"; + public const T_COMMA = "comma"; + public const T_LEFT_BRACKET = "["; + public const T_RIGHT_BRACKET = "]"; + public const T_LEFT_PAREN = "("; + public const T_RIGHT_PAREN = ")"; + public const T_LEFT_BRACE = "{"; + public const T_RIGHT_BRACE = "}"; + public const T_EOF = "EOF"; /** @var string One of the T_* constants */ protected $type; @@ -58,12 +54,12 @@ class Token extends ComponentValue { /** @var string Unit for dimension tokens */ protected $unit = ''; - /** @var int Start and end for unicode-range tokens */ - protected 
$start = 0, $end = 0; - /** @var bool Whether this token is considered "significant" */ protected $significant = true; + /** @var int See ::urangeHack() */ + private $urangeHack = 0; + /** * @param string $type One of the T_* constants * @param string|array $value Value of the token, or an array with the @@ -78,8 +74,6 @@ class Token extends ComponentValue { * - representation: (string) String representation of the value for * T_NUMBER, T_PERCENTAGE, and T_DIMENSION. * - unit: (string) Unit for T_DIMENSION. - * - start: (int) Start code point for T_UNICODE_RANGE. - * - end: (int) End code point for T_UNICODE_RANGE. * - significant: (bool) Whether the token is considered "significant" */ public function __construct( $type, $value = [] ) { @@ -89,11 +83,11 @@ class Token extends ComponentValue { if ( isset( $value['position'] ) ) { if ( !is_array( $value['position'] ) || count( $value['position'] ) !== 2 ) { - throw new \InvalidArgumentException( 'Position must be an array of two integers' ); + throw new InvalidArgumentException( 'Position must be an array of two integers' ); } - list( $this->line, $this->pos ) = $value['position']; + [ $this->line, $this->pos ] = $value['position']; if ( !is_int( $this->line ) || !is_int( $this->pos ) ) { - throw new \InvalidArgumentException( 'Position must be an array of two integers' ); + throw new InvalidArgumentException( 'Position must be an array of two integers' ); } } if ( isset( $value['significant'] ) ) { @@ -108,20 +102,20 @@ class Token extends ComponentValue { case self::T_STRING: case self::T_URL: if ( !isset( $value['value'] ) ) { - throw new \InvalidArgumentException( "Token type $this->type requires a value" ); + throw new InvalidArgumentException( "Token type $this->type requires a value" ); } $this->value = (string)$value['value']; break; case self::T_HASH: if ( !isset( $value['value'] ) ) { - throw new \InvalidArgumentException( "Token type $this->type requires a value" ); + throw new InvalidArgumentException( 
"Token type $this->type requires a value" ); } if ( !isset( $value['typeFlag'] ) ) { - throw new \InvalidArgumentException( "Token type $this->type requires a typeFlag" ); + throw new InvalidArgumentException( "Token type $this->type requires a typeFlag" ); } if ( !in_array( $value['typeFlag'], [ 'id', 'unrestricted' ], true ) ) { - throw new \InvalidArgumentException( "Invalid type flag for Token type $this->type" ); + throw new InvalidArgumentException( "Invalid type flag for Token type $this->type" ); } $this->value = (string)$value['value']; $this->typeFlag = $value['typeFlag']; @@ -129,11 +123,11 @@ class Token extends ComponentValue { case self::T_DELIM: if ( !isset( $value['value'] ) ) { - throw new \InvalidArgumentException( "Token type $this->type requires a value" ); + throw new InvalidArgumentException( "Token type $this->type requires a value" ); } $this->value = (string)$value['value']; if ( mb_strlen( $this->value, 'UTF-8' ) !== 1 ) { - throw new \InvalidArgumentException( + throw new InvalidArgumentException( "Value for Token type $this->type must be a single character" ); } @@ -145,32 +139,32 @@ class Token extends ComponentValue { if ( !isset( $value['value'] ) || !is_numeric( $value['value'] ) || !is_finite( $value['value'] ) ) { - throw new \InvalidArgumentException( "Token type $this->type requires a numeric value" ); + throw new InvalidArgumentException( "Token type $this->type requires a numeric value" ); } if ( !isset( $value['typeFlag'] ) ) { - throw new \InvalidArgumentException( "Token type $this->type requires a typeFlag" ); + throw new InvalidArgumentException( "Token type $this->type requires a typeFlag" ); } $this->typeFlag = $value['typeFlag']; if ( $this->typeFlag === 'integer' ) { $this->value = (int)$value['value']; if ( (float)$this->value !== (float)$value['value'] ) { - throw new \InvalidArgumentException( + throw new InvalidArgumentException( "typeFlag is 'integer', but value supplied is not an integer" ); } } elseif ( 
$this->typeFlag === 'number' ) { $this->value = (float)$value['value']; } else { - throw new \InvalidArgumentException( "Invalid type flag for Token type $this->type" ); + throw new InvalidArgumentException( "Invalid type flag for Token type $this->type" ); } if ( isset( $value['representation'] ) ) { if ( !is_numeric( $value['representation'] ) ) { - throw new \InvalidArgumentException( 'Representation must be numeric' ); + throw new InvalidArgumentException( 'Representation must be numeric' ); } $this->representation = $value['representation']; if ( (float)$this->representation !== (float)$this->value ) { - throw new \InvalidArgumentException( + throw new InvalidArgumentException( "Representation \"$this->representation\" does not match value \"$this->value\"" ); } @@ -178,36 +172,14 @@ class Token extends ComponentValue { if ( $type === self::T_DIMENSION ) { if ( !isset( $value['unit'] ) ) { - throw new \InvalidArgumentException( "Token type $this->type requires a unit" ); + throw new InvalidArgumentException( "Token type $this->type requires a unit" ); } $this->unit = $value['unit']; } break; - case self::T_UNICODE_RANGE: - if ( !isset( $value['start'] ) || !is_int( $value['start'] ) ) { - throw new \InvalidArgumentException( - "Token type $this->type requires a starting code point as an integer" - ); - } - $this->start = $value['start']; - if ( !isset( $value['end'] ) ) { - $this->end = $this->start; - } elseif ( !is_int( $value['end'] ) ) { - throw new \InvalidArgumentException( 'Ending code point must be an integer' ); - } else { - $this->end = $value['end']; - } - break; - case self::T_BAD_STRING: case self::T_BAD_URL: - case self::T_INCLUDE_MATCH: - case self::T_DASH_MATCH: - case self::T_PREFIX_MATCH: - case self::T_SUFFIX_MATCH: - case self::T_SUBSTRING_MATCH: - case self::T_COLUMN: case self::T_WHITESPACE: case self::T_CDO: case self::T_CDC: @@ -228,13 +200,13 @@ class Token extends ComponentValue { if ( isset( $value['typeFlag'] ) && $value['typeFlag'] 
!== '' ) { $this->typeFlag = $value['typeFlag']; if ( $this->typeFlag !== 'recursion-depth-exceeded' ) { - throw new \InvalidArgumentException( "Invalid type flag for Token type $this->type" ); + throw new InvalidArgumentException( "Invalid type flag for Token type $this->type" ); } } break; default: - throw new \InvalidArgumentException( "Unknown token type \"$this->type\"." ); + throw new InvalidArgumentException( "Unknown token type \"$this->type\"." ); } } @@ -278,14 +250,6 @@ class Token extends ComponentValue { return $this->unit; } - /** - * Get the unicode range for this T_UNICODE_RANGE token - * @return array [ int $start, int $end ] - */ - public function range() { - return [ $this->start, $this->end ]; - } - /** * Whether this token is considered "significant" * @@ -309,22 +273,24 @@ class Token extends ComponentValue { if ( $significant === $this->significant ) { return $this; } - $ret = clone( $this ); + $ret = clone $this; $ret->significant = $significant; return $ret; } + /** @inheritDoc */ public function toTokenArray() { return [ $this ]; } + /** @inheritDoc */ public function toComponentValueArray() { switch ( $this->type ) { case self::T_FUNCTION: case self::T_LEFT_BRACKET: case self::T_LEFT_PAREN: case self::T_LEFT_BRACE: - throw new \UnexpectedValueException( + throw new UnexpectedValueException( "Token type \"$this->type\" is not valid in a ComponentValueList." 
); @@ -341,20 +307,54 @@ class Token extends ComponentValue { private static function escapeIdent( $s ) { return preg_replace_callback( '/ - [^a-zA-Z0-9_\-\x{80}-\x{10ffff}] # Characters that are never allowed - | (?:^|(?<=^-))[0-9] # Digits are not allowed at the start of an identifier - | (?<=^-)- # Two dashes are not allowed at the start of an identifier + [^a-zA-Z0-9_\-\x{80}-\x{10ffff}] # Characters that are never allowed + | (?:^|(?<=^-))[0-9] # Digits are not allowed at the start of an identifier + | [\p{Z}\p{Cc}\p{Cf}\p{Co}\p{Cs}] # To be safe, control characters and whitespace /ux', - function ( $m ) { - if ( $m[0] === "\n" || ctype_xdigit( $m[0] ) ) { - return sprintf( '\\%x ', ord( $m[0] ) ); - } - return '\\' . $m[0]; - }, + [ __CLASS__, 'escapePregCallback' ], $s ); } + /** + * Escape characters in a string + * + * - Double quote needs escaping as the string delimiter. + * - Backslash needs escaping since it's the escape character. + * - Newline (\n) isn't valid in a string, and so needs escaping. + * - Carriage return (\r), form feed (\f), and U+0000 would be changed by + * CSS's input conversion rules, and so need escaping. + * - Other non-space whitespace and controls don't need escaping, but it's + * safer to do so. + * - Angle brackets are escaped numerically to make it safer to embed in HTML. + * + * @param string $s + * @return string + */ + private static function escapeString( $s ) { + return preg_replace_callback( + '/[^ \P{Z}]|[\p{Cc}\p{Cf}\p{Co}\p{Cs}"\x5c<>]/u', + [ __CLASS__, 'escapePregCallback' ], + $s + ); + } + + /** + * Callback for escaping functions + * @param array $m Matches + * @return string + */ + private static function escapePregCallback( $m ) { + // Newlines, carriage returns, form feeds, and hex digits have to be + // escaped numerically. Other non-space whitespace and controls don't + // have to be, but it's saner to do so. Angle brackets are escaped + // numerically too to make it safer to embed in HTML. 
+ if ( preg_match( '/[^ \P{Z}]|[\p{Cc}\p{Cf}\p{Co}\p{Cs}0-9a-fA-F<>]/u', $m[0] ) ) { + return sprintf( '\\%x ', mb_ord( $m[0] ) ); + } + return '\\' . $m[0]; + } + public function __toString() { switch ( $this->type ) { case self::T_IDENT: @@ -369,29 +369,26 @@ class Token extends ComponentValue { case self::T_HASH: if ( $this->typeFlag === 'id' ) { return '#' . self::escapeIdent( $this->value ); - } else { - return '#' . preg_replace_callback( '/[^a-zA-Z0-9_\-\x{80}-\x{10ffff}]/u', function ( $m ) { - return $m[0] === "\n" ? '\\a ' : '\\' . $m[0]; - }, $this->value ); } + return '#' . preg_replace_callback( + '/ + [^a-zA-Z0-9_\-\x{80}-\x{10ffff}] # Characters that are never allowed + | [\p{Z}\p{Cc}\p{Cf}\p{Co}\p{Cs}] # To be safe, control characters and whitespace + /ux', + [ __CLASS__, 'escapePregCallback' ], + $this->value + ); + case self::T_STRING: // We could try to decide whether single or double quote is // better, but it doesn't seem worth the effort. - return '"' . strtr( $this->value, [ - '"' => '\\"', - '\\' => '\\\\', - "\n" => '\\a ', - ] ) . '"'; + return '"' . self::escapeString( $this->value ) . '"'; case self::T_URL: // We could try to decide whether single or double quote is // better, but it doesn't seem worth the effort. - return 'url("' . strtr( $this->value, [ - '"' => '\\"', - '\\' => '\\\\', - "\n" => '\\a ', - ] ) . '")'; + return 'url("' . self::escapeString( $this->value ) . '")'; case self::T_BAD_STRING: // It's supposed to round trip, so... @@ -435,41 +432,6 @@ class Token extends ComponentValue { return $number . $unit; - case self::T_UNICODE_RANGE: - if ( $this->start === 0 && $this->end === 0xffffff ) { - return 'U+??????'; - } - $fmt = 'U+%x'; - for ( $b = 0; $b < 24; $b += 4, $fmt .= '?' 
) { - $mask = ( 1 << $b ) - 1; - if ( - ( $this->start & $mask ) === 0 && - ( $this->end & $mask ) === $mask && - ( $this->start & ~$mask ) === ( $this->end & ~$mask ) - ) { - return sprintf( $fmt, $this->start >> $b ); - } - } - return sprintf( 'U+%x-%x', $this->start, $this->end ); - - case self::T_INCLUDE_MATCH: - return '~='; - - case self::T_DASH_MATCH: - return '|='; - - case self::T_PREFIX_MATCH: - return '^='; - - case self::T_SUFFIX_MATCH: - return '$='; - - case self::T_SUBSTRING_MATCH: - return '*='; - - case self::T_COLUMN: - return '||'; - case self::T_WHITESPACE: return ' '; @@ -500,13 +462,13 @@ class Token extends ComponentValue { return ''; default: - throw new \UnexpectedValueException( "Unknown token type \"$this->type\"." ); + throw new UnexpectedValueException( "Unknown token type \"$this->type\"." ); } } /** * Indicate whether the two tokens need to be separated - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#serialization + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#serialization * @param Token $firstToken * @param Token $secondToken * @return bool @@ -516,53 +478,69 @@ class Token extends ComponentValue { static $sepTable = [ self::T_IDENT => [ self::T_IDENT, self::T_FUNCTION, self::T_URL, self::T_BAD_URL, '-', self::T_NUMBER, - self::T_PERCENTAGE, self::T_DIMENSION, self::T_UNICODE_RANGE, self::T_CDC, self::T_LEFT_PAREN + self::T_PERCENTAGE, self::T_DIMENSION, self::T_CDC, self::T_LEFT_PAREN, + // Internet Explorer is buggy in some contexts (T191134) + self::T_HASH, ], self::T_AT_KEYWORD => [ self::T_IDENT, self::T_FUNCTION, self::T_URL, self::T_BAD_URL, '-', self::T_NUMBER, - self::T_PERCENTAGE, self::T_DIMENSION, self::T_UNICODE_RANGE, self::T_CDC + self::T_PERCENTAGE, self::T_DIMENSION, self::T_CDC, ], self::T_HASH => [ self::T_IDENT, self::T_FUNCTION, self::T_URL, self::T_BAD_URL, '-', self::T_NUMBER, - self::T_PERCENTAGE, self::T_DIMENSION, self::T_UNICODE_RANGE, self::T_CDC + self::T_PERCENTAGE, 
self::T_DIMENSION, self::T_CDC, + // Internet Explorer is buggy in some contexts (T191134) + self::T_HASH, ], self::T_DIMENSION => [ self::T_IDENT, self::T_FUNCTION, self::T_URL, self::T_BAD_URL, '-', self::T_NUMBER, - self::T_PERCENTAGE, self::T_DIMENSION, self::T_UNICODE_RANGE, self::T_CDC + self::T_PERCENTAGE, self::T_DIMENSION, self::T_CDC, + // Internet Explorer is buggy in some contexts (T191134) + self::T_HASH, ], '#' => [ self::T_IDENT, self::T_FUNCTION, self::T_URL, self::T_BAD_URL, '-', self::T_NUMBER, - self::T_PERCENTAGE, self::T_DIMENSION, self::T_UNICODE_RANGE + self::T_PERCENTAGE, self::T_DIMENSION, ], '-' => [ - // Add '-' here from Editor's Draft, to go with the draft's - // adding of tokens beginning with "--" that we also picked up. self::T_IDENT, self::T_FUNCTION, self::T_URL, self::T_BAD_URL, '-', self::T_NUMBER, - self::T_PERCENTAGE, self::T_DIMENSION, self::T_UNICODE_RANGE + self::T_PERCENTAGE, self::T_DIMENSION, ], self::T_NUMBER => [ self::T_IDENT, self::T_FUNCTION, self::T_URL, self::T_BAD_URL, self::T_NUMBER, - self::T_PERCENTAGE, self::T_DIMENSION, self::T_UNICODE_RANGE + self::T_PERCENTAGE, self::T_DIMENSION, '%', + // Internet Explorer is buggy in some contexts + self::T_HASH, ], '@' => [ - self::T_IDENT, self::T_FUNCTION, self::T_URL, self::T_BAD_URL, '-', self::T_UNICODE_RANGE - ], - self::T_UNICODE_RANGE => [ - self::T_IDENT, self::T_FUNCTION, self::T_NUMBER, self::T_PERCENTAGE, self::T_DIMENSION, '?' + self::T_IDENT, self::T_FUNCTION, self::T_URL, self::T_BAD_URL, '-', ], '.' => [ self::T_NUMBER, self::T_PERCENTAGE, self::T_DIMENSION ], '+' => [ self::T_NUMBER, self::T_PERCENTAGE, self::T_DIMENSION ], - '$' => [ '=' ], - '*' => [ '=' ], - '^' => [ '=' ], - '~' => [ '=' ], - '|' => [ '=', '|' ], '/' => [ '*' ], ]; - $t1 = $firstToken->type === Token::T_DELIM ? $firstToken->value : $firstToken->type; - $t2 = $secondToken->type === Token::T_DELIM ? $secondToken->value : $secondToken->type; + $t1 = $firstToken->type === self::T_DELIM ? 
$firstToken->value : $firstToken->type; + $t2 = $secondToken->type === self::T_DELIM ? $secondToken->value : $secondToken->type; return isset( $sepTable[$t1] ) && in_array( $t2, $sepTable[$t1], true ); } + + /** + * Allow for marking the 'U' T_IDENT beginning a , to later avoid + * serializing it with extraneous comments. + * @internal + * @see \Wikimedia\CSS\Util::stringify() + * @see \Wikimedia\CSS\Grammar\UrangeMatcher + * @param int|null $hack Set the hack value + * @return int Current/old hack value + */ + public function urangeHack( $hack = null ) { + $ret = $this->urangeHack; + if ( $hack !== null ) { + $this->urangeHack = max( (int)$this->urangeHack, $hack ); + } + return $ret; + } + } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Objects/TokenList.php b/lib/css-sanitizer/Wikimedia/CSS/Objects/TokenList.php index 1e2873711..42c6db6b0 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Objects/TokenList.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Objects/TokenList.php @@ -6,25 +6,35 @@ namespace Wikimedia\CSS\Objects; +use UnexpectedValueException; use Wikimedia\CSS\Parser\Parser; /** * Represent a list of CSS tokens */ class TokenList extends CSSObjectList { + /** + * @var string + */ protected static $objectType = Token::class; - // We can greatly simplify this, assuming no separator + /** @var Token[] The objects contained */ + protected $objects; + + /** @inheritDoc */ public function toTokenArray() { + // We can greatly simplify this, assuming no separator return $this->objects; } - // This one, though, is complicated + /** @inheritDoc */ public function toComponentValueArray() { + // This one, though, is complicated $parser = Parser::newFromTokens( $this->objects ); $ret = $parser->parseComponentValueList(); if ( $parser->getParseErrors() ) { - $ex = new \UnexpectedValueException( 'TokenList cannot be converted to a ComponentValueList' ); + $ex = new UnexpectedValueException( 'TokenList cannot be converted to a ComponentValueList' ); + // 
@phan-suppress-next-line PhanUndeclaredProperty $ex->parseErrors = $parser->getParseErrors(); throw $ex; } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Parser/DataSource.php b/lib/css-sanitizer/Wikimedia/CSS/Parser/DataSource.php index c4aadb1d6..b9dcdb0d2 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Parser/DataSource.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Parser/DataSource.php @@ -11,7 +11,7 @@ namespace Wikimedia\CSS\Parser; */ interface DataSource { - const EOF = ''; + public const EOF = ''; /** * Read a character from the data source. diff --git a/lib/css-sanitizer/Wikimedia/CSS/Parser/DataSourceTokenizer.php b/lib/css-sanitizer/Wikimedia/CSS/Parser/DataSourceTokenizer.php index 3fc0998a8..de6099350 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Parser/DataSourceTokenizer.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Parser/DataSourceTokenizer.php @@ -6,21 +6,28 @@ namespace Wikimedia\CSS\Parser; +use InvalidArgumentException; +use UnexpectedValueException; +use UtfNormal\Constants; +use UtfNormal\Utils; use Wikimedia\CSS\Objects\Token; /** * Parse CSS into tokens * * This implements the tokenizer from the CSS Syntax Module Level 3 candidate recommendation. 
- * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/ + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/ */ class DataSourceTokenizer implements Tokenizer { /** @var DataSource */ protected $source; - /** @var int position in the input */ - protected $line = 1, $pos = 0; + /** @var int line in the input */ + protected $line = 1; + + /** @var int position in the line in the input */ + protected $pos = 0; /** @var string|null|object The most recently consumed character */ protected $currentCharacter = null; @@ -42,7 +49,7 @@ class DataSourceTokenizer implements Tokenizer { /** * Read a character from the data source - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#input-preprocessing + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#input-preprocessing * @return string One UTF-8 character, or empty string on EOF */ protected function nextChar() { @@ -50,17 +57,19 @@ class DataSourceTokenizer implements Tokenizer { // Perform transformations per the spec - // Any U+0000 becomes U+FFFD - if ( $char === "\0" ) { - return \UtfNormal\Constants::UTF8_REPLACEMENT; + // Any U+0000 or surrogate code point becomes U+FFFD + if ( $char === "\0" || ( $char >= "\u{D800}" && $char <= "\u{DFFF}" ) ) { + return Constants::UTF8_REPLACEMENT; } // Any U+000D, U+000C, or pair of U+000D + U+000A becomes U+000A - if ( $char === "\f" ) { // U+000C + if ( $char === "\f" ) { + // U+000C return "\n"; } - if ( $char === "\r" ) { // Either U+000D + U+000A or a lone U+000D + if ( $char === "\r" ) { + // Either U+000D + U+000A or a lone U+000D $char2 = $this->source->readCharacter(); if ( $char2 !== "\n" ) { $this->source->putBackCharacter( $char2 ); @@ -90,13 +99,13 @@ class DataSourceTokenizer implements Tokenizer { /** * Reconsume the next character * - * In more normal terms, this pushes a character back onto the data source + * In more normal terms, this pushes a character back onto the data source, * so it will be read again for the next call to 
self::consumeCharacter(). */ protected function reconsumeCharacter() { // @codeCoverageIgnoreStart if ( !is_string( $this->currentCharacter ) ) { - throw new \UnexpectedValueException( "[$this->line:$this->pos] Can't reconsume" ); + throw new UnexpectedValueException( "[$this->line:$this->pos] Can't reconsume" ); } // @codeCoverageIgnoreEnd @@ -128,10 +137,12 @@ class DataSourceTokenizer implements Tokenizer { return $ret; } + /** @inheritDoc */ public function getParseErrors() { return $this->parseErrors; } + /** @inheritDoc */ public function clearParseErrors() { $this->parseErrors = []; } @@ -162,10 +173,13 @@ class DataSourceTokenizer implements Tokenizer { /** * Read a token from the data source - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-token + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-token * @return Token + * @suppress PhanPluginDuplicateAdjacentStatement,PhanPluginDuplicateSwitchCaseLooseEquality */ public function consumeToken() { + // We "consume comments" inline below, see `case '/'`. 
+ $this->consumeCharacter(); $pos = [ 'position' => [ $this->line, $this->pos ] ]; @@ -185,7 +199,7 @@ class DataSourceTokenizer implements Tokenizer { return $this->consumeStringToken( $this->currentCharacter, $pos ); case '#': - list( $next, $next2, $next3 ) = $this->lookAhead(); + [ $next, $next2, $next3 ] = $this->lookAhead(); if ( self::isNameCharacter( $this->nextCharacter ) || self::isValidEscape( $next, $next2 ) ) { @@ -197,31 +211,15 @@ class DataSourceTokenizer implements Tokenizer { return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); - case '$': - if ( $this->nextCharacter === '=' ) { - $this->consumeCharacter(); - return new Token( Token::T_SUFFIX_MATCH, $pos ); - } - - return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); - case '(': return new Token( Token::T_LEFT_PAREN, $pos ); case ')': return new Token( Token::T_RIGHT_PAREN, $pos ); - case '*': - if ( $this->nextCharacter === '=' ) { - $this->consumeCharacter(); - return new Token( Token::T_SUBSTRING_MATCH, $pos ); - } - - return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); - case '+': case '.': - list( $next, $next2, $next3 ) = $this->lookAhead(); + [ $next, $next2, ] = $this->lookAhead(); if ( self::wouldStartNumber( $this->currentCharacter, $next, $next2 ) ) { $this->reconsumeCharacter(); return $this->consumeNumericToken( $pos ); @@ -233,7 +231,7 @@ class DataSourceTokenizer implements Tokenizer { return new Token( Token::T_COMMA, $pos ); case '-': - list( $next, $next2, $next3 ) = $this->lookAhead(); + [ $next, $next2, ] = $this->lookAhead(); if ( self::wouldStartNumber( $this->currentCharacter, $next, $next2 ) ) { $this->reconsumeCharacter(); return $this->consumeNumericToken( $pos ); @@ -257,15 +255,16 @@ class DataSourceTokenizer implements Tokenizer { $this->consumeCharacter(); $this->consumeCharacter(); while ( $this->currentCharacter !== DataSource::EOF && + // @phan-suppress-next-line 
PhanSuspiciousValueComparisonInLoop !( $this->currentCharacter === '*' && $this->nextCharacter === '/' ) ) { $this->consumeCharacter(); } if ( $this->currentCharacter === DataSource::EOF ) { - // Parse error from the editor's draft as of 2017-01-06 $this->parseError( 'unclosed-comment', $pos ); } $this->consumeCharacter(); + // @phan-suppress-next-line PhanPossiblyInfiniteRecursionSameParams return $this->consumeToken(); } @@ -278,7 +277,7 @@ class DataSourceTokenizer implements Tokenizer { return new Token( Token::T_SEMICOLON, $pos ); case '<': - list( $next, $next2, $next3 ) = $this->lookAhead(); + [ $next, $next2, $next3 ] = $this->lookAhead(); if ( $next === '!' && $next2 === '-' && $next3 === '-' ) { $this->consumeCharacter(); $this->consumeCharacter(); @@ -289,7 +288,7 @@ class DataSourceTokenizer implements Tokenizer { return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); case '@': - list( $next, $next2, $next3 ) = $this->lookAhead(); + [ $next, $next2, $next3 ] = $this->lookAhead(); if ( self::wouldStartIdentifier( $next, $next2, $next3 ) ) { return new Token( Token::T_AT_KEYWORD, $pos + [ 'value' => $this->consumeName() ] ); } @@ -311,14 +310,6 @@ class DataSourceTokenizer implements Tokenizer { case ']': return new Token( Token::T_RIGHT_BRACKET, $pos ); - case '^': - if ( $this->nextCharacter === '=' ) { - $this->consumeCharacter(); - return new Token( Token::T_PREFIX_MATCH, $pos ); - } - - return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); - case '{': return new Token( Token::T_LEFT_BRACE, $pos ); @@ -338,40 +329,6 @@ class DataSourceTokenizer implements Tokenizer { $this->reconsumeCharacter(); return $this->consumeNumericToken( $pos ); - case 'u': - case 'U': - if ( $this->nextCharacter === '+' ) { - list( $next, $next2 ) = $this->lookAhead(); - if ( self::isHexDigit( $next2 ) || $next2 === '?' 
) { - $this->consumeCharacter(); - return $this->consumeUnicodeRangeToken( $pos ); - } - } - - $this->reconsumeCharacter(); - return $this->consumeIdentLikeToken( $pos ); - - case '|': - if ( $this->nextCharacter === '=' ) { - $this->consumeCharacter(); - return new Token( Token::T_DASH_MATCH, $pos ); - } - - if ( $this->nextCharacter === '|' ) { - $this->consumeCharacter(); - return new Token( Token::T_COLUMN, $pos ); - } - - return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); - - case '~': - if ( $this->nextCharacter === '=' ) { - $this->consumeCharacter(); - return new Token( Token::T_INCLUDE_MATCH, $pos ); - } - - return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] ); - case DataSource::EOF: return new Token( Token::T_EOF, $pos ); @@ -387,14 +344,14 @@ class DataSourceTokenizer implements Tokenizer { /** * Consume a numeric token - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-numeric-token + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-numeric-token * @param array $data Data for the new token (typically contains just 'position') * @return Token */ protected function consumeNumericToken( array $data ) { - list( $data['representation'], $data['value'], $data['typeFlag'] ) = $this->consumeNumber(); + [ $data['representation'], $data['value'], $data['typeFlag'] ] = $this->consumeNumber(); - list( $next, $next2, $next3 ) = $this->lookAhead(); + [ $next, $next2, $next3 ] = $this->lookAhead(); if ( self::wouldStartIdentifier( $next, $next2, $next3 ) ) { return new Token( Token::T_DIMENSION, $data + [ 'unit' => $this->consumeName() ] ); } elseif ( $this->nextCharacter === '%' ) { @@ -407,10 +364,7 @@ class DataSourceTokenizer implements Tokenizer { /** * Consume an ident-like token - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-an-ident-like-token - * @note Per the draft as of January 2017, quoted URLs are parsed as - * functions named 
'url'. This is needed in order to implement the `<url>` - * type in the [Values specification](https://www.w3.org/TR/2016/CR-css-values-3-20160929/#urls). + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-ident-like-token * @param array $data Data for the new token (typically contains just 'position') * @return Token */ @@ -422,14 +376,14 @@ class DataSourceTokenizer implements Tokenizer { if ( !strcasecmp( $name, 'url' ) ) { while ( true ) { - list( $next, $next2 ) = $this->lookAhead(); + [ $next, $next2 ] = $this->lookAhead(); if ( !self::isWhitespace( $next ) || !self::isWhitespace( $next2 ) ) { break; } $this->consumeCharacter(); } if ( $next !== '"' && $next !== '\'' && - !( self::isWhitespace( $next ) && ( $next2 === '"' || $next2=== '\'' ) ) + !( self::isWhitespace( $next ) && ( $next2 === '"' || $next2 === '\'' ) ) ) { return $this->consumeUrlToken( $data ); } @@ -446,7 +400,7 @@ class DataSourceTokenizer implements Tokenizer { * * This assumes the leading quote or apostrophe has already been consumed. 
* - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-string-token + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-string-token * @param string $endChar Ending character of the string * @param array $data Data for the new token (typically contains just 'position') * @return Token @@ -458,7 +412,6 @@ class DataSourceTokenizer implements Tokenizer { $this->consumeCharacter(); switch ( $this->currentCharacter ) { case DataSource::EOF: - // Parse error from the editor's draft as of 2017-01-06 $this->parseError( 'unclosed-string', $data ); break 2; @@ -473,8 +426,6 @@ class DataSourceTokenizer implements Tokenizer { case '\\': if ( $this->nextCharacter === DataSource::EOF ) { // Do nothing - // Parse error from the editor's draft as of 2017-01-06 - $this->parseError( 'bad-escape' ); } elseif ( $this->nextCharacter === "\n" ) { // Consume it $this->consumeCharacter(); @@ -482,7 +433,7 @@ class DataSourceTokenizer implements Tokenizer { $data['value'] .= $this->consumeEscape(); } else { // @codeCoverageIgnoreStart - throw new \UnexpectedValueException( "[$this->line:$this->pos] Unexpected state" ); + throw new UnexpectedValueException( "[$this->line:$this->pos] Unexpected state" ); // @codeCoverageIgnoreEnd } break; @@ -493,6 +444,7 @@ class DataSourceTokenizer implements Tokenizer { } } + // @phan-suppress-next-line PhanPluginUnreachableCode Reached by break 2 return new Token( Token::T_STRING, $data ); } @@ -501,8 +453,7 @@ class DataSourceTokenizer implements Tokenizer { * * This assumes the leading "url(" has already been consumed. * - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-url-token - * @note Per the draft as of January 2017, this does not handle quoted URL tokens. 
+ * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-url-token * @param array $data Data for the new token (typically contains just 'position') * @return Token */ @@ -516,29 +467,23 @@ class DataSourceTokenizer implements Tokenizer { } // 3. - if ( $this->nextCharacter === DataSource::EOF ) { - // Parse error from the editor's draft as of 2017-01-06 - $this->parseError( 'unclosed-url', $data ); - return new Token( Token::T_URL, $data ); - } - - // 4. (removed in draft, this was formerly the parsing for a quoted URL token) - - // 5. (renumbered as 4 in the draft) while ( true ) { $this->consumeCharacter(); switch ( $this->currentCharacter ) { case DataSource::EOF: - // Parse error from the editor's draft as of 2017-01-06 $this->parseError( 'unclosed-url', $data ); break 2; + // @codeCoverageIgnoreStart case ')': + // @codeCoverageIgnoreEnd break 2; + // @codeCoverageIgnoreStart case "\n": case "\t": case ' ': + // @codeCoverageIgnoreEnd while ( self::isWhitespace( $this->nextCharacter ) ) { $this->consumeCharacter(); } @@ -546,7 +491,6 @@ class DataSourceTokenizer implements Tokenizer { $this->consumeCharacter(); break 2; } elseif ( $this->nextCharacter === DataSource::EOF ) { - // Parse error from the editor's draft as of 2017-01-06 $this->consumeCharacter(); $this->parseError( 'unclosed-url', $data ); break 2; @@ -554,16 +498,19 @@ class DataSourceTokenizer implements Tokenizer { $this->consumeBadUrlRemnants(); return new Token( Token::T_BAD_URL, [ 'value' => '' ] + $data ); } - break; + // @codeCoverageIgnoreStart case '"': case '\'': case '(': + // @codeCoverageIgnoreEnd $this->parseError( 'bad-character-in-url' ); $this->consumeBadUrlRemnants(); return new Token( Token::T_BAD_URL, [ 'value' => '' ] + $data ); + // @codeCoverageIgnoreStart case '\\': + // @codeCoverageIgnoreEnd if ( self::isValidEscape( $this->currentCharacter, $this->nextCharacter ) ) { $data['value'] .= $this->consumeEscape(); } else { @@ -585,12 +532,13 @@ class 
DataSourceTokenizer implements Tokenizer { } } + // @phan-suppress-next-line PhanPluginUnreachableCode Reached by break 2 return new Token( Token::T_URL, $data ); } /** * Clean up after finding an error in a URL - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-the-remnants-of-a-bad-url + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-remnants-of-bad-url */ protected function consumeBadUrlRemnants() { while ( true ) { @@ -604,61 +552,9 @@ class DataSourceTokenizer implements Tokenizer { } } - /** - * Consume a unicode-range token - * - * This assumes the initial "u" has been consumed (currentCharacter is the '+'), - * and the next codepoint is verfied to be a hex digit or "?". - * - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-unicode-range-token - * @param array $data Data for the new token (typically contains just 'position') - * @return Token - */ - protected function consumeUnicodeRangeToken( array $data ) { - // 1. - $v = ''; - while ( strlen( $v ) < 6 && self::isHexDigit( $this->nextCharacter ) ) { - $this->consumeCharacter(); - $v .= $this->currentCharacter; - } - $anyQ = false; - while ( strlen( $v ) < 6 && $this->nextCharacter === '?' ) { - $anyQ = true; - $this->consumeCharacter(); - $v .= $this->currentCharacter; - } - - if ( $anyQ ) { - return new Token( Token::T_UNICODE_RANGE, $data + [ - 'start' => intval( str_replace( '?', '0', $v ), 16 ), - 'end' => intval( str_replace( '?', 'F', $v ), 16 ), - ] ); - } - - $data['start'] = intval( $v, 16 ); - - // 2. - list( $next, $next2 ) = $this->lookAhead(); - if ( $next === '-' && self::isHexDigit( $next2 ) ) { - $this->consumeCharacter(); - $v = ''; - while ( strlen( $v ) < 6 && self::isHexDigit( $this->nextCharacter ) ) { - $this->consumeCharacter(); - $v .= $this->currentCharacter; - } - $data['end'] = intval( $v, 16 ); - } else { - // 3. - $data['end'] = $data['start']; - } - - // 4. 
- return new Token( Token::T_UNICODE_RANGE, $data ); - } - /** * Indicate if a character is whitespace - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#whitespace + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#whitespace * @param string $char A single UTF-8 character * @return bool */ @@ -668,7 +564,7 @@ class DataSourceTokenizer implements Tokenizer { /** * Indicate if a character is a name-start code point - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#name-start-code-point + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#name-start-code-point * @param string $char A single UTF-8 character * @return bool */ @@ -676,14 +572,14 @@ class DataSourceTokenizer implements Tokenizer { // Every non-ASCII character is a name start character, so we can just // check the first byte. $char = ord( $char ); - return $char >= 0x41 && $char <= 0x5a || - $char >= 0x61 && $char <= 0x7a || + return ( $char >= 0x41 && $char <= 0x5a ) || + ( $char >= 0x61 && $char <= 0x7a ) || $char >= 0x80 || $char === 0x5f; } /** * Indicate if a character is a name code point - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#name-code-point + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#name-code-point * @param string $char A single UTF-8 character * @return bool */ @@ -691,15 +587,15 @@ class DataSourceTokenizer implements Tokenizer { // Every non-ASCII character is a name character, so we can just check // the first byte. 
$char = ord( $char ); - return $char >= 0x41 && $char <= 0x5a || - $char >= 0x61 && $char <= 0x7a || - $char >= 0x30 && $char <= 0x39 || + return ( $char >= 0x41 && $char <= 0x5a ) || + ( $char >= 0x61 && $char <= 0x7a ) || + ( $char >= 0x30 && $char <= 0x39 ) || $char >= 0x80 || $char === 0x5f || $char === 0x2d; } /** * Indicate if a character is non-printable - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#non-printable-code-point + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#non-printable-code-point * @param string $char A single UTF-8 character * @return bool */ @@ -707,15 +603,15 @@ class DataSourceTokenizer implements Tokenizer { // No non-ASCII character is non-printable, so we can just check the // first byte. $char = ord( $char ); - return $char >= 0x00 && $char <= 0x08 || + return ( $char >= 0x00 && $char <= 0x08 ) || $char === 0x0b || - $char >= 0x0e && $char <= 0x1f || + ( $char >= 0x0e && $char <= 0x1f ) || $char === 0x7f; } /** * Indicate if a character is a digit - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#digit + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#digit * @param string $char A single UTF-8 character * @return bool */ @@ -728,7 +624,7 @@ class DataSourceTokenizer implements Tokenizer { /** * Indicate if a character is a hex digit - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#hex-digit + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#hex-digit * @param string $char A single UTF-8 character * @return bool */ @@ -736,14 +632,14 @@ class DataSourceTokenizer implements Tokenizer { // No non-ASCII character is a hex digit, so we can just check the // first byte. 
$char = ord( $char ); - return $char >= 0x30 && $char <= 0x39 || - $char >= 0x41 && $char <= 0x46 || - $char >= 0x61 && $char <= 0x66; + return ( $char >= 0x30 && $char <= 0x39 ) || + ( $char >= 0x41 && $char <= 0x46 ) || + ( $char >= 0x61 && $char <= 0x66 ); } /** * Determine if two characters constitute a valid escape - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#starts-with-a-valid-escape + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#starts-with-a-valid-escape * @param string $char1 * @param string $char2 * @return bool @@ -754,7 +650,7 @@ class DataSourceTokenizer implements Tokenizer { /** * Determine if three characters would start an identifier - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#would-start-an-identifier + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#would-start-an-identifier * @param string $char1 * @param string $char2 * @param string $char3 @@ -762,7 +658,6 @@ class DataSourceTokenizer implements Tokenizer { */ protected static function wouldStartIdentifier( $char1, $char2, $char3 ) { if ( $char1 === '-' ) { - // Added the possibility for an itentifier beginning with "--" per the draft. return self::isNameStartCharacter( $char2 ) || $char2 === '-' || self::isValidEscape( $char2, $char3 ); } elseif ( self::isNameStartCharacter( $char1 ) ) { @@ -776,7 +671,7 @@ class DataSourceTokenizer implements Tokenizer { /** * Determine if three characters would start a number - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#starts-with-a-number + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#starts-with-a-number * @param string $char1 * @param string $char2 * @param string $char3 @@ -785,7 +680,7 @@ class DataSourceTokenizer implements Tokenizer { protected static function wouldStartNumber( $char1, $char2, $char3 ) { if ( $char1 === '+' || $char1 === '-' ) { return self::isDigit( $char2 ) || - $char2 === '.' && self::isDigit( $char3 ); + ( $char2 === '.' 
&& self::isDigit( $char3 ) ); } elseif ( $char1 === '.' ) { return self::isDigit( $char2 ); // @codeCoverageIgnoreStart @@ -801,7 +696,7 @@ class DataSourceTokenizer implements Tokenizer { * * This assumes the leading backslash is consumed. * - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-an-escaped-code-point + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-escaped-code-point * @return string Escaped character */ protected function consumeEscape() { @@ -809,12 +704,6 @@ class DataSourceTokenizer implements Tokenizer { $this->consumeCharacter(); - // @codeCoverageIgnoreStart - if ( $this->currentCharacter === "\n" ) { - throw new \UnexpectedValueException( "[$this->line:$this->pos] Unexpected newline" ); - } - // @codeCoverageIgnoreEnd - // 1-6 hexits, plus one optional whitespace character if ( self::isHexDigit( $this->currentCharacter ) ) { $num = $this->currentCharacter; @@ -827,16 +716,15 @@ class DataSourceTokenizer implements Tokenizer { } $num = intval( $num, 16 ); - if ( $num === 0 || $num >= 0xd800 && $num <= 0xdfff || $num > 0x10ffff ) { - return \UtfNormal\Constants::UTF8_REPLACEMENT; + if ( $num === 0 || ( $num >= 0xd800 && $num <= 0xdfff ) || $num > 0x10ffff ) { + return Constants::UTF8_REPLACEMENT; } - return \UtfNormal\Utils::codepointToUtf8( $num ); + return Utils::codepointToUtf8( $num ); } if ( $this->currentCharacter === DataSource::EOF ) { - // Parse error from the editor's draft as of 2017-01-06 $this->parseError( 'bad-escape', $position ); - return \UtfNormal\Constants::UTF8_REPLACEMENT; + return Constants::UTF8_REPLACEMENT; } return $this->currentCharacter; @@ -849,7 +737,7 @@ class DataSourceTokenizer implements Tokenizer { * self::wouldStartIdentifier() or the like before calling the method if * necessary. 
* - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-name + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-name * @return string Name */ protected function consumeName() { @@ -863,13 +751,13 @@ class DataSourceTokenizer implements Tokenizer { } elseif ( self::isValidEscape( $this->currentCharacter, $this->nextCharacter ) ) { $name .= $this->consumeEscape(); } else { - $this->reconsumeCharacter(); // Doesn't say to, but breaks otherwise - return $name; + $this->reconsumeCharacter(); + break; } } - // @codeCoverageIgnoreStart + + return $name; } - // @codeCoverageIgnoreEnd /** * Consume a number @@ -877,8 +765,9 @@ class DataSourceTokenizer implements Tokenizer { * Note this does not do validation on the input stream. Call * self::wouldStartNumber() before calling the method if necessary. * - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-number + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-number * @return array [ string $value, int|float $number, string $type ('integer' or 'number') ] + * @suppress PhanPluginDuplicateAdjacentStatement */ protected function consumeNumber() { // 1. @@ -899,7 +788,7 @@ class DataSourceTokenizer implements Tokenizer { // 4. if ( $this->nextCharacter === '.' ) { - list( $next, $next2, $next3 ) = $this->lookAhead(); + [ $next, $next2, ] = $this->lookAhead(); if ( self::isDigit( $next2 ) ) { // 4.1. $this->consumeCharacter(); @@ -918,7 +807,7 @@ class DataSourceTokenizer implements Tokenizer { // 5. if ( $this->nextCharacter === 'e' || $this->nextCharacter === 'E' ) { - list( $next, $next2, $next3 ) = $this->lookAhead(); + [ $next, $next2, $next3 ] = $this->lookAhead(); $ok = false; if ( ( $next2 === '+' || $next2 === '-' ) && self::isDigit( $next3 ) ) { $ok = true; @@ -948,7 +837,7 @@ class DataSourceTokenizer implements Tokenizer { } // 6. 
We assume PHP's casting follows the same rules as - // https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#convert-a-string-to-a-number + // https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#convert-string-to-number $value = $type === 'integer' ? (int)$repr : (float)$repr; // 7. diff --git a/lib/css-sanitizer/Wikimedia/CSS/Parser/Encoder.php b/lib/css-sanitizer/Wikimedia/CSS/Parser/Encoder.php index 5691d987e..99b0ab057 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Parser/Encoder.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Parser/Encoder.php @@ -6,9 +6,15 @@ namespace Wikimedia\CSS\Parser; +use RuntimeException; +use UtfNormal\Constants; +use UtfNormal\Utils; +use Wikimedia\AtEase\AtEase; + /** * Character set conversion for CSS - * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#input-byte-stream + * + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#input-byte-stream */ class Encoder { @@ -96,9 +102,12 @@ class Encoder { 'iso_8859-8' => 'ISO-8859-8', 'iso_8859-8:1988' => 'ISO-8859-8', 'visual' => 'ISO-8859-8', - 'csiso88598i' => 'ISO-8859-8', // ISO-8859-8-I? - 'iso-8859-8-i' => 'ISO-8859-8', // ISO-8859-8-I? - 'logical' => 'ISO-8859-8', // ISO-8859-8-I? + // ISO-8859-8-I? + 'csiso88598i' => 'ISO-8859-8', + // ISO-8859-8-I? + 'iso-8859-8-i' => 'ISO-8859-8', + // ISO-8859-8-I? 
+ 'logical' => 'ISO-8859-8', 'csisolatin6' => 'ISO-8859-10', 'iso-8859-10' => 'ISO-8859-10', 'iso-ir-157' => 'ISO-8859-10', @@ -188,15 +197,24 @@ class Encoder { 'x-cp1258' => 'Windows-1258', 'x-mac-cyrillic' => 'mac-cyrillic', 'x-mac-ukrainian' => 'mac-cyrillic', - 'chinese' => 'GB18030', // GBK - 'csgb2312' => 'GB18030', // GBK - 'csiso58gb231280' => 'GB18030', // GBK - 'gb2312' => 'GB18030', // GBK - 'gb_2312' => 'GB18030', // GBK - 'gb_2312-80' => 'GB18030', // GBK - 'gbk' => 'GB18030', // GBK - 'iso-ir-58' => 'GB18030', // GBK - 'x-gbk' => 'GB18030', // GBK + // GBK + 'chinese' => 'GB18030', + // GBK + 'csgb2312' => 'GB18030', + // GBK + 'csiso58gb231280' => 'GB18030', + // GBK + 'gb2312' => 'GB18030', + // GBK + 'gb_2312' => 'GB18030', + // GBK + 'gb_2312-80' => 'GB18030', + // GBK + 'gbk' => 'GB18030', + // GBK + 'iso-ir-58' => 'GB18030', + // GBK + 'x-gbk' => 'GB18030', 'gb18030' => 'GB18030', 'big5' => 'BIG-5', 'big5-hkscs' => 'BIG-5', @@ -231,6 +249,7 @@ class Encoder { 'iso-2022-cn' => 'replacement', 'iso-2022-cn-ext' => 'replacement', 'iso-2022-kr' => 'replacement', + 'replacement' => 'replacement', 'utf-16be' => 'UTF-16BE', 'utf-16' => 'UTF-16LE', 'utf-16le' => 'UTF-16LE', @@ -247,7 +266,7 @@ class Encoder { */ public static function convert( $text, $encodings = [] ) { // First, check for a BOM and honor that if it's present. 
- if ( substr( $text, 0, 3 ) === "\xef\xbb\xbf" ) { + if ( strpos( $text, "\xef\xbb\xbf" ) === 0 ) { // UTF-8 with BOM (convert it anyway in case the BOM is a lie) return self::doConvert( 'UTF-8', substr( $text, 3 ) ); } @@ -300,13 +319,13 @@ class Encoder { protected static function doConvert( $encoding, $text ) { // Pseudo-encoding that just outputs one replacement character if ( $encoding === 'replacement' ) { - return \UtfNormal\Constants::UTF8_REPLACEMENT; + return Constants::UTF8_REPLACEMENT; } // Pseudo-encoding that shifts non-ASCII bytes to the BMP private use area if ( $encoding === 'x-user-defined' ) { - return preg_replace_callback( '/[\x80-\xff]/', function ( $m ) { - return \UtfNormal\Utils::codepointToUtf8( 0xf700 + ord( $m[0] ) ); + return preg_replace_callback( '/[\x80-\xff]/', static function ( $m ) { + return Utils::codepointToUtf8( 0xf700 + ord( $m[0] ) ); }, $text ); } @@ -315,15 +334,15 @@ class Encoder { // some encodings mbstring doesn't support. if ( in_array( $encoding, mb_list_encodings(), true ) ) { $old = mb_substitute_character(); - mb_substitute_character( \UtfNormal\Constants::UNICODE_REPLACEMENT ); + mb_substitute_character( Constants::UNICODE_REPLACEMENT ); $text = mb_convert_encoding( $text, 'UTF-8', $encoding ); mb_substitute_character( $old ); return $text; } - $ret = \MediaWiki\quietCall( 'iconv', $encoding, 'UTF-8', $text ); + $ret = AtEase::quietCall( 'iconv', $encoding, 'UTF-8', $text ); if ( $ret === false ) { - throw new \RuntimeException( "Cannot convert '$text' from $encoding" ); + throw new RuntimeException( "Cannot convert '$text' from $encoding" ); } return $ret; } diff --git a/lib/css-sanitizer/Wikimedia/CSS/Parser/Parser.php b/lib/css-sanitizer/Wikimedia/CSS/Parser/Parser.php index d1f152f9f..f253c153f 100644 --- a/lib/css-sanitizer/Wikimedia/CSS/Parser/Parser.php +++ b/lib/css-sanitizer/Wikimedia/CSS/Parser/Parser.php @@ -7,29 +7,28 @@ namespace Wikimedia\CSS\Parser; use Wikimedia\CSS\Objects\AtRule; -use 
Wikimedia\CSS\Objects\ComponentValueList; use Wikimedia\CSS\Objects\ComponentValue; +use Wikimedia\CSS\Objects\ComponentValueList; use Wikimedia\CSS\Objects\CSSFunction; +use Wikimedia\CSS\Objects\Declaration; use Wikimedia\CSS\Objects\DeclarationList; use Wikimedia\CSS\Objects\DeclarationOrAtRuleList; -use Wikimedia\CSS\Objects\Declaration; use Wikimedia\CSS\Objects\QualifiedRule; use Wikimedia\CSS\Objects\Rule; use Wikimedia\CSS\Objects\RuleList; use Wikimedia\CSS\Objects\SimpleBlock; use Wikimedia\CSS\Objects\Stylesheet; use Wikimedia\CSS\Objects\Token; -use Wikimedia\CSS\Sanitizer\Sanitizer; // Note: While reading the code below, you might find that my calls to -// consumeToken() don't match what the spec says and I don't ever "reconsume" a +// consumeToken() don't match what the spec says, and I don't ever "reconsume" a // token. It turns out that the spec is overcomplicated and confused with // respect to the "current input token" and the "next input token". It turns // out things are pretty simple: every "consume an X" is called with the // current input token being the first token of X, and returns with the current // input token being the last token of X (or EOF if X ends at EOF). -// Also of note is that, since our Tokenizer can only return a stream of tokens +// Also, of note is that, since our Tokenizer can only return a stream of tokens // rather than a stream of component values, the consume functions here only // consider tokens. ComponentValueList::toTokenArray() may be used to convert a // list of component values to a list of tokens if necessary. @@ -38,15 +37,19 @@ use Wikimedia\CSS\Sanitizer\Sanitizer; * Parse CSS into a structure for further processing. * * This implements the CSS Syntax Module Level 3 candidate recommendation. 
- * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/ + * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/ * * The usual entry points are: * - Parser::parseStylesheet() to parse a stylesheet or the contents of a