From 88a8e284b2825cbd77c75dbbbf3655a961d9eb09 Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Thu, 17 Dec 2020 13:56:24 +0100 Subject: [PATCH] Fix metadata extract regex (2) Reference: https://stackoverflow.com/questions/8055727/negating-a-backreference-in-regular-expressions Fixes #1656 --- application/bookmark/LinkUtils.php | 6 ++++-- tests/bookmark/LinkUtilsTest.php | 10 ++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php index d65e97ed..0ab2d213 100644 --- a/application/bookmark/LinkUtils.php +++ b/application/bookmark/LinkUtils.php @@ -68,11 +68,13 @@ function html_extract_tag($tag, $html) $properties = implode('|', $propertiesKey); // We need a OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"' $orCondition = '["\']?(?:og:)?' . $tag . '["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]'; + // Support quotes in double quoted content, and the other way around + $content = 'content=(["\'])((?:(?!\1).)*)\1'; // Try to retrieve OpenGraph tag. - $ogRegex = '#]+(?:' . $properties . ')=(?:' . $orCondition . ')[^>]*content=(["\'])([^\1]*?)\1.*?>#'; + $ogRegex = '#]+(?:' . $properties . ')=(?:' . $orCondition . ')[^>]*' . $content . '.*?>#'; // If the attributes are not in the order property => content (e.g. Github) // New regex to keep this readable... more or less. - $ogRegexReverse = '#]+content=(["\'])([^\1]*?)\1[^>]+(?:' . $properties . ')=(?:' . $orCondition . ').*?>#'; + $ogRegexReverse = '#]+' . $content . '[^>]+(?:' . $properties . ')=(?:' . $orCondition . ').*?>#'; if ( preg_match($ogRegex, $html, $matches) > 0 diff --git a/tests/bookmark/LinkUtilsTest.php b/tests/bookmark/LinkUtilsTest.php index ddab4e3c..46a7f1fe 100644 --- a/tests/bookmark/LinkUtilsTest.php +++ b/tests/bookmark/LinkUtilsTest.php @@ -245,6 +245,16 @@ public function testHtmlExtractNonExistentOgTag() $this->assertFalse(html_extract_tag('description', $html)); } + public function testHtmlExtractDescriptionFromGoogleRealCase(): void + { + $html = 'id="gsr">'. + ''. + 'assertSame('Bonnes fêtes de fin d\'année ! #GoogleDoodle', html_extract_tag('description', $html)); + } + /** * Test the header callback with valid value */