From b7c69d13b20b3b83eaedfc1cb6c01ffd1114cff5 Mon Sep 17 00:00:00 2001 From: darklee36 Date: Tue, 18 Apr 2023 11:10:59 +0200 Subject: [PATCH 1/2] add test and bugfix in case the href content of 'base' tag is a relative url. --- src/UsesUrls.php | 10 +++++++-- tests/BaseHrefTest.php | 14 ++++++++++++ tests/UrlTest.php | 48 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+), 2 deletions(-) diff --git a/src/UsesUrls.php b/src/UsesUrls.php index 25f0334..71431af 100644 --- a/src/UsesUrls.php +++ b/src/UsesUrls.php @@ -41,7 +41,13 @@ public function currentHost(): string */ public function currentBaseHost(): string { - $uri = Uri::createFromString($this->baseHref() ?? $this->currentUrl()); + //In case baseHref is a relative URL + $currentBase = $this->baseHref(); + if ($currentBase === null || !preg_match('/^https?:\/\//', $currentBase)) { + $currentBase = $this->currentUrl(); + } + + $uri = Uri::createFromString($currentBase); return $uri->getScheme() . '://' . $uri->getHost(); } @@ -61,7 +67,7 @@ public function makeUrlAbsolute(?string $url = null, string $baseUrl = null): ?s // Resolve the Url using one of the provided/set base href. return (string) UriResolver::resolve( Http::createFromString($url), - Http::createFromString($baseUrl ?? $this->baseHref() ?? $this->currentBaseHost()), + Http::createFromString($baseUrl ?? $this->currentBaseHost()), ); } } diff --git a/tests/BaseHrefTest.php b/tests/BaseHrefTest.php index 63782f9..69a4cdd 100644 --- a/tests/BaseHrefTest.php +++ b/tests/BaseHrefTest.php @@ -41,4 +41,18 @@ public function testBaseHref() $web->baseHref ); } + + public function testBaseHrefContainRelativePath() + { + $web = new \Spekulatius\PHPScraper\PHPScraper; + + // Navigate to the test page. 
+ // Contains: <base href="/myglasgow/digitalaccessibility/"> (relative path) + $web->go('https://www.gla.ac.uk/myglasgow/digitalaccessibility/'); + // Check the baseHref + $this->assertSame( + '/myglasgow/digitalaccessibility/', + $web->baseHref + ); + } } diff --git a/tests/UrlTest.php b/tests/UrlTest.php index 822bb41..572b6eb 100644 --- a/tests/UrlTest.php +++ b/tests/UrlTest.php @@ -67,6 +67,24 @@ public function testCurrentBaseHostWithBase() ); } + /** + * @test + */ + public function testCurrentBaseHostWithBaseIsRelativeUri() + { + $web = new \Spekulatius\PHPScraper\PHPScraper; + + // Navigate to the test page. + // Contains: <base href="/myglasgow/digitalaccessibility/"> + $web->go('https://www.gla.ac.uk/myglasgow/digitalaccessibility/'); + + // Check the base href being passed through the current base host. + $this->assertSame( + 'https://www.gla.ac.uk', + $web->currentBaseHost + ); + } + /** * Basic processing of the URLs. * @@ -167,6 +185,36 @@ public function testMakeUrlAbsoluteConsiderBaseHref() ); } + /** + * Special case where the base href is a relative URL. So we need to use the current base host. + * + * @test + */ + public function testMakeUrlAbsoluteConsiderBaseHrefIsRelativeUrl() + { + $web = new \Spekulatius\PHPScraper\PHPScraper; + + /** + * Navigate to test page: This sets the base URL. + * + * It contains: + * + * ```html + * <base href="/myglasgow/digitalaccessibility/"> + * ``` + * + * While it's located on `test-pages.phpscraper.de`. + * + * This page isn't actually used. It's purely to set the context. + */ + $web->go('https://www.gla.ac.uk/myglasgow/digitalaccessibility/'); + + $this->assertSame( + 'https://www.gla.ac.uk/test/index.html', + $web->makeUrlAbsolute('test/index.html'), + ); + } + /** * Test if passed in hosts are considered. It trumps any base-href and current url.
* From 9243395a6bf617831319a6af236df3dcc41ec6b3 Mon Sep 17 00:00:00 2001 From: pierrick-delamotte Date: Tue, 2 May 2023 14:27:30 +0200 Subject: [PATCH 2/2] change url from glasgow university to a phpscraper-test-pages one's --- tests/BaseHrefTest.php | 8 ++++---- tests/UrlTest.php | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/BaseHrefTest.php b/tests/BaseHrefTest.php index 69a4cdd..b4be71a 100644 --- a/tests/BaseHrefTest.php +++ b/tests/BaseHrefTest.php @@ -44,14 +44,14 @@ public function testBaseHref() public function testBaseHrefContainRelativePath() { - $web = new \Spekulatius\PHPScraper\PHPScraper; + $web = new \Spekulatius\PHPScraper\PHPScraper(['disable_ssl' => true]); // Navigate to the test page. - // Contains: <base href="/myglasgow/digitalaccessibility/"> (relative path) - $web->go('https://www.gla.ac.uk/myglasgow/digitalaccessibility/'); + // Contains: <base href="/links/invalid-base-href.html"> (relative path) + $web->go('https://test-pages.phpscraper.de/links/invalid-base-href.html'); // Check the baseHref $this->assertSame( - '/myglasgow/digitalaccessibility/', + '/links/invalid-base-href.html', $web->baseHref ); } diff --git a/tests/UrlTest.php b/tests/UrlTest.php index 572b6eb..f695edf 100644 --- a/tests/UrlTest.php +++ b/tests/UrlTest.php @@ -75,12 +75,12 @@ public function testCurrentBaseHostWithBaseIsRelativeUri() $web = new \Spekulatius\PHPScraper\PHPScraper; // Navigate to the test page. - // Contains: <base href="/myglasgow/digitalaccessibility/"> - $web->go('https://www.gla.ac.uk/myglasgow/digitalaccessibility/'); + // Contains: <base href="/links/invalid-base-href.html"> + $web->go('https://test-pages.phpscraper.de/links/invalid-base-href.html'); // Check the base href being passed through the current base host. $this->assertSame( - 'https://www.gla.ac.uk', + 'https://test-pages.phpscraper.de', $web->currentBaseHost ); } @@ -200,17 +200,17 @@ public function testMakeUrlAbsoluteConsiderBaseHrefIsRelativeUrl() * It contains: * * ```html - * <base href="/myglasgow/digitalaccessibility/"> + * <base href="/links/invalid-base-href.html"> * ``` * * While it's located on `test-pages.phpscraper.de`. * * This page isn't actually used. It's purely to set the context.
*/ - $web->go('https://www.gla.ac.uk/myglasgow/digitalaccessibility/'); + $web->go('https://test-pages.phpscraper.de/links/invalid-base-href.html'); $this->assertSame( - 'https://www.gla.ac.uk/test/index.html', + 'https://test-pages.phpscraper.de/test/index.html', $web->makeUrlAbsolute('test/index.html'), ); }