diff --git a/src/Nodes/NodeTrait.php b/src/Nodes/NodeTrait.php index c1e93c7..9616a4d 100644 --- a/src/Nodes/NodeTrait.php +++ b/src/Nodes/NodeTrait.php @@ -9,7 +9,11 @@ use DOMNodeList; /** - * @method \DOMNode removeAttribute($name) + * @property ?DOMNode $firstChild + * @property ?DOMNode $lastChild + * @property ?DOMNode $parentNode + * @property ?DOMNode $nextSibling + * @property ?DOMNode $previousSibling */ trait NodeTrait { @@ -525,7 +529,6 @@ public function isWhitespace() */ public function shiftingAwareGetElementsByTagName($tag) { - /** @var $nodes DOMNodeList */ $nodes = $this->getElementsByTagName($tag); $count = $nodes->length; @@ -547,13 +550,13 @@ public function shiftingAwareGetElementsByTagName($tag) * Mimics JS's firstElementChild property. PHP only has firstChild which could be any type of DOMNode. Use this * function to get the first one that is an DOMElement node. * - * @return \DOMElement|null + * @return DOMElement|null */ public function getFirstElementChild() { if ($this->childNodes instanceof \Traversable) { foreach ($this->childNodes as $node) { - if ($node instanceof \DOMElement) { + if ($node instanceof DOMElement) { return $node; } } diff --git a/src/Nodes/NodeUtility.php b/src/Nodes/NodeUtility.php index 56de705..b992096 100644 --- a/src/Nodes/NodeUtility.php +++ b/src/Nodes/NodeUtility.php @@ -70,7 +70,7 @@ public static function nextNode($node) * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new * element with the new tag name and importing it to the main DOMDocument. * - * @param DOMNode $node + * @param DOMNode|DOMElement $node * @param string $value * @param bool $importAttributes * @@ -136,7 +136,7 @@ public static function removeNode($node) * Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally * for parents. * - * @param DOMNode $originalNode + * @param DOMNode|DOMElement $originalNode * @param bool $ignoreSelfAndKids * * @return DOMNode diff --git a/src/Readability.php b/src/Readability.php index a070c6a..f809593 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -300,7 +300,7 @@ public function parse(?string $html = null) * * @param string $html */ - public function loadHTML(string $html) + public function loadHTML(string $html): void { $this->logger->debug('[Loading] Loading HTML...'); @@ -378,7 +378,7 @@ private function getJSONLD(DOMDocument $dom): array if ( !isset($parsed['@context']) || !is_string($parsed['@context']) || - !preg_match('/^https?\:\/\/schema\.org$/', $parsed['@context']) + !preg_match('/^https?:\/\/schema\.org$/', $parsed['@context']) ) { return $metadata; } @@ -459,7 +459,7 @@ private function getMetadata() $propertyPattern = '/\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|image|site_name)(?!:)\s*/i'; // name is a single value - $namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image|site_name)(?!:)\s*$/i'; + $namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[.:]\s*)?(author|creator|description|title|image|site_name)(?!:)\s*$/i'; // Find description tags. foreach ($this->dom->getElementsByTagName('meta') as $meta) { @@ -576,10 +576,8 @@ private function getMetadata() /** * Returns all the images of the parsed article. - * - * @return array */ - public function getImages() + public function getImages(): array { $result = []; if ($this->getImage()) { @@ -598,13 +596,15 @@ public function getImages() if ($this->configuration->getFixRelativeURLs()) { foreach ($result as &$imgSrc) { - $imgSrc = $this->toAbsoluteURI($imgSrc); + try { + $imgSrc = $this->toAbsoluteURI($imgSrc); + } catch (\Exception $err) { + // Invalid URL, leave as is + } } } - $result = array_unique(array_filter($result)); - - return $result; + return array_unique(array_filter($result)); } /** @@ -634,7 +634,11 @@ public function getMainImage() } if (!empty($imgUrl) && $this->configuration->getFixRelativeURLs()) { - $this->setImage($this->toAbsoluteURI($imgUrl)); + try { + $this->setImage($this->toAbsoluteURI($imgUrl)); + } catch (\Exception $err) { + // Invalid URL, leave as is + } } } @@ -651,6 +655,7 @@ private function simplifyNestedElements(DOMDocument $article) while ($node) { if ($node->parentNode && in_array($node->nodeName, ['div', 'section']) && !($node->hasAttribute('id') && strpos($node->getAttribute('id'), 'readability') === 0)) { + /** @var DOMElement $node */ if ($node->isElementWithoutContent()) { $node = NodeUtility::removeAndGetNext($node); continue; @@ -702,9 +707,9 @@ private function getArticleTitle() * Sanity warning: if you eval this match in PHPStorm's "Evaluate expression" box, it will return false * I can assure you it works properly if you let the code run. */ - if (preg_match('/ [\|\-\\\\\/>»] /i', $curTitle)) { + if (preg_match('/ [|\-\\\\\/>»] /i', $curTitle)) { $titleHadHierarchicalSeparators = (bool) preg_match('/ [\\\\\/>»] /', $curTitle); - $curTitle = preg_replace('/(.*)[\|\-\\\\\/>»] .*/i', '$1', $originalTitle); + $curTitle = preg_replace('/(.*)[|\-\\\\\/>»] .*/i', '$1', $originalTitle); $this->logger->info(sprintf('[Metadata] Found hierarchical separators in title, new title is: \'%s\'', $curTitle)); @@ -787,7 +792,7 @@ private function toAbsoluteURI($uri) $uri = trim($uri); // If this is already an absolute URI, return it. - if (preg_match('/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/', $uri)) { + if (preg_match('/^[a-zA-Z][a-zA-Z0-9+\-.]*:/', $uri)) { return $uri; } @@ -955,6 +960,7 @@ private function getNodes($node) $p->appendChild($childNode); } } elseif ($p !== null) { + /** @var DOMNode $p */ while ($p->lastChild && $p->lastChild->isWhitespace()) { $p->removeChild($p->lastChild); } @@ -1210,7 +1216,7 @@ private function removeScripts(DOMDocument $dom) * * @param DOMDocument $dom */ - private function prepDocument(DOMDocument $dom) + private function prepDocument(DOMDocument $dom): void { $this->logger->info('[PrepDocument] Preparing document for parsing...'); @@ -1244,6 +1250,7 @@ private function prepDocument(DOMDocument $dom) */ if ($replaced) { + /** @var DOMElement $p */ $p = $dom->createElement('p'); $br->parentNode->replaceChild($p, $br); @@ -1682,7 +1689,7 @@ public function prepArticle(DOMDocument $article) // Remove single-cell tables foreach ($article->shiftingAwareGetElementsByTagName('table') as $table) { - /** @var DOMNode $table */ + /** @var DOMElement $table */ $tbody = $table->hasSingleTagInsideElement('tbody') ? $table->getFirstElementChild() : $table; if ($tbody->hasSingleTagInsideElement('tr')) { $row = $tbody->getFirstElementChild(); @@ -2233,20 +2240,32 @@ public function postProcessContent(DOMDocument $article) if ($src) { $this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($src, 0, 128))); - $media->setAttribute('src', $this->toAbsoluteURI($src)); + try { + $media->setAttribute('src', $this->toAbsoluteURI($src)); + } catch (\Exception $err) { + $this->logger->debug('[PostProcess] Invalid URL encountered'); + } } if ($poster) { $this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($poster, 0, 128))); - $media->setAttribute('poster', $this->toAbsoluteURI($poster)); + try { + $media->setAttribute('poster', $this->toAbsoluteURI($poster)); + } catch (\Exception $err) { + $this->logger->debug('[PostProcess] Invalid URL encountered'); + } + } if ($srcset) { $newSrcset = preg_replace_callback(NodeUtility::$regexps['srcsetUrl'], function ($matches) { $this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($matches[1], 0, 128))); - - return $this->toAbsoluteURI($matches[1]) . $matches[2] . $matches[3]; + try { + return $this->toAbsoluteURI($matches[1]) . $matches[2] . $matches[3]; + } catch (\Exception $err) { + return $matches[1] . $matches[2] . $matches[3]; + } }, $srcset); $media->setAttribute('srcset', $newSrcset); @@ -2267,8 +2286,8 @@ public function postProcessContent(DOMDocument $article) * Iterate over a NodeList, and return the first node that passes * the supplied test function * - * @param NodeList nodeList The NodeList. - * @param Function fn The test function. + * @param array nodeList The NodeList. + * @param callable fn The test function. * @return DOMNode|null */ private function findNode(array $nodeList, callable $fn)