• Main Page
  • Related Pages
  • Namespaces
  • Data Structures
  • Files
  • Examples
  • File List
  • Globals

cms/openid/Services/Yadis/ParseHTML.php

Go to the documentation of this file.
00001 <?php
00002 
/**
 * Regex-based extractor for <meta> tags found in the <head> of an
 * HTML document, used to locate the Yadis / XRDS location advertised
 * through an http-equiv meta tag.
 *
 * This is not a full HTML parser: it strips comments, CDATA sections
 * and scripts, cuts the document off at the body, and then applies
 * regular expressions to what is left of the head.
 */
class Services_Yadis_ParseHTML {

    /**
     * PCRE modifiers appended to every expression built by this class
     * ("s": dot matches newline, "i": case-insensitive).
     */
    var $_re_flags = "si";

    /**
     * sprintf() template matching a tag and its contents. The first
     * %s is the tag name, the second %s is the name (or alternation of
     * names) whose opening/closing tag ends the element.
     */
    var $_tag_expr = "<%s\b(?!:)([^>]*?)(?:\/>|>(.*?)(?:<\/?%s\s*>|\Z))";

    /**
     * sprintf() template matching an attribute-less opening tag or a
     * closing tag; %s is the tag name. No delimiters are included.
     */
    var $_close_tag_expr = "<\/?%s\s*>";

    /**
     * Markup removed before parsing: comments, CDATA sections and
     * script elements. The constructor wraps this in delimiters.
     */
    var $_removed_re =
           "<!--.*?-->|<!\[CDATA\[.*?\]\]>|<script\b(?!:)[^>]*>.*?<\/script>";

    /**
     * Matches one name=value attribute; group 1 is the name, group 2
     * the (possibly quoted) value. The constructor wraps this in
     * delimiters.
     */
    var $_attr_find = '\b([-\w]+)=("[^"]*"|\'[^\']*\'|[^\'"\s\/<>]+)';

    /**
     * Complete expression matching a <meta ...> tag. Built by the
     * constructor; null until then (also used as the "already
     * initialised" marker).
     */
    var $_meta_find = null;

    /**
     * Map of named entity (without "&" and ";") to replacement text.
     */
    var $_entity_replacements = array();

    /**
     * Expression fragment matching any supported named entity.
     */
    var $_ent_replace = null;

    /**
     * Build the delimited regular expressions and the entity table.
     *
     * Initialisation is idempotent: $_removed_re and $_attr_find are
     * rewritten in place, so running it twice would otherwise wrap
     * them in delimiters twice.
     */
    function __construct()
    {
        if ($this->_meta_find !== null) {
            return;
        }

        $this->_meta_find = sprintf("/<meta\b(?!:)([^>]*)(?!<)>/%s",
                                    $this->_re_flags);

        $this->_removed_re = sprintf("/%s/%s",
                                     $this->_removed_re,
                                     $this->_re_flags);

        $this->_attr_find = sprintf("/%s/%s",
                                    $this->_attr_find,
                                    $this->_re_flags);

        $this->_entity_replacements = array(
                                            'amp' => '&',
                                            'lt' => '<',
                                            'gt' => '>',
                                            'quot' => '"'
                                            );

        // Alternation of the entity *names* (the keys); the values are
        // the replacement characters and do not belong in the pattern.
        $this->_ent_replace =
            sprintf("&(%s);", implode("|",
                                      array_keys($this->_entity_replacements)));
    }

    /**
     * PHP 4-style constructor, kept so that existing code calling it
     * by name keeps working. PHP 8 no longer treats a method named
     * after the class as a constructor, hence __construct() above.
     */
    function Services_Yadis_ParseHTML()
    {
        $this->__construct();
    }

    /**
     * Replace the supported named entities (&amp; &lt; &gt; &quot;)
     * and numeric entities (&#NN; and &#xHH;) with the characters they
     * stand for.
     *
     * All entities are decoded in a single pass so that the output of
     * one replacement is never decoded again ("&amp;lt;" yields
     * "&lt;", not "<"). Numeric values are reduced to a single byte,
     * as chr() does.
     *
     * @param string $str Text possibly containing entities
     * @return string The text with entities replaced
     */
    function replaceEntities($str)
    {
        $names = array();
        foreach (array_keys($this->_entity_replacements) as $name) {
            $names[] = preg_quote($name, '~');
        }

        // Group 1: hex digits, group 2: decimal digits, group 3: name.
        // Only the hexadecimal form is matched case-insensitively.
        $pattern = sprintf('~&(?:#(?i:x([0-9a-f]+))|#([0-9]+)|(%s));~',
                           implode('|', $names));

        $result = preg_replace_callback($pattern,
                                        array($this, '_decodeEntity'),
                                        $str);

        // preg_replace_callback() returns null on a PCRE error; fall
        // back to the undecoded input rather than losing the value.
        return ($result === null) ? $str : $result;
    }

    /**
     * preg_replace_callback() helper for replaceEntities().
     *
     * @param array $match Match groups (1 = hex digits, 2 = decimal
     * digits, 3 = entity name)
     * @return string The replacement text
     */
    function _decodeEntity($match)
    {
        if (isset($match[3]) && $match[3] !== '') {
            return $this->_entity_replacements[$match[3]];
        }

        if (isset($match[2]) && $match[2] !== '') {
            // Explicit decimal conversion: a leading zero must not be
            // read as an octal literal.
            return chr(((int) $match[2]) & 0xFF);
        }

        if (isset($match[1]) && $match[1] !== '') {
            return chr(((int) hexdec($match[1])) & 0xFF);
        }

        return $match[0];
    }

    /**
     * Strip one pair of matching surrounding quotes (double or
     * single) from a string.
     *
     * @param string $str The possibly quoted string
     * @return string The string without the surrounding quotes, or
     * the string unchanged if it is not quoted
     */
    function removeQuotes($str)
    {
        $matches = array();
        // "s" lets the value span several lines; "D" anchors "$" at
        // the very end of the string.
        $double = '/^"(.*)"$/sD';
        $single = "/^'(.*)'\$/sD";

        if (preg_match($double, $str, $matches)) {
            return $matches[1];
        } else if (preg_match($single, $str, $matches)) {
            return $matches[1];
        } else {
            return $str;
        }
    }

    /**
     * Build a complete regular expression matching the given tag and
     * its contents.
     *
     * @param string $tag_name Name of the tag to match
     * @param array $close_tags Optional names of other tags that also
     * terminate the element
     * @return string A delimited regular expression
     */
    function tagMatcher($tag_name, $close_tags = null)
    {
        if ($close_tags) {
            $options = implode("|", array_merge(array($tag_name), $close_tags));
            $closer = sprintf("(?:%s)", $options);
        } else {
            $closer = $tag_name;
        }

        $expr = sprintf($this->_tag_expr, $tag_name, $closer);
        return sprintf("/%s/%s", $expr, $this->_re_flags);
    }

    /**
     * Return the expression matching the <html> element (terminated
     * by an html or body tag).
     *
     * @param mixed $str Unused; kept for interface compatibility
     * @return string A delimited regular expression
     */
    function htmlFind($str)
    {
        return $this->tagMatcher('html', array('body'));
    }

    /**
     * Return the expression matching the <head> element (terminated
     * by a head or body tag).
     *
     * @return string A delimited regular expression
     */
    function headFind()
    {
        return $this->tagMatcher('head', array('body'));
    }

    /**
     * Extract the attributes of every <meta> tag found in the head of
     * an HTML document.
     *
     * @param string $html_string The HTML document
     * @return array A list of arrays, one per <meta> tag, mapping
     * lower-cased attribute names to their unquoted, entity-decoded
     * values. Empty if no head or no meta tag is found.
     */
    function getMetaTags($html_string)
    {
        // Drop comments, CDATA sections and scripts so that tags
        // inside them are not mistaken for real markup.
        $stripped = preg_replace($this->_removed_re,
                                 "",
                                 $html_string);
        if ($stripped !== null) {
            $html_string = $stripped;
        }

        // Look for an attribute-less opening or a closing body tag
        // and discard everything from there on. The template has no
        // delimiters of its own, so add them (and the flags) here.
        $body_closer = sprintf("/%s/%s",
                               sprintf($this->_close_tag_expr, 'body'),
                               $this->_re_flags);
        $body_matches = array();
        preg_match($body_closer, $html_string, $body_matches,
                   PREG_OFFSET_CAPTURE);
        if ($body_matches) {
            $html_string = substr($html_string, 0, $body_matches[0][1]);
        }

        // Look for the opening body tag, and discard everything after
        // that tag.
        $body_re = $this->tagMatcher('body');
        $body_matches = array();
        preg_match($body_re, $html_string, $body_matches, PREG_OFFSET_CAPTURE);
        if ($body_matches) {
            $html_string = substr($html_string, 0, $body_matches[0][1]);
        }

        // If an HTML tag is found at all, it must be in the right
        // order; else, it may be missing (which is a case we allow
        // for).
        $html_re = $this->tagMatcher('html', array('body'));
        $html_matches = array();
        preg_match($html_re, $html_string, $html_matches);
        if ($html_matches) {
            $html = $html_matches[0];
        } else {
            $html = $html_string;
        }

        // Try to find the <HEAD> tag.
        $head_re = $this->headFind();
        $head_matches = array();
        if (!preg_match($head_re, $html, $head_matches)) {
            return array();
        }

        $link_data = array();
        $link_matches = array();

        if (!preg_match_all($this->_meta_find, $head_matches[0],
                            $link_matches)) {
            return array();
        }

        foreach ($link_matches[0] as $link) {
            $attr_matches = array();
            preg_match_all($this->_attr_find, $link, $attr_matches);
            $link_attrs = array();
            foreach ($attr_matches[0] as $index => $full_match) {
                $name = $attr_matches[1][$index];
                $value = $this->replaceEntities(
                              $this->removeQuotes($attr_matches[2][$index]));

                $link_attrs[strtolower($name)] = $value;
            }
            $link_data[] = $link_attrs;
        }

        return $link_data;
    }

    /**
     * Return the content of the first <meta> tag whose http-equiv
     * attribute is X-XRDS-Location or X-Yadis-Location
     * (case-insensitive) and which has a content attribute.
     *
     * @param string $html_string The HTML document
     * @return mixed The content attribute value, or null if no such
     * tag is found
     */
    function getHTTPEquiv($html_string)
    {
        $meta_tags = $this->getMetaTags($html_string);

        if ($meta_tags) {
            foreach ($meta_tags as $tag) {
                if (array_key_exists('http-equiv', $tag) &&
                    (in_array(strtolower($tag['http-equiv']),
                              array('x-xrds-location', 'x-yadis-location'),
                              true)) &&
                    array_key_exists('content', $tag)) {
                    return $tag['content'];
                }
            }
        }

        return null;
    }
}
00269 
00270 ?>

Generated on Sun Jan 2 2011 04:55:32 for Pragyan CMS by doxygen 1.7.1