00001 <?php
00002
/**
 * Regex-based extractor for the <meta> tags found in the <head> of an HTML
 * document. Its main entry point, getHTTPEquiv(), returns the "content" of
 * the first <meta> tag whose http-equiv attribute is X-XRDS-Location or
 * X-Yadis-Location.
 *
 * The class deliberately keeps PHP 4/5 compatible syntax ("var", array(),
 * array-style callbacks) while also working on PHP 7 and PHP 8.
 */
class Services_Yadis_ParseHTML {

    /**
     * PCRE modifiers appended to every pattern compiled by this class:
     * "s" (dot matches newlines) and "i" (case-insensitive).
     */
    var $_re_flags = "si";

    /**
     * sprintf() template for matching an element. The first placeholder is
     * the tag name; the second is the name (or alternation of names) whose
     * opening or closing tag terminates the element. End of input also
     * terminates it.
     */
    var $_tag_expr = "<%s\b(?!:)([^>]*?)(?:\/>|>(.*?)(?:<\/?%s\s*>|\Z))";

    /**
     * sprintf() template matching an attribute-less opening or closing tag
     * of the given name. It is an undelimited fragment and must be wrapped
     * in delimiters before being handed to a preg_* function.
     */
    var $_close_tag_expr = "<\/?%s\s*>";

    /**
     * Matches markup that is discarded before parsing: comments, CDATA
     * sections and script elements. Holds the bare expression until the
     * constructor wraps it in delimiters and flags.
     */
    var $_removed_re =
        "<!--.*?-->|<!\[CDATA\[.*?\]\]>|<script\b(?!:)[^>]*>.*?<\/script>";

    /**
     * Matches one name=value attribute; the value may be double-quoted,
     * single-quoted or bare. Holds the bare expression until the
     * constructor wraps it in delimiters and flags.
     */
    var $_attr_find = '\b([-\w]+)=("[^"]*"|\'[^\']*\'|[^\'"\s\/<>]+)';

    /**
     * Compiled pattern matching a single <meta ...> tag. Null until the
     * constructor has run; also used as the "already initialised" marker.
     */
    var $_meta_find = null;

    /** Map of named entity (without "&" and ";") to its replacement. */
    var $_entity_replacements = array();

    /** Undelimited expression matching any of the named entities. */
    var $_ent_replace = null;

    /**
     * PHP 4 style constructor, kept so that existing code calling
     * parent::Services_Yadis_ParseHTML() keeps working. It is declared
     * before __construct() and simply forwards to it.
     */
    function Services_Yadis_ParseHTML()
    {
        $this->__construct();
    }

    /**
     * Compiles the regular expressions and sets up the entity table.
     *
     * PHP 8 no longer treats a method named after the class as a
     * constructor, so initialisation lives here. Running it twice is a
     * no-op; otherwise the already-delimited patterns would be wrapped in
     * delimiters a second time and become invalid.
     */
    function __construct()
    {
        if ($this->_meta_find !== null) {
            return;
        }

        $this->_meta_find = sprintf("/<meta\b(?!:)([^>]*)(?!<)>/%s",
                                    $this->_re_flags);

        $this->_removed_re = sprintf("/%s/%s",
                                     $this->_removed_re,
                                     $this->_re_flags);

        $this->_attr_find = sprintf("/%s/%s",
                                    $this->_attr_find,
                                    $this->_re_flags);

        $this->_entity_replacements = array(
            'amp' => '&',
            'lt' => '<',
            'gt' => '>',
            'quot' => '"'
        );

        // The alternation has to list the entity NAMES (the array keys);
        // imploding the array itself would list the replacement characters.
        $this->_ent_replace =
            sprintf("&(%s);", implode("|",
                                      array_keys($this->_entity_replacements)));
    }

    /**
     * Decodes the named entities listed in $_entity_replacements plus
     * decimal (&#65;) and hexadecimal (&#x41;) character references.
     *
     * Decoding happens in a single pass, so text produced by one
     * replacement is never decoded again ("&amp;lt;" yields "&lt;").
     * Numeric references are turned into a single byte via chr(), i.e.
     * the code point is taken modulo 256.
     *
     * @param string $str Text that may contain entities.
     * @return string The decoded text; the input is returned unchanged if
     *                the regular expression engine reports an error.
     */
    function replaceEntities($str)
    {
        $names = array();
        foreach (array_keys($this->_entity_replacements) as $name) {
            $names[] = preg_quote((string) $name, '/');
        }

        // Group 1: hex digits, group 2: decimal digits, group 3: name.
        $pattern = '/&(?:#[xX]([0-9a-fA-F]+)|#([0-9]+)';
        if ($names) {
            $pattern .= '|(' . implode('|', $names) . ')';
        }
        $pattern .= ');/';

        $decoded = preg_replace_callback($pattern,
                                         array($this, '_decodeEntity'),
                                         $str);

        return ($decoded === null) ? $str : $decoded;
    }

    /**
     * preg_replace_callback() handler used by replaceEntities().
     *
     * @param array $match Match array: [1] hex digits, [2] decimal digits,
     *                     [3] entity name (later groups may be absent).
     * @return string The replacement for the matched entity.
     */
    function _decodeEntity($match)
    {
        if ($match[1] !== '') {
            // Only the last two hex digits matter for a value modulo 256;
            // this also avoids float results for very long digit runs.
            return chr(hexdec(substr($match[1], -2)));
        }

        if ($match[2] !== '') {
            // The explicit cast reads the digits as decimal, so a leading
            // zero is never interpreted as an octal prefix.
            return chr(((int) $match[2]) & 0xFF);
        }

        return $this->_entity_replacements[$match[3]];
    }

    /**
     * Strips one pair of matching surrounding quotes (single or double)
     * from a string. Strings that are not fully enclosed in a matching
     * pair are returned unchanged.
     *
     * @param string $str Possibly quoted attribute value.
     * @return string The value without its surrounding quotes.
     */
    function removeQuotes($str)
    {
        $matches = array();

        // "s" lets the value span several lines, which the attribute
        // pattern permits inside quotes.
        if (preg_match('/^(["\'])(.*)\\1$/s', $str, $matches)) {
            return $matches[2];
        }

        return $str;
    }

    /**
     * Builds a delimited, flagged pattern matching the element $tag_name.
     *
     * Names are inserted into the expression verbatim (they are not
     * escaped), so they are expected to be plain tag names.
     *
     * @param string     $tag_name   Name of the element to match.
     * @param array|null $close_tags Additional tag names whose opening or
     *                               closing tag also ends the element.
     * @return string A pattern usable with the preg_* functions.
     */
    function tagMatcher($tag_name, $close_tags = null)
    {
        $closer = $tag_name;

        if ($close_tags) {
            $closer = sprintf("(?:%s)",
                              implode("|", array_merge(array($tag_name),
                                                       $close_tags)));
        }

        $expr = sprintf($this->_tag_expr, $tag_name, $closer);
        return sprintf("/%s/%s", $expr, $this->_re_flags);
    }

    /**
     * Returns the pattern matching the <html> element, which ends at an
     * html or body tag.
     *
     * @param mixed $str Unused; accepted only for backward compatibility.
     * @return string A pattern usable with the preg_* functions.
     */
    function htmlFind($str = null)
    {
        return $this->tagMatcher('html', array('body'));
    }

    /**
     * Returns the pattern matching the <head> element, which ends at a
     * head or body tag.
     *
     * @return string A pattern usable with the preg_* functions.
     */
    function headFind()
    {
        return $this->tagMatcher('head', array('body'));
    }

    /**
     * Returns the part of $subject that precedes the first match of
     * $pattern, or $subject itself when the pattern does not match.
     *
     * @param string $pattern Delimited regular expression.
     * @param string $subject Text to cut.
     * @return string
     */
    function _truncateAt($pattern, $subject)
    {
        $found = array();

        if (preg_match($pattern, $subject, $found, PREG_OFFSET_CAPTURE)) {
            return (string) substr($subject, 0, $found[0][1]);
        }

        return $subject;
    }

    /**
     * Extracts the attributes of every <meta> tag in the document's head.
     *
     * Comments, CDATA sections and scripts are removed first, then
     * everything from the first body tag onwards is discarded, so only
     * tags that really sit in the head are reported.
     *
     * @param string $html_string The HTML document.
     * @return array One associative array per <meta> tag, mapping the
     *               lower-cased attribute name to its unquoted,
     *               entity-decoded value. Empty when there is no head or
     *               no <meta> tag.
     */
    function getMetaTags($html_string)
    {
        // Work on the stripped text; fall back to the raw input only if
        // the regular expression engine fails.
        $stripped = preg_replace($this->_removed_re, "", $html_string);
        if ($stripped !== null) {
            $html_string = $stripped;
        }

        // The close-tag template is an undelimited fragment. Used as is,
        // PCRE would treat its angle brackets as delimiters and search
        // for the bare word "body".
        $body_closer = sprintf("/%s/%s",
                               sprintf($this->_close_tag_expr, 'body'),
                               $this->_re_flags);
        $html_string = $this->_truncateAt($body_closer, $html_string);

        // Also cut at a body tag that carries attributes.
        $html_string = $this->_truncateAt($this->tagMatcher('body'),
                                          $html_string);

        // Narrow to the html element when there is one.
        $html_matches = array();
        $html = $html_string;
        if (preg_match($this->tagMatcher('html', array('body')),
                       $html_string, $html_matches)) {
            $html = $html_matches[0];
        }

        $head_matches = array();
        if (!preg_match($this->headFind(), $html, $head_matches)) {
            return array();
        }

        $meta_matches = array();
        if (!preg_match_all($this->_meta_find, $head_matches[0],
                            $meta_matches)) {
            return array();
        }

        $meta_data = array();

        foreach ($meta_matches[0] as $meta) {
            $attr_matches = array();
            preg_match_all($this->_attr_find, $meta, $attr_matches);

            $attrs = array();
            foreach ($attr_matches[1] as $index => $name) {
                $attrs[strtolower($name)] = $this->replaceEntities(
                    $this->removeQuotes($attr_matches[2][$index]));
            }

            $meta_data[] = $attrs;
        }

        return $meta_data;
    }

    /**
     * Finds the location advertised through a <meta http-equiv> tag.
     *
     * @param string $html_string The HTML document.
     * @return string|null The "content" attribute of the first <meta> tag
     *                     whose http-equiv is X-XRDS-Location or
     *                     X-Yadis-Location (compared case-insensitively),
     *                     or null when no such tag exists.
     */
    function getHTTPEquiv($html_string)
    {
        $wanted = array('x-xrds-location', 'x-yadis-location');

        foreach ($this->getMetaTags($html_string) as $tag) {
            if (array_key_exists('http-equiv', $tag) &&
                array_key_exists('content', $tag) &&
                in_array(strtolower($tag['http-equiv']), $wanted, true)) {
                return $tag['content'];
            }
        }

        return null;
    }
}
00269
00270 ?>