1 /** 2 * Copyright: 3 * (C) 2016 Martin Brzenska 4 * 5 * License: 6 * Distributed under the terms of the MIT license. 7 * Consult the provided LICENSE.md file for details 8 */ 9 module libdominator.Dominator; 10 11 import std.regex : Regex , regex, matchAll , ctRegex; 12 import std.conv : to; 13 14 import libdominator.Attribute; 15 import libdominator.Node; 16 17 static Regex!char rNodeHead; 18 static Regex!char rAttrib; 19 static Regex!char rComment; 20 static this() { 21 rNodeHead = ctRegex!(`<[\s]*([\w\d-]+)`, "i"); 22 rAttrib = ctRegex!(`([\w\d-_]+)(?:=((")(?:\\"|[^"])*"|(')(?:\\'|[^'])*'|(?:\\[\s]|[^\s])*([\s])*))?`, "i"); 23 rComment = ctRegex!(`<!--.*?-->`, "s"); 24 } 25 26 struct comment 27 { 28 uint begin; 29 uint end; 30 } 31 32 struct terminator 33 { 34 uint position; 35 ushort length; 36 } 37 38 bool isBetween(size_t needle, size_t from, size_t to) 39 { 40 return (needle > from && needle < to); 41 } 42 43 ///Parse, hierarchize, analyse xhtml 44 class Dominator 45 { 46 //private auto rComment = regex(`<!--.*?-->`, "s"); 47 private string haystack; 48 private comment[] comments; 49 50 private Node[] nodes; 51 52 /** 53 Instantiate empty Dominator 54 */ 55 this() {} 56 57 /** 58 Instantiate object and load the Document 59 */ 60 this(string haystack) 61 { 62 this.load(haystack); 63 } 64 65 /** 66 loads a Document 67 Params: 68 haystack = the Document String 69 */ 70 public Dominator load(string haystack) 71 { 72 this.haystack = haystack; 73 this.parse(); 74 return this; 75 } 76 77 private size_t findNodesLenth(size_t nodeHeadPos) { 78 size_t len; 79 bool hasQuoteMarkFailure; 80 bool openSingleQuoteMark; 81 bool openDoubleQuoteMark; 82 char inQuote = 0x00; 83 while(this.haystack.length > nodeHeadPos + len) 84 { 85 if( 86 this.haystack[nodeHeadPos + len] == '>' 87 && inQuote == 0x00 88 ) { 89 return 1+len; 90 } 91 len++; 92 if( 93 this.haystack[nodeHeadPos + len] == inQuote 94 && this.haystack[nodeHeadPos + len -1] != '\\' 95 ) { 96 inQuote = 0x00; 97 } 98 else if( 99 inQuote == 0x00 100 && (this.haystack[nodeHeadPos + len] == '\'' 101 || this.haystack[nodeHeadPos + len] == '"') 102 ) { 103 inQuote = this.haystack[nodeHeadPos + len]; 104 } 105 } 106 return 1+len; //we should never get here 107 } 108 109 private void parse() 110 { 111 import std.string : chomp, chompPrefix; 112 foreach (mComment; matchAll(this.haystack, rComment)) 113 { 114 this.comments ~= comment(to!uint(mComment.pre().length), 115 to!uint(mComment.pre().length) + to!uint(mComment.front.length)); 116 } 117 terminator[][string] terminators; 118 foreach (mNode; matchAll(this.haystack, rNodeHead)) 119 { 120 auto node = new Node(); 121 /* 122 * We have a good candidate for a opening node. 123 * We know the beginning position (and how long the "head" is) 124 * Next, we need to find out where the opener ends. 125 */ 126 mNode.popFront(); 127 node.setTag(mNode.front) 128 .setStartPosition(mNode.pre().length); 129 node.setStartTagLength( 130 node.getTag().length + this.findNodesLenth( node.getTag().length + node.getStartPosition() ) 131 ); 132 133 //check if this node is inside of a comment - if yes, mark it. 134 foreach (comment cmnt; this.comments) 135 { 136 if (isBetween(node.getStartPosition(), cmnt.begin, cmnt.end)) 137 { 138 node.isComment(true); 139 } 140 } 141 142 //parse the attributes, if there are one or more 143 if( 144 node.getStartPosition + mNode.hit().length + 1 145 < 146 node.getStartPosition + node.getStartTagLength() -1 147 ) 148 { 149 foreach ( 150 mAttrib; 151 matchAll( 152 this.haystack[ 153 node.getStartPosition + mNode.hit().length + 1 154 .. 155 node.getStartPosition + node.getStartTagLength() -1 156 ], 157 rAttrib 158 ) 159 ) 160 { 161 node.addAttribute( 162 Attribute( 163 mAttrib[1], 164 chompPrefix( 165 chomp( 166 mAttrib[2], 167 mAttrib[3] ~ mAttrib[4] ~ mAttrib[5] 168 ), 169 mAttrib[3] ~ mAttrib[4] ~ mAttrib[5] 170 ) 171 ) 172 ); 173 } 174 } 175 this.addNode(node); 176 177 //search Terminator Candidates 178 if (node.getTag() !in terminators) 179 { 180 foreach (mTerminatorCandi; matchAll(this.haystack[node.getStartPosition() .. $], 181 regex(`<[\s]*/` ~ node.getTag() ~ `[\s]*>`,"i"))) 182 { 183 terminators[node.getTag()] ~= terminator(node.getStartPosition() + to!uint(mTerminatorCandi.pre() 184 .length), to!ushort(mTerminatorCandi.front.length)); 185 } 186 if (node.getTag() !in terminators) 187 { 188 terminators[node.getTag()] ~= [terminator(node.getStartPosition(), 0)]; 189 } 190 } 191 } 192 this.hierarchize(terminators); 193 } 194 195 private void addNode(Node node) 196 { 197 this.nodes ~= node; 198 } 199 200 private void hierarchize(terminator[][string] terminators) 201 { 202 import std.algorithm : sort; 203 204 bool[size_t] arrTerminatorBlacklist; 205 foreach_reverse (Node node; this.nodes) 206 { 207 terminator _lastTerm = terminator(node.getStartPosition(), 0); 208 bool isTerminated = false; 209 foreach_reverse (terminator terminatorCandi; terminators[node.getTag()]) 210 { 211 if (terminatorCandi.position in arrTerminatorBlacklist) 212 { 213 continue; 214 } 215 if (node.getStartPosition() > terminatorCandi.position) 216 { 217 arrTerminatorBlacklist[_lastTerm.position] = true; 218 node.setEndPosition(_lastTerm.position).setEndTagLength(0); 219 isTerminated = true; 220 break; 221 } 222 else 223 { 224 _lastTerm = terminatorCandi; 225 } 226 } 227 if (!isTerminated) 228 { 229 node.setEndPosition(_lastTerm.position).setEndTagLength(_lastTerm.length); 230 arrTerminatorBlacklist[_lastTerm.position] = true; 231 } 232 if (node.getEndTagLength == 0) 233 { 234 //No Terminator has been found - we assume, that this is a self-terminator 235 node.setEndTagLength(node.getStartTagLength()); 236 } 237 } 238 239 Node[] sortedNodes; 240 foreach (node; sort!"a.getStartPosition() < b.getStartPosition()"(this.nodes)) 241 { 242 sortedNodes ~= node; 243 } 244 this.nodes = sortedNodes.dup; 245 delete sortedNodes; 246 247 foreach (size_t i, Node node; this.nodes) 248 { 249 for (size_t back = 1; back <= i; back++) 250 { 251 if (this.nodes[i - back].getEndPosition() > node.getEndPosition()) 252 { 253 node.setParent(&this.nodes[i - back]); 254 node.getParent().addChild(&this.nodes[i]); 255 break; 256 } 257 } 258 } 259 } 260 261 /** 262 returns all found Nodes. 263 Please note, that also Nodes will be returned which was found in comments. 264 use isComment() to check if a Node is in a comment or use libdominator.Filter.filterComments() 265 266 returns: 267 Nodes[] 268 */ 269 public Node[] getNodes() 270 { 271 return this.nodes; 272 } 273 274 /** 275 gets the Tag Name of the Node 276 277 Params: 278 node = The Node to get the Tag Name from 279 280 Returns: 281 The Tag Name as string 282 */ 283 public string getStartElement(Node node) 284 { 285 return this.haystack[node.getStartPosition() .. ( 286 node.getStartPosition() + node.getStartTagLength())]; 287 } 288 289 /** 290 gets the part of the loaded Document from the nodes begining to its end 291 292 Params: 293 node = The Node from which you want to get the Document representation 294 */ 295 public string getElelment(Node node) 296 { 297 return this.haystack[node.getStartPosition() .. ( 298 node.getEndPosition() + node.getEndTagLength())]; 299 } 300 301 /** 302 gets the Inner-HTML from the given node 303 304 Params: 305 node = The Node from which you want to get the Inner-HTML 306 */ 307 public string getInner(Node node) 308 { 309 return (node.getEndPosition() > (node.getStartPosition() + node.getStartTagLength())) ? this.haystack[( 310 node.getStartPosition() + node.getStartTagLength()) .. (node.getEndPosition())] : ""; 311 } 312 } 313 314 version(unittest) { 315 import libdominator.Filter; 316 import std.file; 317 } 318 319 unittest { 320 const string content = `<div data-function=">"> 321 <ol id="ol-1"> 322 <li id="li-1-ol-1">li-1-ol-1 Inner</li> 323 <li id="li-2-ol-1">li-2-ol-1 Inner</li> 324 <li id="li-3-ol-1">li-3-ol-1 Inner</li> 325 </ol> 326 </div>`; 327 Dominator dom = new Dominator(content); 328 assert( dom.getNodes.length == 5); 329 assert( dom.filterDom(DomFilter("ol")).length == 1 ); 330 assert( dom.filterDom(DomFilter("ol.li")).length == 3 ); 331 assert( dom.filterDom(DomFilter("ol.li{id:li-3-ol-1}")).length == 1 ); 332 } 333 334 unittest { 335 import std.conv : to; 336 Dominator dom = new Dominator(readText("dummy.html")); 337 auto filter = DomFilter("article"); 338 assert( dom.filterDom(filter).filterComments().length == 3 , to!(string)(dom.filterDom(filter).filterComments().length) ); 339 assert( dom.filterDom(filter).length == 6 , to!(string)(dom.filterDom(filter).length)); 340 341 filter = DomFilter("div.*.ol.li"); 342 assert( dom.filterDom(filter).length == 3 ); 343 344 filter = DomFilter("div.ol.li"); 345 assert( dom.filterDom(filter).length == 6 ); 346 347 filter = DomFilter("ol.li"); 348 assert( dom.filterDom(filter).length == 6 ); 349 350 filter = DomFilter(`ol.li{id:(regex)^li-[\d]+}`); 351 assert( dom.filterDom(filter).length == 6 ); 352 353 filter = DomFilter(`ol{id:ol-1}.li{id:(regex)^li-[\d]+}`); 354 assert( dom.filterDom(filter).length == 3 ); 355 356 filter = DomFilter(`*{checked:}`); 357 assert( dom.filterDom(filter).length == 1 ); 358 359 filter = DomFilter(`onelinenested`); 360 assert( dom.filterDom(filter).length == 2 ); 361 362 filter = DomFilter(`onelinenested{class:level1}`); 363 assert( dom.filterDom(filter).length == 1 ); 364 365 filter = DomFilter(`onelinenested{class:level2}`); 366 assert( dom.filterDom(filter).length == 1 ); 367 368 filter = DomFilter(`onelinenested.onelinenested`); 369 assert( dom.filterDom(filter).length == 1 ); 370 371 /** 372 * Find the nodes with a special href. 373 */ 374 filter = DomFilter(`*{href:https://www.google.com/support/contact/user?hl=en}`); 375 assert( dom.filterDom(filter).length); 376 foreach(Node foundNode ; dom.filterDom(filter)) { 377 assert (Attribute("href","https://www.google.com/support/contact/user?hl=en").matches(foundNode) ); 378 } 379 380 /** 381 * Find the nodes with a special href - In HTML5 it is ok to have attribute-values without quotation marks. 382 */ 383 filter = DomFilter(`*{href://www.google.com/}`); 384 assert( dom.filterDom(filter).length); 385 foreach(Node foundNode ; dom.filterDom(filter)) { 386 assert( Attribute("href","//www.google.com/").matches(foundNode) ); 387 } 388 }