1 /** 2 * Copyright: 3 * (C) 2016 Martin Brzenska 4 * 5 * License: 6 * Distributed under the terms of the MIT license. 7 * Consult the provided LICENSE.md file for details 8 */ 9 module libdominator.Dominator; 10 11 import std.conv : to; 12 13 import libdominator.Attribute; 14 import libdominator.Node; 15 16 version(unittest) { 17 import libdominator.Filter; 18 import std.file; 19 } 20 21 struct comment 22 { 23 size_t begin; 24 size_t end; 25 } 26 27 struct terminator 28 { 29 size_t position; 30 size_t length; 31 } 32 33 pragma(inline,true): 34 bool isBetween(in size_t needle, in size_t from, in size_t to) 35 { 36 return (needle > from && needle < to); 37 } 38 39 ///Parse, hierarchize, analyse xhtml 40 class Dominator 41 { 42 private string haystack; 43 private size_t needle; 44 private comment[] comments; 45 46 private Node[] nodes; 47 48 /** 49 Instantiate empty Dominator 50 */ 51 this() {} 52 53 /** 54 Instantiate object and load the Document 55 */ 56 this(string haystack) 57 { 58 this.load(haystack); 59 } 60 61 /** 62 loads a Document 63 Params: 64 haystack = the Document String 65 */ 66 public Dominator load(string haystack) 67 { 68 this.haystack = haystack; 69 this.parse(); 70 return this; 71 } 72 73 private bool tryElementOpener(ref Node node, ref size_t needle) { 74 import std.ascii : isWhite , isAlphaNum , isAlpha; 75 enum ParserStates : ubyte { 76 name = 1, 77 key = 2, 78 value = 4, 79 err = 8, 80 ready = 16 81 } 82 ubyte state = 0; 83 84 char inQuote = 0x00; 85 size_t nameCoord; 86 size_t[2] keyCoord, valCoord; 87 88 if(this.haystack[needle] != '<') { 89 return false; 90 } 91 node.setStartPosition(needle); 92 needle++; 93 94 /* 95 * parse the elements name 96 */ 97 //first, skip whitespaces 98 while(needle < this.haystack.length && this.haystack[needle].isWhite) { needle++; } 99 100 //The name begins with a underscore or a alphabetical character. 101 if( 102 ! this.haystack[needle].isAlpha 103 && ! this.haystack[needle] == '_' 104 ) { 105 return false; 106 } 107 nameCoord = needle; 108 109 //The name can contain letters, digits, hyphens, underscores, and periods 110 for(; needle < this.haystack.length && !this.haystack[needle].isWhite ; ++needle) { 111 if( 112 ! this.haystack[needle].isAlphaNum 113 && this.haystack[needle] != '-' 114 && this.haystack[needle] != '_' 115 && this.haystack[needle] != '.' 116 && this.haystack[needle] != ':' 117 ) { 118 if(this.haystack[needle] == '>') { 119 state |= ParserStates.ready; 120 break; 121 } else { 122 return false; 123 } 124 } 125 } 126 node.setTag(this.haystack[nameCoord..needle]); 127 state |= ParserStates.name; 128 129 /* 130 * Parse attributes 131 */ 132 while( ! (state & ParserStates.ready)) 133 { 134 //reset state 135 state &= ~(ParserStates.key | ParserStates.value); 136 137 //Check if the next non-whitespace char finishes our job here 138 while(needle < this.haystack.length && this.haystack[needle].isWhite){ needle++; } 139 if(this.haystack[needle] == '>') { 140 state |= ParserStates.ready; 141 break; 142 } 143 /* 144 * Find the attr-key 145 */ 146 keyCoord[0] = needle; 147 for(; needle < this.haystack.length && !this.haystack[needle].isWhite ; ++needle) { 148 if(this.haystack[needle] == '>') { 149 state |= ParserStates.ready; 150 break; 151 } 152 if(this.haystack[needle] == '=') { 153 break; 154 } 155 } 156 keyCoord[1] = needle; 157 158 if(state & ParserStates.ready) { 159 node.addAttribute( Attribute(this.haystack[keyCoord[0]..keyCoord[1]] , "" ) ); 160 } 161 else { 162 /** 163 * Parse attribute value if exist 164 */ 165 166 //skip whitespaces 167 while(needle < this.haystack.length && this.haystack[needle].isWhite) { needle++; } 168 169 if(this.haystack[needle] == '>') { 170 //there is no value and this is the end 171 state |= ParserStates.ready; 172 node.addAttribute( Attribute(this.haystack[keyCoord[0]..keyCoord[1]] , "" ) ); 173 break; 174 } 175 else if(this.haystack[needle] == '=') 176 { 177 //here comes the value... 178 needle++; 179 while(needle < this.haystack.length && this.haystack[needle].isWhite) { needle++; } 180 if(this.haystack[needle] == '"' || this.haystack[needle] == '\'' ) { 181 //quoted value 182 inQuote = this.haystack[needle]; 183 needle++; 184 valCoord[0] = needle; 185 while( 186 needle < this.haystack.length 187 && (this.haystack[needle] != inQuote 188 || ( this.haystack[needle] == inQuote && this.haystack[needle-1] == '\\')) 189 ) { 190 needle++; 191 } 192 valCoord[1] = needle; 193 needle++; 194 } 195 else { 196 //not quoted value 197 inQuote = 0x00; 198 valCoord[0] = needle; 199 while( 200 needle < this.haystack.length 201 && (!this.haystack[needle].isWhite 202 || ( this.haystack[needle].isWhite && this.haystack[needle-1] == '\\')) 203 ) { 204 if(this.haystack[needle] == '>') { 205 state |= ParserStates.ready; 206 break; 207 } 208 needle++; 209 } 210 if(state & ParserStates.ready) { 211 valCoord[1] = needle; 212 } 213 else { 214 valCoord[1] = needle; 215 needle++; 216 } 217 } 218 } 219 else { 220 //there is no value 221 } 222 if(keyCoord[1] >= this.haystack.length || valCoord[1] >= this.haystack.length) { 223 return false; 224 } 225 node.addAttribute(Attribute( 226 this.haystack[keyCoord[0]..keyCoord[1]], 227 this.haystack[valCoord[0]..valCoord[1]] 228 )); 229 } 230 } 231 node.setStartTagLength( 1 + needle - node.getStartPosition() ); 232 return true; 233 } 234 235 private bool tryElementTerminator(ref terminator[][string] terminators , ref size_t needle) { 236 import std.ascii : isWhite; 237 238 if( this.haystack[needle] != '<' ) {return false;} 239 240 size_t[2] nameCoord; 241 size_t position = needle; 242 243 for(needle = needle + 1; needle < this.haystack.length && this.haystack[needle].isWhite ; needle++) {} 244 if(this.haystack[needle] != '/') { return false; } 245 nameCoord[0] = 1 + needle; 246 for( 247 needle = needle + 1 ; 248 needle < this.haystack.length 249 && !this.haystack[needle].isWhite ; 250 needle++ 251 ) { 252 if(this.haystack[needle] == '>') { 253 terminators[this.haystack[nameCoord[0]..needle]] ~= terminator(position , 1 + needle - position); 254 return true; 255 } 256 } 257 nameCoord[1] = needle; 258 for(; needle < this.haystack.length && this.haystack[needle].isWhite ; needle++) {} 259 if(this.haystack[needle] == '>') { 260 terminators[this.haystack[nameCoord[0]..nameCoord[1]]] ~= terminator(position , 1 + needle - position); 261 return true; 262 } 263 return false; 264 } 265 private bool tryCommentOpener(ref size_t needle) { 266 if( 267 needle + 4 < this.haystack.length 268 && this.haystack[needle..needle+4] == "<!--" 269 ) { 270 needle = needle+4; 271 return true; 272 } 273 return false; 274 } 275 private bool tryCommentTerminator(ref size_t needle) { 276 if( 277 needle + 3 < this.haystack.length 278 && this.haystack[needle..needle+3] == "-->" 279 ) { 280 needle = needle+3; 281 return true; 282 } 283 return false; 284 } 285 private void parse() 286 { 287 import std.ascii : isWhite , isAlphaNum , isAlpha; 288 import std.array : appender; 289 290 //Reset parsed state 291 this.nodes = []; 292 this.comments = []; 293 294 if(this.haystack.length == 0) { return; } 295 Node[] nodes; 296 auto nodeAppender = appender(nodes); 297 terminator[][string] terminators; 298 size_t needleProbe; 299 enum ParserStates : ubyte { 300 inElementOpener = 1, 301 inComment = 2 302 } 303 ubyte state; 304 size_t needle; 305 306 do { 307 if(this.haystack[needle] == '<') { 308 309 needleProbe = needle; 310 if( this.tryElementTerminator(terminators , needleProbe) ) { 311 needle = needleProbe; 312 continue; 313 } 314 315 needleProbe = needle; 316 if( 317 !(state & ParserStates.inComment) 318 && this.tryCommentOpener(needleProbe) 319 ) { 320 needle = needleProbe; 321 state |= ParserStates.inComment; 322 continue; 323 } 324 325 needleProbe = needle; 326 Node elem = new Node(); 327 if( this.tryElementOpener(elem , needleProbe) ) { 328 329 330 needle = needleProbe; 331 if(state & ParserStates.inComment) {elem.isComment(true);} 332 nodeAppender.put(elem); 333 continue; 334 } 335 336 } 337 else if( 338 this.haystack[needle] == '-' 339 && state & ParserStates.inComment 340 ) { 341 needleProbe = needle; 342 if( this.tryCommentTerminator(needleProbe) ) { 343 needle = needleProbe; 344 state &= ~ParserStates.inComment; 345 continue; 346 } 347 } 348 349 needle++; 350 } while(needle < this.haystack.length); 351 352 this.nodes = nodeAppender.data; 353 this.hierarchize(terminators); 354 } 355 356 private void hierarchize(terminator[][string] terminators) 357 { 358 import std.algorithm.sorting : sort; 359 360 bool[size_t] arrTerminatorBlacklist; 361 foreach_reverse (Node node; this.nodes) 362 { 363 terminator _lastTerm = terminator(node.getStartPosition(), 0); 364 bool isTerminated = false; 365 366 if(node.getTag() in terminators) { 367 foreach_reverse (terminator terminatorCandi; terminators[node.getTag()]) 368 { 369 if (terminatorCandi.position in arrTerminatorBlacklist) 370 { 371 //skip if already checked and marked as a false candidate 372 continue; 373 } 374 if (node.getStartPosition() > terminatorCandi.position) 375 { 376 /* 377 * The candidates position is lower then the position of the node, for which we are searching the terminator. 378 * This means, that the last candidate, that we have checked, was the right one - if there was one. 379 */ 380 arrTerminatorBlacklist[_lastTerm.position] = true; 381 node.setEndPosition(_lastTerm.position).setEndTagLength(_lastTerm.length); 382 isTerminated = true; 383 break; 384 } 385 else 386 { 387 _lastTerm = terminatorCandi; 388 } 389 } 390 } 391 if (!isTerminated) 392 { 393 node.setEndPosition(_lastTerm.position).setEndTagLength(_lastTerm.length); 394 arrTerminatorBlacklist[_lastTerm.position] = true; 395 } 396 if (node.getEndTagLength == 0) 397 { 398 //No Terminator has been found - we assume, that this is a self-terminator 399 node.setEndTagLength(node.getStartTagLength()); 400 } 401 } 402 403 this.nodes.sort!("a.getStartPosition() < b.getStartPosition()"); 404 405 foreach (size_t i, Node node; this.nodes) 406 { 407 for (size_t back = 1; back <= i; back++) 408 { 409 if (this.nodes[i - back].getEndPosition() > node.getEndPosition()) 410 { 411 node.setParent(&this.nodes[i - back]); 412 node.getParent().addChild(&this.nodes[i]); 413 break; 414 } 415 } 416 } 417 } 418 419 /** 420 returns all found Nodes. 421 Please note, that also Nodes will be returned which was found in comments. 422 use isComment() to check if a Node is in a comment or use libdominator.Filter.filterComments() 423 424 returns: 425 Nodes[] 426 */ 427 public Node[] getNodes() 428 { 429 return this.nodes; 430 } 431 432 /** 433 gets the Tag Name of the Node 434 435 Params: 436 node = The Node to get the Tag Name from 437 438 Returns: 439 The Tag Name as string 440 */ 441 public string getStartElement(Node node) 442 { 443 return this.haystack[node.getStartPosition() .. ( 444 node.getStartPosition() + node.getStartTagLength())]; 445 } 446 447 /** 448 gets the part of the loaded Document from the nodes begining to its end 449 450 Params: 451 node = The Node from which you want to get the Document representation 452 */ 453 public string getElelment(Node node) 454 { 455 return this.haystack[node.getStartPosition() .. ( 456 node.getEndPosition() + node.getEndTagLength())]; 457 } 458 459 /** 460 * gets the Inner-HTML from the given node 461 * 462 * Params: 463 * node = The Node from which you want to get the Inner-HTML 464 */ 465 public string getInner(Node node) 466 { 467 return (node.getEndPosition() > (node.getStartPosition() + node.getStartTagLength())) ? this.haystack[( 468 node.getStartPosition() + node.getStartTagLength()) .. (node.getEndPosition())] : ""; 469 } 470 471 /** 472 * Removes tags and returns plain inner content 473 */ 474 public string stripTags(Node node) { 475 import std.algorithm.searching : any; 476 import std.array : appender; 477 auto inner = appender!string(); 478 Node[] descendants = node.getDescendants(); 479 for(size_t i = node.getStartPosition + node.getStartTagLength ; i < node.getEndPosition ; i++) { 480 if( ! 481 any!(desc => 482 isBetween(i , desc.getStartPosition()-1 , desc.getStartPosition()+desc.getStartTagLength()) 483 || isBetween(i , desc.getEndPosition()-1 , desc.getEndPosition()+desc.getEndTagLength()) 484 )(descendants) 485 ) { 486 inner.put(this.haystack[i]); 487 } 488 } 489 return inner.data; 490 } 491 492 /** 493 * Removes tags and returns plain inner content 494 */ 495 public string stripTags() { 496 if( ! this.nodes.length) { 497 return ""; 498 } 499 return this.stripTags((*this.nodes.ptr)); 500 } 501 /// 502 unittest { 503 const string content = `<div><h2>bla</h2><p>fasel</p></div>`; 504 Dominator dom = new Dominator(content); 505 assert( dom.stripTags() == "blafasel", dom.stripTags()); 506 } 507 } 508 509 unittest { 510 const string content = `<div data-function=">"> 511 <ol id="ol-1"> 512 <li id="li-1-ol-1">li-1-ol-1 Inner</li> 513 <li id="li-2-ol-1">li-2-ol-1 Inner</li> 514 <li id="li-3-ol-1">li-3-ol-1 Inner</li> 515 </ol> 516 </div>`; 517 Dominator dom = new Dominator(content); 518 assert( dom.getNodes.length == 5); 519 assert( dom.filterDom(DomFilter("ol")).length == 1 ); 520 assert( dom.filterDom(DomFilter("ol.li")).length == 3 ); 521 assert( dom.filterDom(DomFilter("ol.li{id:li-3-ol-1}")).length == 1 ); 522 } 523 524 /// get descendants of a specific Node and apply further filtering on the result. 525 unittest { 526 const string content = `<div data-function="<some weird> stuff"> 527 <span> 528 <span> 529 <span>bäm!</span> 530 </span> 531 <span>boing!</span> 532 </span> 533 <ol id="ol-1"> 534 <li id="li-1-ol-1">li-1-ol-1 Inner</li> 535 <li id="li-2-ol-1">li-2-ol-1 Inner</li> 536 <li id="li-3-ol-1">li-3-ol-1 Inner</li> 537 </ol> 538 </div>`; 539 540 Dominator dom = new Dominator(content); 541 Node [] descendants = (*dom.filterDom("div").ptr).getDescendants(); 542 assert( descendants.filterDom("span").length == 4 ); 543 assert( descendants.filterDom("li").length == 3 ); 544 assert( descendants.filterDom("ol").length == 1 ); 545 } 546 547 /// basic example 548 unittest { 549 const string html = 550 `<div> 551 <p>Here comes a list!</p> 552 <ul> 553 <li class="wanted">one</li> 554 <!-- <li>two</li> --> 555 <li class="wanted hard">three</li> 556 <li id="item-4">four</li> 557 <li checked>five</li> 558 <li id="item-6">six</li> 559 </ul> 560 <p>another list</p> 561 <ol> 562 <li>eins</li> 563 <li>zwei</li> 564 <li>drei</li> 565 </ol> 566 <p>have a nice day</p> 567 </div>`; 568 Dominator dom = new Dominator(html); 569 570 foreach(node ; dom.filterDom("ul.li")) { 571 //do something more usefull with the node then: 572 assert(node.getParent.getTag() == "ul"); 573 } 574 575 Node[] nodes = dom.filterDom("ul.li"); 576 assert(dom.getInner( nodes[0] ) == "one" ); 577 assert(nodes[0].getAttributes() == [ Attribute("class","wanted") ] , to!(string)(nodes[0].getAttributes()) ); 578 assert(Attribute("class","wanted").matches(nodes[0])); 579 assert(Attribute("class","wanted").matches(nodes[2])); 580 assert(Attribute("class",["wanted","hard"]).matches(nodes[2])); 581 assert(nodes[1].isComment()); 582 583 assert(dom.filterDom("ul.li").length == 6); 584 assert(dom.filterDom("ul.li").filterComments.length == 5); 585 assert(dom.filterDom("li").length == 9); 586 assert(dom.filterDom("li[1]").length == 1); //the first li in the dom 587 assert(dom.filterDom("*.li[1]").length == 2); //the first li in ul and first li in ol 588 assert(dom.getInner( (*dom.filterDom("*{checked:}").ptr) ) == "five"); 589 590 } 591 592 unittest { 593 Dominator dom = new Dominator(readText("dummy.html")); 594 auto filter = DomFilter("article"); 595 assert( dom.filterDom(filter).filterComments().length == 3 , to!(string)(dom.filterDom(filter).filterComments().length)); 596 assert( dom.filterDom(filter).length == 6); 597 598 assert( dom.filterDom("div.*.ol.li").length == 3 ); 599 assert( dom.filterDom("div.ol.li").length == 6 ); 600 assert( dom.filterDom("ol.li").length == 6 ); 601 assert( dom.filterDom(`ol.li{id:(regex)^li-[\d]+}`).length == 6 ); 602 assert( dom.filterDom(`ol{id:ol-1}.li{id:(regex)^li-[\d]+}`).length == 3 ); 603 assert( dom.filterDom(`*{checked:}`).length == 1 ); 604 assert( dom.filterDom(`onelinenested`).length == 2 ); 605 assert( dom.filterDom(`onelinenested{class:level1}`).length == 1 ); 606 assert( dom.filterDom(`onelinenested{class:level2}`).length == 1 ); 607 assert( dom.filterDom(`onelinenested.onelinenested`).length == 1 ); 608 609 /** 610 * Find nodes with a special href. 611 */ 612 filter = DomFilter(`*{href:https://www.google.com/support/contact/user?hl=en}`); 613 assert( dom.filterDom(filter).length); 614 foreach(Node foundNode ; dom.filterDom(filter)) { 615 assert (Attribute("href","https://www.google.com/support/contact/user?hl=en").matches(foundNode) ); 616 } 617 618 /** 619 * Find nodes with a special href - In HTML5 it is ok to have attribute-values without quotation marks. 620 */ 621 filter = DomFilter(`*{href://www.google.com/}`); 622 assert( dom.filterDom(filter).length); 623 foreach(Node foundNode ; dom.filterDom(filter)) { 624 assert( Attribute("href","//www.google.com/").matches(foundNode) ); 625 } 626 }