1 /** 2 * Copyright: 3 * (C) 2016 Martin Brzenska 4 * 5 * License: 6 * Distributed under the terms of the MIT license. 7 * Consult the provided LICENSE.md file for details 8 */ 9 module libdominator.Dominator; 10 11 import std.conv : to; 12 13 import libdominator.Attribute; 14 import libdominator.Node; 15 16 version(unittest) { 17 import libdominator.Filter; 18 import std.file; 19 } 20 21 struct comment 22 { 23 size_t begin; 24 size_t end; 25 } 26 27 struct terminator 28 { 29 size_t position; 30 size_t length; 31 } 32 33 bool isBetween(in size_t needle, in size_t from, in size_t to) 34 { 35 return (needle > from && needle < to); 36 } 37 38 ///Parse, hierarchize, analyse xhtml 39 class Dominator 40 { 41 private string haystack; 42 private size_t needle; 43 private comment[] comments; 44 45 private Node[] nodes; 46 47 /** 48 Instantiate empty Dominator 49 */ 50 this() {} 51 52 /** 53 Instantiate object and load the Document 54 */ 55 this(string haystack) 56 { 57 this.load(haystack); 58 } 59 60 /** 61 loads a Document 62 Params: 63 haystack = the Document String 64 */ 65 public Dominator load(string haystack) 66 { 67 this.haystack = haystack; 68 this.parse(); 69 return this; 70 } 71 72 private bool tryElementOpener(ref Node node, ref size_t needle) 73 { 74 import std.ascii : isWhite , isAlphaNum , isAlpha; 75 enum ParserStates : ubyte { 76 name = 1, 77 key = 2, 78 value = 4, 79 err = 8, 80 ready = 16 81 } 82 ubyte state = 0; 83 84 char inQuote = 0x00; 85 size_t nameCoord; 86 size_t[2] keyCoord, valCoord; 87 88 if(this.haystack[needle] != '<') { 89 return false; 90 } 91 node.setStartPosition(needle); 92 needle++; 93 94 /* 95 * parse the elements name 96 */ 97 //first, skip whitespaces 98 while(needle < this.haystack.length && this.haystack[needle].isWhite) { needle++; } 99 100 //The name begins with a underscore or a alphabetical character. 101 if( 102 ! this.haystack[needle].isAlpha 103 && ! this.haystack[needle] == '_' 104 ) { 105 return false; 106 } 107 nameCoord = needle; 108 109 //The name can contain letters, digits, hyphens, underscores, and periods 110 for(; needle < this.haystack.length && !this.haystack[needle].isWhite ; ++needle) { 111 if( 112 ! this.haystack[needle].isAlphaNum 113 && this.haystack[needle] != '-' 114 && this.haystack[needle] != '_' 115 && this.haystack[needle] != '.' 116 && this.haystack[needle] != ':' 117 ) { 118 if(this.haystack[needle] == '>') { 119 state |= ParserStates.ready; 120 break; 121 } else { 122 return false; 123 } 124 } 125 } 126 node.setTag(this.haystack[nameCoord..needle]); 127 state |= ParserStates.name; 128 129 /* 130 * Parse attributes 131 */ 132 while( ! (state & ParserStates.ready)) 133 { 134 //reset state 135 state &= ~(ParserStates.key | ParserStates.value); 136 137 //Check if the next non-whitespace char finishes our job here 138 while(needle < this.haystack.length && this.haystack[needle].isWhite){ needle++; } 139 if(this.haystack[needle] == '>') { 140 state |= ParserStates.ready; 141 break; 142 } 143 /* 144 * Find the attr-key 145 */ 146 keyCoord[0] = needle; 147 for(; needle < this.haystack.length && !this.haystack[needle].isWhite ; ++needle) { 148 if(this.haystack[needle] == '>') { 149 state |= ParserStates.ready; 150 break; 151 } 152 if(this.haystack[needle] == '=') { 153 break; 154 } 155 } 156 keyCoord[1] = needle; 157 158 if(state & ParserStates.ready) { 159 node.addAttribute( Attribute(this.haystack[keyCoord[0]..keyCoord[1]] , "" ) ); 160 } 161 else { 162 /** 163 * Parse attribute value if exist 164 */ 165 166 //skip whitespaces 167 while(needle < this.haystack.length && this.haystack[needle].isWhite) { needle++; } 168 169 if(this.haystack[needle] == '>') { 170 //there is no value and this is the end 171 state |= ParserStates.ready; 172 node.addAttribute( Attribute(this.haystack[keyCoord[0]..keyCoord[1]] , "" ) ); 173 break; 174 } 175 else if(this.haystack[needle] == '=') 176 { 177 //here comes the value... 178 needle++; 179 while(needle < this.haystack.length && this.haystack[needle].isWhite) { needle++; } 180 if(this.haystack[needle] == '"' || this.haystack[needle] == '\'' ) { 181 //quoted value 182 inQuote = this.haystack[needle]; 183 needle++; 184 valCoord[0] = needle; 185 while( 186 needle < this.haystack.length 187 && (this.haystack[needle] != inQuote 188 || ( this.haystack[needle] == inQuote && this.haystack[needle-1] == '\\')) 189 ) { 190 needle++; 191 } 192 valCoord[1] = needle; 193 needle++; 194 } 195 else { 196 //not quoted value 197 inQuote = 0x00; 198 valCoord[0] = needle; 199 while( 200 needle < this.haystack.length 201 && (!this.haystack[needle].isWhite 202 || ( this.haystack[needle].isWhite && this.haystack[needle-1] == '\\')) 203 ) { 204 if(this.haystack[needle] == '>') { 205 state |= ParserStates.ready; 206 break; 207 } 208 needle++; 209 } 210 if(state & ParserStates.ready) { 211 valCoord[1] = needle; 212 } 213 else { 214 valCoord[1] = needle; 215 needle++; 216 } 217 } 218 } 219 else { 220 //there is no value 221 } 222 if(keyCoord[1] >= this.haystack.length || valCoord[1] >= this.haystack.length) { 223 return false; 224 } 225 node.addAttribute(Attribute( 226 this.haystack[keyCoord[0]..keyCoord[1]], 227 this.haystack[valCoord[0]..valCoord[1]] 228 )); 229 } 230 } 231 node.setStartTagLength( 1 + needle - node.getStartPosition() ); 232 return true; 233 } 234 235 private bool tryElementTerminator(ref terminator[][string] terminators , ref size_t needle) { 236 import std.ascii : isWhite; 237 import std.uni : toLower; 238 if( this.haystack[needle] != '<' ) {return false;} 239 240 size_t[2] nameCoord; 241 size_t position = needle; 242 243 for(needle = needle + 1; needle < this.haystack.length && this.haystack[needle].isWhite ; needle++) {} 244 if(this.haystack[needle] != '/') { return false; } 245 nameCoord[0] = 1 + needle; 246 for( 247 needle = needle + 1 ; 248 needle < this.haystack.length 249 && !this.haystack[needle].isWhite ; 250 needle++ 251 ) { 252 if(this.haystack[needle] == '>') { 253 terminators[this.haystack[nameCoord[0]..needle].toLower] ~= terminator(position , 1 + needle - position); 254 return true; 255 } 256 } 257 nameCoord[1] = needle; 258 for(; needle < this.haystack.length && this.haystack[needle].isWhite ; needle++) {} 259 if(this.haystack[needle] == '>') { 260 terminators[this.haystack[nameCoord[0]..nameCoord[1]].toLower] ~= terminator(position , 1 + needle - position); 261 return true; 262 } 263 return false; 264 } 265 266 private bool tryCommentOpener(ref size_t needle) { 267 if( 268 needle + 4 < this.haystack.length 269 && this.haystack[needle..needle+4] == "<!--" 270 ) { 271 needle = needle+4; 272 return true; 273 } 274 return false; 275 } 276 private bool tryCommentTerminator(ref size_t needle) { 277 if( 278 needle + 3 < this.haystack.length 279 && this.haystack[needle..needle+3] == "-->" 280 ) { 281 needle = needle+3; 282 return true; 283 } 284 return false; 285 } 286 private void parse() 287 { 288 import std.ascii : isWhite , isAlphaNum , isAlpha; 289 import std.array : appender; 290 291 //Reset parsed state 292 this.nodes = []; 293 this.comments = []; 294 295 if(this.haystack.length == 0) { return; } 296 Node[] nodes; 297 auto nodeAppender = appender(nodes); 298 terminator[][string] terminators; 299 size_t needleProbe; 300 enum ParserStates : ubyte { 301 inElementOpener = 1, 302 inComment = 2 303 } 304 ubyte state; 305 size_t needle; 306 307 do { 308 if(this.haystack[needle] == '<') { 309 310 needleProbe = needle; 311 if( this.tryElementTerminator(terminators , needleProbe) ) 312 { 313 needle = needleProbe; 314 continue; 315 } 316 317 needleProbe = needle; 318 if( 319 !(state & ParserStates.inComment) 320 && this.tryCommentOpener(needleProbe) 321 ) { 322 needle = needleProbe; 323 state |= ParserStates.inComment; 324 continue; 325 } 326 327 needleProbe = needle; 328 Node elem = new Node(); 329 if( this.tryElementOpener(elem , needleProbe) ) 330 { 331 needle = needleProbe; 332 if(state & ParserStates.inComment) {elem.isComment(true);} 333 nodeAppender.put(elem); 334 continue; 335 } 336 337 } 338 else if( 339 this.haystack[needle] == '-' 340 && state & ParserStates.inComment 341 ) { 342 needleProbe = needle; 343 if( this.tryCommentTerminator(needleProbe) ) { 344 needle = needleProbe; 345 state &= ~ParserStates.inComment; 346 continue; 347 } 348 } 349 350 needle++; 351 } while(needle < this.haystack.length); 352 353 this.nodes = nodeAppender.data; 354 this.hierarchize(terminators); 355 } 356 357 private void hierarchize(terminator[][string] terminators) 358 { 359 import std.algorithm.sorting : sort; 360 import std.uni : toLower; 361 362 bool[size_t] arrTerminatorBlacklist; 363 foreach_reverse (Node node; this.nodes) 364 { 365 terminator _lastTerm = terminator(node.getStartPosition(), 0); 366 bool isTerminated = false; 367 368 if(node.getTag().toLower in terminators) { 369 foreach_reverse (terminator terminatorCandi; terminators[ node.getTag().toLower ]) 370 { 371 if (terminatorCandi.position in arrTerminatorBlacklist) 372 { 373 //skip if already checked and marked as a false candidate 374 continue; 375 } 376 if (node.getStartPosition() > terminatorCandi.position) 377 { 378 /* 379 * The candidates position is lower then the position of the node, for which we are searching the terminator. 380 * This means, that the last candidate, that we have checked, was the right one - if there was one. 381 */ 382 arrTerminatorBlacklist[_lastTerm.position] = true; 383 node.setEndPosition(_lastTerm.position).setEndTagLength(_lastTerm.length); 384 isTerminated = true; 385 break; 386 } 387 else 388 { 389 _lastTerm = terminatorCandi; 390 } 391 } 392 } 393 if (!isTerminated) 394 { 395 node.setEndPosition(_lastTerm.position).setEndTagLength(_lastTerm.length); 396 arrTerminatorBlacklist[_lastTerm.position] = true; 397 } 398 if (node.getEndTagLength == 0) 399 { 400 //No Terminator has been found - we assume, that this is a self-terminator 401 node.setEndTagLength(node.getStartTagLength()); 402 } 403 } 404 405 this.nodes.sort!("a.getStartPosition() < b.getStartPosition()"); 406 407 foreach (size_t i, Node node; this.nodes) 408 { 409 for (size_t back = 1; back <= i; back++) 410 { 411 if (this.nodes[i - back].getEndPosition() > node.getEndPosition()) 412 { 413 node.setParent(&this.nodes[i - back]); 414 node.getParent().addChild(&this.nodes[i]); 415 break; 416 } 417 } 418 } 419 } 420 421 /** 422 returns all found Nodes. 423 Please note, that also Nodes will be returned which was found in comments. 424 use isComment() to check if a Node is in a comment or use libdominator.Filter.filterComments() 425 426 returns: 427 Nodes[] 428 */ 429 public Node[] getNodes() 430 { 431 return this.nodes; 432 } 433 434 /** 435 gets the Tag Name of the Node 436 437 Params: 438 node = The Node to get the Tag Name from 439 440 Returns: 441 The Tag Name as string 442 */ 443 public string getStartElement(Node node) 444 { 445 return this.haystack[node.getStartPosition() .. ( 446 node.getStartPosition() + node.getStartTagLength())]; 447 } 448 449 /** 450 gets the part of the loaded Document from the nodes begining to its end 451 452 Params: 453 node = The Node from which you want to get the Document representation 454 */ 455 public string getElement(Node node) 456 { 457 return this.haystack[node.getStartPosition() .. ( 458 node.getEndPosition() + node.getEndTagLength())]; 459 } 460 461 /** 462 * gets the Inner-HTML from the given node 463 * 464 * Params: 465 * node = The Node from which you want to get the Inner-HTML 466 */ 467 public string getInner(Node node) 468 { 469 if ( node.getEndPosition() > (node.getStartPosition() + node.getStartTagLength()) ) 470 { 471 return this.haystack[(node.getStartPosition() + node.getStartTagLength())..(node.getEndPosition())]; 472 } 473 return ""; 474 } 475 476 /** 477 * Removes tags and returns plain inner content 478 */ 479 public string stripTags(Node node) { 480 import std.algorithm.searching : any; 481 import std.array : appender; 482 auto inner = appender!string(); 483 Node[] descendants = node.getDescendants(); 484 for(size_t i = node.getStartPosition + node.getStartTagLength ; i < node.getEndPosition ; i++) { 485 if( ! 486 any!(desc => 487 isBetween(i , desc.getStartPosition()-1 , desc.getStartPosition()+desc.getStartTagLength()) 488 || isBetween(i , desc.getEndPosition()-1 , desc.getEndPosition()+desc.getEndTagLength()) 489 )(descendants) 490 ) { 491 inner.put(this.haystack[i]); 492 } 493 } 494 return inner.data; 495 } 496 497 /** 498 * Removes tags and returns plain inner content 499 */ 500 public string stripTags() { 501 if( ! this.nodes.length) { 502 return ""; 503 } 504 return this.stripTags((*this.nodes.ptr)); 505 } 506 /// 507 unittest { 508 const string content = `<div><h2>bla</h2><p>fasel</p></div>`; 509 Dominator dom = new Dominator(content); 510 assert( dom.stripTags() == "blafasel", dom.stripTags()); 511 } 512 } 513 514 unittest { 515 const string content = `<div data-function=">"> 516 <ol id="ol-1"> 517 <li id="li-1-ol-1">li-1-ol-1 Inner</li> 518 <li id="li-2-ol-1">li-2-ol-1 Inner</li> 519 <li id="li-3-ol-1">li-3-ol-1 Inner</li> 520 </ol> 521 </div>`; 522 Dominator dom = new Dominator(content); 523 assert( dom.getNodes.length == 5); 524 assert( dom.filterDom(DomFilter("ol")).length == 1 ); 525 assert( dom.filterDom(DomFilter("ol.li")).length == 3 ); 526 assert( dom.filterDom(DomFilter("ol.li{id:li-3-ol-1}")).length == 1 ); 527 } 528 529 /// get descendants of a specific Node and apply further filtering on the result. 530 unittest { 531 const string content = `<div data-function="<some weird> stuff"> 532 <span> 533 <span> 534 <span>bäm!</span> 535 </span> 536 <span>boing!</span> 537 </span> 538 <ol id="ol-1"> 539 <li id="li-1-ol-1">li-1-ol-1 Inner</li> 540 <li id="li-2-ol-1">li-2-ol-1 Inner</li> 541 <li id="li-3-ol-1">li-3-ol-1 Inner</li> 542 </ol> 543 </div>`; 544 545 Dominator dom = new Dominator(content); 546 Node [] descendants = (*dom.filterDom("div").ptr).getDescendants(); 547 assert( descendants.filterDom("span").length == 4 ); 548 assert( descendants.filterDom("li").length == 3 ); 549 assert( descendants.filterDom("ol").length == 1 ); 550 } 551 552 /// basic example 553 unittest { 554 const string html = 555 `<div> 556 <p>Here comes a list!</p> 557 <ul> 558 <li class="wanted">one</li> 559 <!-- <li>two</li> --> 560 <li class="wanted hard">three</li> 561 <li id="item-4">four</li> 562 <li checked>five</li> 563 <li id="item-6">six</li> 564 </ul> 565 <p>another list</p> 566 <ol> 567 <li>eins</li> 568 <li>zwei</li> 569 <li>drei</li> 570 </ol> 571 <p>have a nice day</p> 572 </div>`; 573 Dominator dom = new Dominator(html); 574 575 foreach(node ; dom.filterDom("ul.li")) { 576 //do something more usefull with the node then: 577 assert(node.getParent.getTag() == "ul"); 578 } 579 580 Node[] nodes = dom.filterDom("ul.li"); 581 assert(dom.getInner( nodes[0] ) == "one" ); 582 assert(nodes[0].getAttributes() == [ Attribute("class","wanted") ] , to!(string)(nodes[0].getAttributes()) ); 583 assert(Attribute("class","wanted").matches(nodes[0])); 584 assert(Attribute("class","wanted").matches(nodes[2])); 585 assert(Attribute("class",["wanted","hard"]).matches(nodes[2])); 586 assert(nodes[1].isComment()); 587 588 assert(dom.filterDom("ul.li").length == 6); 589 assert(dom.filterDom("ul.li").filterComments.length == 5); 590 assert(dom.filterDom("li").length == 9); 591 assert(dom.filterDom("li[1]").length == 1); //the first li in the dom 592 assert(dom.filterDom("*.li[1]").length == 2); //the first li in ul and first li in ol 593 assert(dom.getInner( (*dom.filterDom("*{checked:}").ptr) ) == "five"); 594 595 } 596 597 unittest { 598 Dominator dom = new Dominator(readText("tests/dummy.html")); 599 auto filter = DomFilter("article"); 600 assert( dom.filterDom(filter).filterComments().length == 3 , to!(string)(dom.filterDom(filter).filterComments().length)); 601 assert( dom.filterDom(filter).length == 6); 602 603 assert( dom.filterDom("div.*.ol.li").length == 3 ); 604 assert( dom.filterDom("div.ol.li").length == 6 ); 605 assert( dom.filterDom("ol.li").length == 6 ); 606 assert( dom.filterDom(`ol.li{id:(regex)^li-[\d]+}`).length == 6 ); 607 assert( dom.filterDom(`ol{id:ol-1}.li{id:(regex)^li-[\d]+}`).length == 3 ); 608 assert( dom.filterDom(`*{checked:}`).length == 1 ); 609 assert( dom.filterDom(`onelinenested`).length == 2 ); 610 assert( dom.filterDom(`onelinenested{class:level1}`).length == 1 ); 611 assert( dom.filterDom(`onelinenested{class:level2}`).length == 1 ); 612 assert( dom.filterDom(`onelinenested.onelinenested`).length == 1 ); 613 614 /** 615 * Find nodes with a special href. 616 */ 617 filter = DomFilter(`*{href:https://www.google.com/support/contact/user?hl=en}`); 618 assert( dom.filterDom(filter).length); 619 foreach(Node foundNode ; dom.filterDom(filter)) { 620 assert (Attribute("href","https://www.google.com/support/contact/user?hl=en").matches(foundNode) ); 621 } 622 623 /** 624 * Find nodes with a special href - In HTML5 it is ok to have attribute-values without quotation marks. 625 */ 626 filter = DomFilter(`*{href://www.google.com/}`); 627 assert( dom.filterDom(filter).length); 628 foreach(Node foundNode ; dom.filterDom(filter)) { 629 assert( Attribute("href","//www.google.com/").matches(foundNode) ); 630 } 631 } 632 /* 633 * trouble with uppercase tags 634 */ 635 unittest 636 { 637 Dominator dom = new Dominator(readText("tests/dummy.html")); 638 foreach(node ; dom.filterDom("scpdurl")) 639 { 640 assert( dom.getInner(node) == "/timeSCPD.xml" ); 641 } 642 }