1 /**
2  * Copyright:
3  * (C) 2016 Martin Brzenska
4  *
5  * License:
6  * Distributed under the terms of the MIT license.
7  * Consult the provided LICENSE.md file for details
8  */
9 module libdominator.Dominator;
10 
11 import std.conv : to;
12 
13 import libdominator.Attribute;
14 import libdominator.Node;
15 
16 version(unittest) {
17     import libdominator.Filter;
18     import std.file;
19 }
20 
21 struct comment
22 {
23     size_t begin;
24     size_t end;
25 }
26 
27 struct terminator
28 {
29     size_t position;
30     size_t length;
31 }
32 
33 bool isBetween(in size_t needle, in size_t from, in size_t to)
34 {
35     return (needle > from && needle < to);
36 }
37 
38 ///Parse, hierarchize, analyse xhtml
39 class Dominator
40 {
41     private string haystack;
42     private size_t needle;
43     private comment[] comments;
44 
45     private Node[] nodes;
46 
47     /**
48      Instantiate empty Dominator
49     */
50     this() {}
51 
52     /**
53      Instantiate object and load the Document
54     */
55     this(string haystack)
56     {
57         this.load(haystack);
58     }
59 
60     /**
61      loads a Document
62      Params:
63        haystack = the Document String
64     */
65     public Dominator load(string haystack)
66     {
67         this.haystack = haystack;
68         this.parse();
69         return this;
70     }
71 
72     private bool tryElementOpener(ref Node node, ref size_t needle)
73     {
74         import std.ascii : isWhite , isAlphaNum , isAlpha;
75         enum ParserStates : ubyte {
76             name = 1,
77             key = 2,
78             value = 4,
79             err = 8,
80             ready = 16
81         }
82         ubyte state = 0;
83 
84         char inQuote = 0x00;
85         size_t nameCoord;
86         size_t[2] keyCoord, valCoord;
87 
88         if(this.haystack[needle] != '<') {
89             return false;
90         }
91         node.setStartPosition(needle);
92         needle++;
93 
94         /*
95         * parse the elements name
96         */
97         //first, skip whitespaces
98         while(needle < this.haystack.length && this.haystack[needle].isWhite) { needle++; }
99 
100         //The name begins with a underscore or a alphabetical character.
101         if(
102             ! this.haystack[needle].isAlpha
103             && ! this.haystack[needle] == '_'
104         ) {
105             return false;
106         }
107         nameCoord = needle;
108 
109         //The name can contain letters, digits, hyphens, underscores, and periods
110         for(; needle < this.haystack.length && !this.haystack[needle].isWhite ; ++needle) {
111             if(
112                 ! this.haystack[needle].isAlphaNum
113                 &&  this.haystack[needle] != '-'
114                 &&  this.haystack[needle] != '_'
115                 &&  this.haystack[needle] != '.'
116                 &&  this.haystack[needle] != ':'
117             ) {
118                 if(this.haystack[needle] == '>') {
119                     state |= ParserStates.ready;
120                     break;
121                 } else {
122                     return false;
123                 }
124             }
125         }
126         node.setTag(this.haystack[nameCoord..needle]);
127         state |= ParserStates.name;
128 
129         /*
130         * Parse attributes
131         */
132         while( ! (state & ParserStates.ready))
133         {
134             //reset state
135             state &= ~(ParserStates.key | ParserStates.value);
136 
137             //Check if the next non-whitespace char finishes our job here
138             while(needle < this.haystack.length && this.haystack[needle].isWhite){ needle++; }
139             if(this.haystack[needle] == '>') {
140                 state |= ParserStates.ready;
141                 break;
142             }
143             /*
144             * Find the attr-key
145             */
146             keyCoord[0] = needle;
147             for(; needle < this.haystack.length && !this.haystack[needle].isWhite ; ++needle) {
148                 if(this.haystack[needle] == '>') {
149                     state |= ParserStates.ready;
150                     break;
151                 }
152                 if(this.haystack[needle] == '=') {
153                     break;
154                 }
155             }
156             keyCoord[1] = needle;
157 
158             if(state & ParserStates.ready) {
159                 node.addAttribute( Attribute(this.haystack[keyCoord[0]..keyCoord[1]] , "" ) );
160             }
161             else {
162                 /**
163                 * Parse attribute value if exist
164                 */
165 
166                 //skip whitespaces
167                 while(needle < this.haystack.length && this.haystack[needle].isWhite) { needle++; }
168 
169                 if(this.haystack[needle] == '>') {
170                     //there is no value and this is the end
171                     state |= ParserStates.ready;
172                     node.addAttribute( Attribute(this.haystack[keyCoord[0]..keyCoord[1]] , "" ) );
173                     break;
174                 }
175                 else if(this.haystack[needle] == '=')
176                 {
177                     //here comes the value...
178                     needle++;
179                     while(needle < this.haystack.length && this.haystack[needle].isWhite) { needle++; }
180                     if(this.haystack[needle] == '"' || this.haystack[needle] == '\'' ) {
181                         //quoted value
182                         inQuote = this.haystack[needle];
183                         needle++;
184                         valCoord[0] = needle;
185                         while(
186                             needle < this.haystack.length
187                             && (this.haystack[needle] != inQuote
188                             || ( this.haystack[needle] == inQuote && this.haystack[needle-1] == '\\'))
189                         ) {
190                             needle++;
191                         }
192                         valCoord[1] = needle;
193                         needle++;
194                     }
195                     else {
196                         //not quoted value
197                         inQuote = 0x00;
198                         valCoord[0] = needle;
199                         while(
200                             needle < this.haystack.length
201                             && (!this.haystack[needle].isWhite
202                             || ( this.haystack[needle].isWhite && this.haystack[needle-1] == '\\'))
203                         ) {
204                             if(this.haystack[needle] == '>') {
205                                 state |= ParserStates.ready;
206                                 break;
207                             }
208                             needle++;
209                         }
210                         if(state & ParserStates.ready) {
211                             valCoord[1] = needle;
212                         }
213                         else {
214                             valCoord[1] = needle;
215                             needle++;
216                         }
217                     }
218                 }
219                 else {
220                     //there is no value
221                 }
222                 if(keyCoord[1] >= this.haystack.length || valCoord[1] >= this.haystack.length) {
223                     return false;
224                 }
225                 node.addAttribute(Attribute(
226                     this.haystack[keyCoord[0]..keyCoord[1]],
227                     this.haystack[valCoord[0]..valCoord[1]]
228                 ));
229             }
230         }
231         node.setStartTagLength( 1 + needle - node.getStartPosition() );
232         return true;
233     }
234 
235     private bool tryElementTerminator(ref terminator[][string] terminators , ref size_t needle) {
236         import std.ascii : isWhite;
237         import std.uni : toLower;
238         if( this.haystack[needle] != '<' ) {return false;}
239 
240         size_t[2] nameCoord;
241         size_t position = needle;
242 
243         for(needle = needle + 1; needle < this.haystack.length && this.haystack[needle].isWhite ; needle++) {}
244         if(this.haystack[needle] != '/') { return false; }
245         nameCoord[0] = 1 + needle;
246         for(
247             needle = needle + 1 ;
248             needle < this.haystack.length
249             && !this.haystack[needle].isWhite ;
250             needle++
251         ) {
252             if(this.haystack[needle] == '>') {
253                 terminators[this.haystack[nameCoord[0]..needle].toLower] ~= terminator(position , 1 + needle - position);
254                 return true;
255             }
256         }
257         nameCoord[1] = needle;
258         for(; needle < this.haystack.length && this.haystack[needle].isWhite ; needle++) {}
259         if(this.haystack[needle] == '>') {
260             terminators[this.haystack[nameCoord[0]..nameCoord[1]].toLower] ~= terminator(position , 1 + needle - position);
261             return true;
262         }
263         return false;
264     }
265 
266     private bool tryCommentOpener(ref size_t needle) {
267         if(
268             needle + 4 < this.haystack.length
269             && this.haystack[needle..needle+4] == "<!--"
270         ) {
271             needle = needle+4;
272             return true;
273         }
274         return false;
275     }
276     private bool tryCommentTerminator(ref size_t needle) {
277         if(
278             needle + 3 < this.haystack.length
279             && this.haystack[needle..needle+3] == "-->"
280         ) {
281             needle = needle+3;
282             return true;
283         }
284         return false;
285     }
286     private void parse()
287     {
288         import std.ascii : isWhite , isAlphaNum , isAlpha;
289         import std.array : appender;
290 
291         //Reset parsed state
292         this.nodes = [];
293         this.comments = [];
294 
295         if(this.haystack.length == 0) { return; }
296         Node[] nodes;
297         auto nodeAppender = appender(nodes);
298         terminator[][string] terminators;
299         size_t needleProbe;
300         enum ParserStates : ubyte {
301             inElementOpener = 1,
302             inComment = 2
303         }
304         ubyte state;
305         size_t needle;
306 
307         do {
308             if(this.haystack[needle] == '<') {
309 
310                 needleProbe = needle;
311                 if( this.tryElementTerminator(terminators , needleProbe) )
312                 {
313                     needle = needleProbe;
314                     continue;
315                 }
316 
317                 needleProbe = needle;
318                 if(
319                     !(state & ParserStates.inComment)
320                     && this.tryCommentOpener(needleProbe)
321                 ) {
322                     needle = needleProbe;
323                     state |= ParserStates.inComment;
324                     continue;
325                 }
326 
327                 needleProbe = needle;
328                 Node elem = new Node();
329                 if( this.tryElementOpener(elem , needleProbe) )
330                 {
331                     needle = needleProbe;
332                     if(state & ParserStates.inComment) {elem.isComment(true);}
333                     nodeAppender.put(elem);
334                     continue;
335                 }
336 
337             }
338             else if(
339                 this.haystack[needle] == '-'
340                 && state & ParserStates.inComment
341             ) {
342                 needleProbe = needle;
343                 if( this.tryCommentTerminator(needleProbe) ) {
344                     needle = needleProbe;
345                     state &= ~ParserStates.inComment;
346                     continue;
347                 }
348             }
349 
350             needle++;
351         } while(needle < this.haystack.length);
352 
353         this.nodes = nodeAppender.data;
354         this.hierarchize(terminators);
355     }
356 
357     private void hierarchize(terminator[][string] terminators)
358     {
359         import std.algorithm.sorting : sort;
360         import std.uni : toLower;
361 
362         bool[size_t] arrTerminatorBlacklist;
363         foreach_reverse (Node node; this.nodes)
364         {
365             terminator _lastTerm = terminator(node.getStartPosition(), 0);
366             bool isTerminated = false;
367 
368             if(node.getTag().toLower in terminators) {
369                 foreach_reverse (terminator terminatorCandi; terminators[ node.getTag().toLower ])
370                 {
371                     if (terminatorCandi.position in arrTerminatorBlacklist)
372                     {
373                         //skip if already checked and marked as a false candidate
374                         continue;
375                     }
376                     if (node.getStartPosition() > terminatorCandi.position)
377                     {
378                         /*
379                         * The candidates position is lower then the position of the node, for which we are searching the terminator.
380                         * This means, that the last candidate, that we have checked, was the right one - if there was one.
381                         */
382                         arrTerminatorBlacklist[_lastTerm.position] = true;
383                         node.setEndPosition(_lastTerm.position).setEndTagLength(_lastTerm.length);
384                         isTerminated = true;
385                         break;
386                     }
387                     else
388                     {
389                         _lastTerm = terminatorCandi;
390                     }
391                 }
392             }
393             if (!isTerminated)
394             {
395                 node.setEndPosition(_lastTerm.position).setEndTagLength(_lastTerm.length);
396                 arrTerminatorBlacklist[_lastTerm.position] = true;
397             }
398             if (node.getEndTagLength == 0)
399             {
400                 //No Terminator has been found - we assume, that this is a self-terminator
401                 node.setEndTagLength(node.getStartTagLength());
402             }
403         }
404 
405         this.nodes.sort!("a.getStartPosition() < b.getStartPosition()");
406 
407         foreach (size_t i, Node node; this.nodes)
408         {
409             for (size_t back = 1; back <= i; back++)
410             {
411                 if (this.nodes[i - back].getEndPosition() > node.getEndPosition())
412                 {
413                     node.setParent(&this.nodes[i - back]);
414                     node.getParent().addChild(&this.nodes[i]);
415                     break;
416                 }
417             }
418         }
419     }
420 
421     /**
422      returns all found Nodes.
423      Please note, that also Nodes will be returned which was found in comments.
424      use isComment() to check if a Node is in a comment or use libdominator.Filter.filterComments()
425 
426      returns:
427       Nodes[]
428     */
429     public Node[] getNodes()
430     {
431         return this.nodes;
432     }
433 
434     /**
435      gets the Tag Name of the Node
436 
437      Params:
438         node = The Node to get the Tag Name from
439 
440      Returns:
441       The Tag Name as string
442     */
443     public string getStartElement(Node node)
444     {
445         return this.haystack[node.getStartPosition() .. (
446                 node.getStartPosition() + node.getStartTagLength())];
447     }
448 
449     /**
450      gets the part of the loaded Document from the nodes begining to its end
451 
452      Params:
453         node = The Node from which you want to get the Document representation
454     */
455     public string getElement(Node node)
456     {
457         return this.haystack[node.getStartPosition() .. (
458                 node.getEndPosition() + node.getEndTagLength())];
459     }
460 
461     /**
462     * gets the Inner-HTML from the given node
463     *
464     * Params:
465     *    node = The Node from which you want to get the Inner-HTML
466     */
467     public string getInner(Node node)
468     {
469         if ( node.getEndPosition() > (node.getStartPosition() + node.getStartTagLength()) )
470         {
471             return this.haystack[(node.getStartPosition() + node.getStartTagLength())..(node.getEndPosition())];
472         }
473         return "";
474     }
475 
476     /**
477     * Removes tags and returns plain inner content
478     */
479     public string stripTags(Node node) {
480         import std.algorithm.searching : any;
481         import std.array : appender;
482         auto inner = appender!string();
483         Node[] descendants = node.getDescendants();
484         for(size_t i = node.getStartPosition + node.getStartTagLength ; i < node.getEndPosition ; i++) {
485             if( !
486                 any!(desc =>
487                 isBetween(i , desc.getStartPosition()-1 , desc.getStartPosition()+desc.getStartTagLength())
488                 || isBetween(i , desc.getEndPosition()-1 , desc.getEndPosition()+desc.getEndTagLength())
489                 )(descendants)
490             ) {
491                 inner.put(this.haystack[i]);
492             }
493         }
494         return inner.data;
495     }
496 
497     /**
498     * Removes tags and returns plain inner content
499     */
500     public string stripTags() {
501         if( ! this.nodes.length) {
502             return "";
503         }
504         return this.stripTags((*this.nodes.ptr));
505     }
506     ///
507     unittest {
508         const string content = `<div><h2>bla</h2><p>fasel</p></div>`;
509         Dominator dom = new Dominator(content);
510         assert( dom.stripTags() == "blafasel", dom.stripTags());
511     }
512 }
513 
514 unittest {
515     const string content = `<div data-function=">">
516         <ol id="ol-1">
517           <li id="li-1-ol-1">li-1-ol-1 Inner</li>
518           <li id="li-2-ol-1">li-2-ol-1 Inner</li>
519           <li id="li-3-ol-1">li-3-ol-1 Inner</li>
520         </ol>
521       </div>`;
522       Dominator dom = new Dominator(content);
523       assert( dom.getNodes.length == 5);
524       assert( dom.filterDom(DomFilter("ol")).length == 1 );
525       assert( dom.filterDom(DomFilter("ol.li")).length == 3 );
526       assert( dom.filterDom(DomFilter("ol.li{id:li-3-ol-1}")).length == 1 );
527 }
528 
529 /// get descendants of a specific Node and apply further filtering on the result.
530 unittest {
531     const string content = `<div data-function="<some weird> stuff">
532         <span>
533             <span>
534                 <span>bäm!</span>
535             </span>
536             <span>boing!</span>
537         </span>
538         <ol id="ol-1">
539           <li id="li-1-ol-1">li-1-ol-1 Inner</li>
540           <li id="li-2-ol-1">li-2-ol-1 Inner</li>
541           <li id="li-3-ol-1">li-3-ol-1 Inner</li>
542         </ol>
543       </div>`;
544 
545       Dominator dom = new Dominator(content);
546       Node [] descendants = (*dom.filterDom("div").ptr).getDescendants();
547       assert( descendants.filterDom("span").length == 4 );
548       assert( descendants.filterDom("li").length == 3 );
549       assert( descendants.filterDom("ol").length == 1 );
550 }
551 
552 /// basic example
553 unittest {
554     const string html =
555     `<div>
556         <p>Here comes a list!</p>
557         <ul>
558             <li class="wanted">one</li>
559             <!-- <li>two</li> -->
560             <li class="wanted hard">three</li>
561             <li id="item-4">four</li>
562             <li checked>five</li>
563             <li id="item-6">six</li>
564         </ul>
565         <p>another list</p>
566         <ol>
567             <li>eins</li>
568             <li>zwei</li>
569             <li>drei</li>
570         </ol>
571         <p>have a nice day</p>
572     </div>`;
573     Dominator dom = new Dominator(html);
574 
575     foreach(node ; dom.filterDom("ul.li")) {
576         //do something more usefull with the node then:
577         assert(node.getParent.getTag() == "ul");
578     }
579 
580     Node[] nodes = dom.filterDom("ul.li");
581     assert(dom.getInner( nodes[0] ) == "one" );
582     assert(nodes[0].getAttributes() == [ Attribute("class","wanted") ] , to!(string)(nodes[0].getAttributes()) );
583     assert(Attribute("class","wanted").matches(nodes[0]));
584     assert(Attribute("class","wanted").matches(nodes[2]));
585     assert(Attribute("class",["wanted","hard"]).matches(nodes[2]));
586     assert(nodes[1].isComment());
587 
588     assert(dom.filterDom("ul.li").length == 6);
589     assert(dom.filterDom("ul.li").filterComments.length == 5);
590     assert(dom.filterDom("li").length == 9);
591     assert(dom.filterDom("li[1]").length == 1); //the first li in the dom
592     assert(dom.filterDom("*.li[1]").length == 2); //the first li in ul and first li in ol
593     assert(dom.getInner( (*dom.filterDom("*{checked:}").ptr) ) == "five");
594 
595 }
596 
597 unittest {
598     Dominator dom = new Dominator(readText("tests/dummy.html"));
599     auto filter = DomFilter("article");
600     assert( dom.filterDom(filter).filterComments().length == 3 , to!(string)(dom.filterDom(filter).filterComments().length));
601     assert( dom.filterDom(filter).length == 6);
602 
603     assert( dom.filterDom("div.*.ol.li").length == 3 );
604     assert( dom.filterDom("div.ol.li").length == 6 );
605     assert( dom.filterDom("ol.li").length == 6 );
606     assert( dom.filterDom(`ol.li{id:(regex)^li-[\d]+}`).length == 6 );
607     assert( dom.filterDom(`ol{id:ol-1}.li{id:(regex)^li-[\d]+}`).length == 3 );
608     assert( dom.filterDom(`*{checked:}`).length == 1 );
609     assert( dom.filterDom(`onelinenested`).length == 2 );
610     assert( dom.filterDom(`onelinenested{class:level1}`).length == 1 );
611     assert( dom.filterDom(`onelinenested{class:level2}`).length == 1 );
612     assert( dom.filterDom(`onelinenested.onelinenested`).length == 1 );
613 
614     /**
615     * Find nodes with a special href.
616     */
617     filter = DomFilter(`*{href:https://www.google.com/support/contact/user?hl=en}`);
618     assert( dom.filterDom(filter).length);
619     foreach(Node foundNode ; dom.filterDom(filter)) {
620         assert (Attribute("href","https://www.google.com/support/contact/user?hl=en").matches(foundNode) );
621     }
622 
623     /**
624     * Find nodes with a special href - In HTML5 it is ok to have attribute-values without quotation marks.
625     */
626     filter = DomFilter(`*{href://www.google.com/}`);
627     assert( dom.filterDom(filter).length);
628     foreach(Node foundNode ; dom.filterDom(filter)) {
629         assert( Attribute("href","//www.google.com/").matches(foundNode) );
630     }
631 }
632 /*
633 * trouble with uppercase tags
634 */
635 unittest
636 {
637     Dominator dom = new Dominator(readText("tests/dummy.html"));
638     foreach(node ; dom.filterDom("scpdurl"))
639     {
640         assert( dom.getInner(node) == "/timeSCPD.xml" );
641     }
642 }