1 /**
2  * Copyright:
3  * (C) 2016 Martin Brzenska
4  *
5  * License:
6  * Distributed under the terms of the MIT license.
7  * Consult the provided LICENSE.md file for details
8  */
9 module libdominator.Dominator;
10 
11 import std.conv : to;
12 
13 import libdominator.Attribute;
14 import libdominator.Node;
15 
16 version(unittest) {
17     import libdominator.Filter;
18     import std.file;
19 }
20 
21 struct comment
22 {
23     size_t begin;
24     size_t end;
25 }
26 
27 struct terminator
28 {
29     size_t position;
30     size_t length;
31 }
32 
33 pragma(inline,true):
34 bool isBetween(in size_t needle, in size_t from, in size_t to)
35 {
36     return (needle > from && needle < to);
37 }
38 
39 ///Parse, hierarchize, analyse xhtml
40 class Dominator
41 {
42     private string haystack;
43     private size_t needle;
44     private comment[] comments;
45 
46     private Node[] nodes;
47 
48     /**
49      Instantiate empty Dominator
50     */
51     this() {}
52 
53     /**
54      Instantiate object and load the Document
55     */
56     this(string haystack)
57     {
58         this.load(haystack);
59     }
60 
61     /**
62      loads a Document
63      Params:
64        haystack = the Document String
65     */
66     public Dominator load(string haystack)
67     {
68         this.haystack = haystack;
69         this.parse();
70         return this;
71     }
72 
73     private bool tryElementOpener(ref Node node, ref size_t needle) {
74         import std.ascii : isWhite , isAlphaNum , isAlpha;
75         enum ParserStates : ubyte {
76             name = 1,
77             key = 2,
78             value = 4,
79             err = 8,
80             ready = 16
81         }
82         ubyte state = 0;
83 
84         char inQuote = 0x00;
85         size_t nameCoord;
86         size_t[2] keyCoord, valCoord;
87 
88         if(this.haystack[needle] != '<') {
89             return false;
90         }
91         node.setStartPosition(needle);
92         needle++;
93 
94         /*
95         * parse the elements name
96         */
97         //first, skip whitespaces
98         while(needle < this.haystack.length && this.haystack[needle].isWhite) { needle++; }
99 
100         //The name begins with a underscore or a alphabetical character.
101         if(
102             ! this.haystack[needle].isAlpha
103             && ! this.haystack[needle] == '_'
104         ) {
105             return false;
106         }
107         nameCoord = needle;
108 
109         //The name can contain letters, digits, hyphens, underscores, and periods
110         for(; needle < this.haystack.length && !this.haystack[needle].isWhite ; ++needle) {
111             if(
112                 ! this.haystack[needle].isAlphaNum
113                 &&  this.haystack[needle] != '-'
114                 &&  this.haystack[needle] != '_'
115                 &&  this.haystack[needle] != '.'
116                 &&  this.haystack[needle] != ':'
117             ) {
118                 if(this.haystack[needle] == '>') {
119                     state |= ParserStates.ready;
120                     break;
121                 } else {
122                     return false;
123                 }
124             }
125         }
126         node.setTag(this.haystack[nameCoord..needle]);
127         state |= ParserStates.name;
128 
129         /*
130         * Parse attributes
131         */
132         while( ! (state & ParserStates.ready))
133         {
134             //reset state
135             state &= ~(ParserStates.key | ParserStates.value);
136 
137             //Check if the next non-whitespace char finishes our job here
138             while(needle < this.haystack.length && this.haystack[needle].isWhite){ needle++; }
139             if(this.haystack[needle] == '>') {
140                 state |= ParserStates.ready;
141                 break;
142             }
143             /*
144             * Find the attr-key
145             */
146             keyCoord[0] = needle;
147             for(; needle < this.haystack.length && !this.haystack[needle].isWhite ; ++needle) {
148                 if(this.haystack[needle] == '>') {
149                     state |= ParserStates.ready;
150                     break;
151                 }
152                 if(this.haystack[needle] == '=') {
153                     break;
154                 }
155             }
156             keyCoord[1] = needle;
157 
158             if(state & ParserStates.ready) {
159                 node.addAttribute( Attribute(this.haystack[keyCoord[0]..keyCoord[1]] , "" ) );
160             }
161             else {
162                 /**
163                 * Parse attribute value if exist
164                 */
165 
166                 //skip whitespaces
167                 while(needle < this.haystack.length && this.haystack[needle].isWhite) { needle++; }
168 
169                 if(this.haystack[needle] == '>') {
170                     //there is no value and this is the end
171                     state |= ParserStates.ready;
172                     node.addAttribute( Attribute(this.haystack[keyCoord[0]..keyCoord[1]] , "" ) );
173                     break;
174                 }
175                 else if(this.haystack[needle] == '=')
176                 {
177                     //here comes the value...
178                     needle++;
179                     while(needle < this.haystack.length && this.haystack[needle].isWhite) { needle++; }
180                     if(this.haystack[needle] == '"' || this.haystack[needle] == '\'' ) {
181                         //quoted value
182                         inQuote = this.haystack[needle];
183                         needle++;
184                         valCoord[0] = needle;
185                         while(
186                             needle < this.haystack.length
187                             && (this.haystack[needle] != inQuote
188                             || ( this.haystack[needle] == inQuote && this.haystack[needle-1] == '\\'))
189                         ) {
190                             needle++;
191                         }
192                         valCoord[1] = needle;
193                         needle++;
194                     }
195                     else {
196                         //not quoted value
197                         inQuote = 0x00;
198                         valCoord[0] = needle;
199                         while(
200                             needle < this.haystack.length
201                             && (!this.haystack[needle].isWhite
202                             || ( this.haystack[needle].isWhite && this.haystack[needle-1] == '\\'))
203                         ) {
204                             if(this.haystack[needle] == '>') {
205                                 state |= ParserStates.ready;
206                                 break;
207                             }
208                             needle++;
209                         }
210                         if(state & ParserStates.ready) {
211                             valCoord[1] = needle;
212                         }
213                         else {
214                             valCoord[1] = needle;
215                             needle++;
216                         }
217                     }
218                 }
219                 else {
220                     //there is no value
221                 }
222                 if(keyCoord[1] >= this.haystack.length || valCoord[1] >= this.haystack.length) {
223                     return false;
224                 }
225                 node.addAttribute(Attribute(
226                     this.haystack[keyCoord[0]..keyCoord[1]],
227                     this.haystack[valCoord[0]..valCoord[1]]
228                 ));
229             }
230         }
231         node.setStartTagLength( 1 + needle - node.getStartPosition() );
232         return true;
233     }
234 
235     private bool tryElementTerminator(ref terminator[][string] terminators , ref size_t needle) {
236         import std.ascii : isWhite;
237 
238         if( this.haystack[needle] != '<' ) {return false;}
239 
240         size_t[2] nameCoord;
241         size_t position = needle;
242 
243         for(needle = needle + 1; needle < this.haystack.length && this.haystack[needle].isWhite ; needle++) {}
244         if(this.haystack[needle] != '/') { return false; }
245         nameCoord[0] = 1 + needle;
246         for(
247             needle = needle + 1 ;
248             needle < this.haystack.length
249             && !this.haystack[needle].isWhite ;
250             needle++
251         ) {
252             if(this.haystack[needle] == '>') {
253                 terminators[this.haystack[nameCoord[0]..needle]] ~= terminator(position , 1 + needle - position);
254                 return true;
255             }
256         }
257         nameCoord[1] = needle;
258         for(; needle < this.haystack.length && this.haystack[needle].isWhite ; needle++) {}
259         if(this.haystack[needle] == '>') {
260             terminators[this.haystack[nameCoord[0]..nameCoord[1]]] ~= terminator(position , 1 + needle - position);
261             return true;
262         }
263         return false;
264     }
265     private bool tryCommentOpener(ref size_t needle) {
266         if(
267             needle + 4 < this.haystack.length
268             && this.haystack[needle..needle+4] == "<!--"
269         ) {
270             needle = needle+4;
271             return true;
272         }
273         return false;
274     }
275     private bool tryCommentTerminator(ref size_t needle) {
276         if(
277             needle + 3 < this.haystack.length
278             && this.haystack[needle..needle+3] == "-->"
279         ) {
280             needle = needle+3;
281             return true;
282         }
283         return false;
284     }
285     private void parse()
286     {
287         import std.ascii : isWhite , isAlphaNum , isAlpha;
288         import std.array : appender;
289 
290         //Reset parsed state
291         this.nodes = [];
292         this.comments = [];
293 
294         if(this.haystack.length == 0) { return; }
295         Node[] nodes;
296         auto nodeAppender = appender(nodes);
297         terminator[][string] terminators;
298         size_t needleProbe;
299         enum ParserStates : ubyte {
300             inElementOpener = 1,
301             inComment = 2
302         }
303         ubyte state;
304         size_t needle;
305 
306         do {
307             if(this.haystack[needle] == '<') {
308 
309                 needleProbe = needle;
310                 if( this.tryElementTerminator(terminators , needleProbe) ) {
311                     needle = needleProbe;
312                     continue;
313                 }
314 
315                 needleProbe = needle;
316                 if(
317                     !(state & ParserStates.inComment)
318                     && this.tryCommentOpener(needleProbe)
319                 ) {
320                     needle = needleProbe;
321                     state |= ParserStates.inComment;
322                     continue;
323                 }
324 
325                 needleProbe = needle;
326                 Node elem = new Node();
327                 if( this.tryElementOpener(elem , needleProbe) ) {
328 
329 
330                     needle = needleProbe;
331                     if(state & ParserStates.inComment) {elem.isComment(true);}
332                     nodeAppender.put(elem);
333                     continue;
334                 }
335 
336             }
337             else if(
338                 this.haystack[needle] == '-'
339                 && state & ParserStates.inComment
340             ) {
341                 needleProbe = needle;
342                 if( this.tryCommentTerminator(needleProbe) ) {
343                     needle = needleProbe;
344                     state &= ~ParserStates.inComment;
345                     continue;
346                 }
347             }
348 
349             needle++;
350         } while(needle < this.haystack.length);
351 
352         this.nodes = nodeAppender.data;
353         this.hierarchize(terminators);
354     }
355 
356     private void hierarchize(terminator[][string] terminators)
357     {
358         import std.algorithm.sorting : sort;
359 
360         bool[size_t] arrTerminatorBlacklist;
361         foreach_reverse (Node node; this.nodes)
362         {
363             terminator _lastTerm = terminator(node.getStartPosition(), 0);
364             bool isTerminated = false;
365 
366             if(node.getTag() in terminators) {
367                 foreach_reverse (terminator terminatorCandi; terminators[node.getTag()])
368                 {
369                     if (terminatorCandi.position in arrTerminatorBlacklist)
370                     {
371                         //skip if already checked and marked as a false candidate
372                         continue;
373                     }
374                     if (node.getStartPosition() > terminatorCandi.position)
375                     {
376                         /*
377                         * The candidates position is lower then the position of the node, for which we are searching the terminator.
378                         * This means, that the last candidate, that we have checked, was the right one - if there was one.
379                         */
380                         arrTerminatorBlacklist[_lastTerm.position] = true;
381                         node.setEndPosition(_lastTerm.position).setEndTagLength(_lastTerm.length);
382                         isTerminated = true;
383                         break;
384                     }
385                     else
386                     {
387                         _lastTerm = terminatorCandi;
388                     }
389                 }
390             }
391             if (!isTerminated)
392             {
393                 node.setEndPosition(_lastTerm.position).setEndTagLength(_lastTerm.length);
394                 arrTerminatorBlacklist[_lastTerm.position] = true;
395             }
396             if (node.getEndTagLength == 0)
397             {
398                 //No Terminator has been found - we assume, that this is a self-terminator
399                 node.setEndTagLength(node.getStartTagLength());
400             }
401         }
402 
403         this.nodes.sort!("a.getStartPosition() < b.getStartPosition()");
404 
405         foreach (size_t i, Node node; this.nodes)
406         {
407             for (size_t back = 1; back <= i; back++)
408             {
409                 if (this.nodes[i - back].getEndPosition() > node.getEndPosition())
410                 {
411                     node.setParent(&this.nodes[i - back]);
412                     node.getParent().addChild(&this.nodes[i]);
413                     break;
414                 }
415             }
416         }
417     }
418 
419     /**
420      returns all found Nodes.
421      Please note, that also Nodes will be returned which was found in comments.
422      use isComment() to check if a Node is in a comment or use libdominator.Filter.filterComments()
423 
424      returns:
425       Nodes[]
426     */
427     public Node[] getNodes()
428     {
429         return this.nodes;
430     }
431 
432     /**
433      gets the Tag Name of the Node
434 
435      Params:
436         node = The Node to get the Tag Name from
437 
438      Returns:
439       The Tag Name as string
440     */
441     public string getStartElement(Node node)
442     {
443         return this.haystack[node.getStartPosition() .. (
444                 node.getStartPosition() + node.getStartTagLength())];
445     }
446 
447     /**
448      gets the part of the loaded Document from the nodes begining to its end
449 
450      Params:
451         node = The Node from which you want to get the Document representation
452     */
453     public string getElelment(Node node)
454     {
455         return this.haystack[node.getStartPosition() .. (
456                 node.getEndPosition() + node.getEndTagLength())];
457     }
458 
459     /**
460     * gets the Inner-HTML from the given node
461     *
462     * Params:
463     *    node = The Node from which you want to get the Inner-HTML
464     */
465     public string getInner(Node node)
466     {
467         return (node.getEndPosition() > (node.getStartPosition() + node.getStartTagLength())) ? this.haystack[(
468                 node.getStartPosition() + node.getStartTagLength()) .. (node.getEndPosition())] : "";
469     }
470 
471     /**
472     * Removes tags and returns plain inner content
473     */
474     public string stripTags(Node node) {
475         import std.algorithm.searching : any;
476         import std.array : appender;
477         auto inner = appender!string();
478         Node[] descendants = node.getDescendants();
479         for(size_t i = node.getStartPosition + node.getStartTagLength ; i < node.getEndPosition ; i++) {
480             if( !
481                 any!(desc =>
482                 isBetween(i , desc.getStartPosition()-1 , desc.getStartPosition()+desc.getStartTagLength())
483                 || isBetween(i , desc.getEndPosition()-1 , desc.getEndPosition()+desc.getEndTagLength())
484                 )(descendants)
485             ) {
486                 inner.put(this.haystack[i]);
487             }
488         }
489         return inner.data;
490     }
491 
492     /**
493     * Removes tags and returns plain inner content
494     */
495     public string stripTags() {
496         if( ! this.nodes.length) {
497             return "";
498         }
499         return this.stripTags((*this.nodes.ptr));
500     }
501     ///
502     unittest {
503         const string content = `<div><h2>bla</h2><p>fasel</p></div>`;
504         Dominator dom = new Dominator(content);
505         assert( dom.stripTags() == "blafasel", dom.stripTags());
506     }
507 }
508 
509 unittest {
510     const string content = `<div data-function=">">
511         <ol id="ol-1">
512           <li id="li-1-ol-1">li-1-ol-1 Inner</li>
513           <li id="li-2-ol-1">li-2-ol-1 Inner</li>
514           <li id="li-3-ol-1">li-3-ol-1 Inner</li>
515         </ol>
516       </div>`;
517       Dominator dom = new Dominator(content);
518       assert( dom.getNodes.length == 5);
519       assert( dom.filterDom(DomFilter("ol")).length == 1 );
520       assert( dom.filterDom(DomFilter("ol.li")).length == 3 );
521       assert( dom.filterDom(DomFilter("ol.li{id:li-3-ol-1}")).length == 1 );
522 }
523 
524 /// get descendants of a specific Node and apply further filtering on the result.
525 unittest {
526     const string content = `<div data-function="<some weird> stuff">
527         <span>
528             <span>
529                 <span>bäm!</span>
530             </span>
531             <span>boing!</span>
532         </span>
533         <ol id="ol-1">
534           <li id="li-1-ol-1">li-1-ol-1 Inner</li>
535           <li id="li-2-ol-1">li-2-ol-1 Inner</li>
536           <li id="li-3-ol-1">li-3-ol-1 Inner</li>
537         </ol>
538       </div>`;
539 
540       Dominator dom = new Dominator(content);
541       Node [] descendants = (*dom.filterDom("div").ptr).getDescendants();
542       assert( descendants.filterDom("span").length == 4 );
543       assert( descendants.filterDom("li").length == 3 );
544       assert( descendants.filterDom("ol").length == 1 );
545 }
546 
547 /// basic example
548 unittest {
549     const string html =
550     `<div>
551         <p>Here comes a list!</p>
552         <ul>
553             <li class="wanted">one</li>
554             <!-- <li>two</li> -->
555             <li class="wanted hard">three</li>
556             <li id="item-4">four</li>
557             <li checked>five</li>
558             <li id="item-6">six</li>
559         </ul>
560         <p>another list</p>
561         <ol>
562             <li>eins</li>
563             <li>zwei</li>
564             <li>drei</li>
565         </ol>
566         <p>have a nice day</p>
567     </div>`;
568     Dominator dom = new Dominator(html);
569 
570     foreach(node ; dom.filterDom("ul.li")) {
571         //do something more usefull with the node then:
572         assert(node.getParent.getTag() == "ul");
573     }
574 
575     Node[] nodes = dom.filterDom("ul.li");
576     assert(dom.getInner( nodes[0] ) == "one" );
577     assert(nodes[0].getAttributes() == [ Attribute("class","wanted") ] , to!(string)(nodes[0].getAttributes()) );
578     assert(Attribute("class","wanted").matches(nodes[0]));
579     assert(Attribute("class","wanted").matches(nodes[2]));
580     assert(Attribute("class",["wanted","hard"]).matches(nodes[2]));
581     assert(nodes[1].isComment());
582 
583     assert(dom.filterDom("ul.li").length == 6);
584     assert(dom.filterDom("ul.li").filterComments.length == 5);
585     assert(dom.filterDom("li").length == 9);
586     assert(dom.filterDom("li[1]").length == 1); //the first li in the dom
587     assert(dom.filterDom("*.li[1]").length == 2); //the first li in ul and first li in ol
588     assert(dom.getInner( (*dom.filterDom("*{checked:}").ptr) ) == "five");
589 
590 }
591 
592 unittest {
593     Dominator dom = new Dominator(readText("dummy.html"));
594     auto filter = DomFilter("article");
595     assert( dom.filterDom(filter).filterComments().length == 3 , to!(string)(dom.filterDom(filter).filterComments().length));
596     assert( dom.filterDom(filter).length == 6);
597 
598     assert( dom.filterDom("div.*.ol.li").length == 3 );
599     assert( dom.filterDom("div.ol.li").length == 6 );
600     assert( dom.filterDom("ol.li").length == 6 );
601     assert( dom.filterDom(`ol.li{id:(regex)^li-[\d]+}`).length == 6 );
602     assert( dom.filterDom(`ol{id:ol-1}.li{id:(regex)^li-[\d]+}`).length == 3 );
603     assert( dom.filterDom(`*{checked:}`).length == 1 );
604     assert( dom.filterDom(`onelinenested`).length == 2 );
605     assert( dom.filterDom(`onelinenested{class:level1}`).length == 1 );
606     assert( dom.filterDom(`onelinenested{class:level2}`).length == 1 );
607     assert( dom.filterDom(`onelinenested.onelinenested`).length == 1 );
608 
609     /**
610     * Find nodes with a special href.
611     */
612     filter = DomFilter(`*{href:https://www.google.com/support/contact/user?hl=en}`);
613     assert( dom.filterDom(filter).length);
614     foreach(Node foundNode ; dom.filterDom(filter)) {
615         assert (Attribute("href","https://www.google.com/support/contact/user?hl=en").matches(foundNode) );
616     }
617 
618     /**
619     * Find nodes with a special href - In HTML5 it is ok to have attribute-values without quotation marks.
620     */
621     filter = DomFilter(`*{href://www.google.com/}`);
622     assert( dom.filterDom(filter).length);
623     foreach(Node foundNode ; dom.filterDom(filter)) {
624         assert( Attribute("href","//www.google.com/").matches(foundNode) );
625     }
626 }