libdominator.Dominator source code

1 /**
2  * Copyright:
3  * (C) 2016 Martin Brzenska
4  *
5  * License:
6  * Distributed under the terms of the MIT license.
7  * Consult the provided LICENSE.md file for details
8  */
9 module libdominator.Dominator;
10 
11 import std.regex : Regex , regex, matchAll , ctRegex;
12 import std.conv : to;
13 
14 import libdominator.Attribute;
15 import libdominator.Node;
16 
17 static Regex!char rNodeHead;
18 static Regex!char rAttrib;
19 static Regex!char rComment;
20 static this() {
21     rNodeHead = ctRegex!(`<[\s]*([\w\d-]+)`, "i");
22     rAttrib = ctRegex!(`([\w\d-_]+)(?:=((")(?:\\"|[^"])*"|(')(?:\\'|[^'])*'|(?:\\[\s]|[^\s])*([\s])*))?`, "i");
23     rComment = ctRegex!(`<!--.*?-->`, "s");
24 }
25 
26 struct comment
27 {
28     uint begin;
29     uint end;
30 }
31 
32 struct terminator
33 {
34     uint position;
35     ushort length;
36 }
37 
38 bool isBetween(size_t needle, size_t from, size_t to)
39 {
40     return (needle > from && needle < to);
41 }
42 
43 ///Parse, hierarchize, analyse xhtml
44 class Dominator
45 {
46     //private auto rComment = regex(`<!--.*?-->`, "s");
47     private string haystack;
48     private comment[] comments;
49 
50     private Node[] nodes;
51 
52     /**
53      Instantiate empty Dominator
54     */
55     this() {}
56 
57     /**
58      Instantiate object and load the Document
59     */
60     this(string haystack)
61     {
62         this.load(haystack);
63     }
64 
65     /**
66      loads a Document
67      Params:
68        haystack = the Document String
69     */
70     public Dominator load(string haystack)
71     {
72         this.haystack = haystack;
73         this.parse();
74         return this;
75     }
76 
77     private size_t findNodesLenth(size_t nodeHeadPos) {
78         size_t len;
79         bool hasQuoteMarkFailure;
80         bool openSingleQuoteMark;
81         bool openDoubleQuoteMark;
82         char inQuote = 0x00;
83         while(this.haystack.length > nodeHeadPos + len)
84         {
85             if(
86                 this.haystack[nodeHeadPos + len] == '>'
87                 && inQuote == 0x00
88             ) {
89                 return 1+len;
90             }
91             len++;
92             if(
93                 this.haystack[nodeHeadPos + len] == inQuote
94                 && this.haystack[nodeHeadPos + len -1] != '\\'
95             ) {
96                 inQuote = 0x00;
97             }
98             else if(
99                 inQuote == 0x00
100                 && (this.haystack[nodeHeadPos + len] == '\''
101                 || this.haystack[nodeHeadPos + len] == '"')
102             ) {
103                 inQuote = this.haystack[nodeHeadPos + len];
104             }
105         }
106         return 1+len; //we should never get here
107     }
108 
109     private void parse()
110     {
111         import std.string : chomp, chompPrefix;
112         foreach (mComment; matchAll(this.haystack, rComment))
113         {
114             this.comments ~= comment(to!uint(mComment.pre().length),
115                     to!uint(mComment.pre().length) + to!uint(mComment.front.length));
116         }
117         terminator[][string] terminators;
118         foreach (mNode; matchAll(this.haystack, rNodeHead))
119         {
120             auto node = new Node();
121             /*
122             * We have a good candidate for a opening node.
123             * We know the beginning position (and how long the "head" is)
124             * Next, we need to find out where the opener ends.
125             */
126             mNode.popFront();
127             node.setTag(mNode.front)
128                 .setStartPosition(mNode.pre().length);
129             node.setStartTagLength(
130                 node.getTag().length + this.findNodesLenth( node.getTag().length + node.getStartPosition() )
131             );
132 
133             //check if this node is inside of a comment - if yes, mark it.
134             foreach (comment cmnt; this.comments)
135             {
136                 if (isBetween(node.getStartPosition(), cmnt.begin, cmnt.end))
137                 {
138                     node.isComment(true);
139                 }
140             }
141 
142             //parse the attributes, if there are one or more
143             if(
144                 node.getStartPosition + mNode.hit().length + 1
145                 <
146                 node.getStartPosition + node.getStartTagLength() -1
147             )
148             {
149                 foreach (
150                     mAttrib;
151                     matchAll(
152                         this.haystack[
153                             node.getStartPosition + mNode.hit().length + 1
154                             ..
155                             node.getStartPosition + node.getStartTagLength() -1
156                             ],
157                         rAttrib
158                     )
159                 )
160                 {
161                     node.addAttribute(
162                         Attribute(
163                             mAttrib[1],
164                             chompPrefix(
165                                 chomp(
166                                     mAttrib[2],
167                                     mAttrib[3] ~ mAttrib[4] ~ mAttrib[5]
168                                 ),
169                                 mAttrib[3] ~ mAttrib[4] ~ mAttrib[5]
170                             )
171                         )
172                     );
173                 }
174             }
175             this.addNode(node);
176 
177             //search Terminator Candidates
178             if (node.getTag() !in terminators)
179             {
180                 foreach (mTerminatorCandi; matchAll(this.haystack[node.getStartPosition() .. $],
181                         regex(`<[\s]*/` ~ node.getTag() ~ `[\s]*>`,"i")))
182                 {
183                     terminators[node.getTag()] ~= terminator(node.getStartPosition() + to!uint(mTerminatorCandi.pre()
184                             .length), to!ushort(mTerminatorCandi.front.length));
185                 }
186                 if (node.getTag() !in terminators)
187                 {
188                     terminators[node.getTag()] ~= [terminator(node.getStartPosition(), 0)];
189                 }
190             }
191         }
192         this.hierarchize(terminators);
193     }
194 
195     private void addNode(Node node)
196     {
197         this.nodes ~= node;
198     }
199 
200     private void hierarchize(terminator[][string] terminators)
201     {
202         import std.algorithm : sort;
203 
204         bool[size_t] arrTerminatorBlacklist;
205         foreach_reverse (Node node; this.nodes)
206         {
207             terminator _lastTerm = terminator(node.getStartPosition(), 0);
208             bool isTerminated = false;
209             foreach_reverse (terminator terminatorCandi; terminators[node.getTag()])
210             {
211                 if (terminatorCandi.position in arrTerminatorBlacklist)
212                 {
213                     continue;
214                 }
215                 if (node.getStartPosition() > terminatorCandi.position)
216                 {
217                     arrTerminatorBlacklist[_lastTerm.position] = true;
218                     node.setEndPosition(_lastTerm.position).setEndTagLength(0);
219                     isTerminated = true;
220                     break;
221                 }
222                 else
223                 {
224                     _lastTerm = terminatorCandi;
225                 }
226             }
227             if (!isTerminated)
228             {
229                 node.setEndPosition(_lastTerm.position).setEndTagLength(_lastTerm.length);
230                 arrTerminatorBlacklist[_lastTerm.position] = true;
231             }
232             if (node.getEndTagLength == 0)
233             {
234                 //No Terminator has been found - we assume, that this is a self-terminator
235                 node.setEndTagLength(node.getStartTagLength());
236             }
237         }
238 
239         Node[] sortedNodes;
240         foreach (node; sort!"a.getStartPosition() < b.getStartPosition()"(this.nodes))
241         {
242             sortedNodes ~= node;
243         }
244         this.nodes = sortedNodes.dup;
245         delete sortedNodes;
246 
247         foreach (size_t i, Node node; this.nodes)
248         {
249             for (size_t back = 1; back <= i; back++)
250             {
251                 if (this.nodes[i - back].getEndPosition() > node.getEndPosition())
252                 {
253                     node.setParent(&this.nodes[i - back]);
254                     node.getParent().addChild(&this.nodes[i]);
255                     break;
256                 }
257             }
258         }
259     }
260 
261     /**
262      returns all found Nodes.
263      Please note, that also Nodes will be returned which was found in comments.
264      use isComment() to check if a Node is in a comment or use libdominator.Filter.filterComments()
265 
266      returns:
267       Nodes[]
268     */
269     public Node[] getNodes()
270     {
271         return this.nodes;
272     }
273 
274     /**
275      gets the Tag Name of the Node
276 
277      Params:
278         node = The Node to get the Tag Name from
279 
280      Returns:
281       The Tag Name as string
282     */
283     public string getStartElement(Node node)
284     {
285         return this.haystack[node.getStartPosition() .. (
286                 node.getStartPosition() + node.getStartTagLength())];
287     }
288 
289     /**
290      gets the part of the loaded Document from the nodes begining to its end
291 
292      Params:
293         node = The Node from which you want to get the Document representation
294     */
295     public string getElelment(Node node)
296     {
297         return this.haystack[node.getStartPosition() .. (
298                 node.getEndPosition() + node.getEndTagLength())];
299     }
300 
301     /**
302      gets the Inner-HTML from the given node
303 
304      Params:
305         node = The Node from which you want to get the Inner-HTML
306     */
307     public string getInner(Node node)
308     {
309         return (node.getEndPosition() > (node.getStartPosition() + node.getStartTagLength())) ? this.haystack[(
310                 node.getStartPosition() + node.getStartTagLength()) .. (node.getEndPosition())] : "";
311     }
312 }
313 
314 version(unittest) {
315     import libdominator.Filter;
316     import std.file;
317 }
318 
319 unittest {
320     const string content = `<div data-function=">">
321         <ol id="ol-1">
322           <li id="li-1-ol-1">li-1-ol-1 Inner</li>
323           <li id="li-2-ol-1">li-2-ol-1 Inner</li>
324           <li id="li-3-ol-1">li-3-ol-1 Inner</li>
325         </ol>
326       </div>`;
327       Dominator dom = new Dominator(content);
328       assert( dom.getNodes.length == 5);
329       assert( dom.filterDom(DomFilter("ol")).length == 1 );
330       assert( dom.filterDom(DomFilter("ol.li")).length == 3 );
331       assert( dom.filterDom(DomFilter("ol.li{id:li-3-ol-1}")).length == 1 );
332 }
333 
334 unittest {
335     import std.conv : to;
336     Dominator dom = new Dominator(readText("dummy.html"));
337     auto filter = DomFilter("article");
338     assert( dom.filterDom(filter).filterComments().length == 3 , to!(string)(dom.filterDom(filter).filterComments().length) );
339     assert( dom.filterDom(filter).length == 6 , to!(string)(dom.filterDom(filter).length));
340 
341     filter = DomFilter("div.*.ol.li");
342     assert( dom.filterDom(filter).length == 3 );
343 
344     filter = DomFilter("div.ol.li");
345     assert( dom.filterDom(filter).length == 6 );
346 
347     filter = DomFilter("ol.li");
348     assert( dom.filterDom(filter).length == 6 );
349 
350     filter = DomFilter(`ol.li{id:(regex)^li-[\d]+}`);
351     assert( dom.filterDom(filter).length == 6 );
352 
353     filter = DomFilter(`ol{id:ol-1}.li{id:(regex)^li-[\d]+}`);
354     assert( dom.filterDom(filter).length == 3 );
355 
356     filter = DomFilter(`*{checked:}`);
357     assert( dom.filterDom(filter).length == 1 );
358 
359     filter = DomFilter(`onelinenested`);
360     assert( dom.filterDom(filter).length == 2 );
361 
362     filter = DomFilter(`onelinenested{class:level1}`);
363     assert( dom.filterDom(filter).length == 1 );
364 
365     filter = DomFilter(`onelinenested{class:level2}`);
366     assert( dom.filterDom(filter).length == 1 );
367 
368     filter = DomFilter(`onelinenested.onelinenested`);
369     assert( dom.filterDom(filter).length == 1 );
370 
371     /**
372     * Find the nodes with a special href.
373     */
374     filter = DomFilter(`*{href:https://www.google.com/support/contact/user?hl=en}`);
375     assert( dom.filterDom(filter).length);
376     foreach(Node foundNode ; dom.filterDom(filter)) {
377         assert (Attribute("href","https://www.google.com/support/contact/user?hl=en").matches(foundNode) );
378     }
379 
380     /**
381     * Find the nodes with a special href - In HTML5 it is ok to have attribute-values without quotation marks.
382     */
383     filter = DomFilter(`*{href://www.google.com/}`);
384     assert( dom.filterDom(filter).length);
385     foreach(Node foundNode ; dom.filterDom(filter)) {
386         assert( Attribute("href","//www.google.com/").matches(foundNode) );
387     }
388 }