Skip to content

Commit 3022102

Browse files
committed
Port recent changes to forked HTMLReader repository. Work hour: 2.0 hour
1 parent bdcf5d5 commit 3022102

File tree

1 file changed

+228
-12
lines changed

1 file changed

+228
-12
lines changed

Code/HTMLSelector.m

Lines changed: 228 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
typedef BOOL (^HTMLSelectorPredicate)(HTMLElement *node);
1111
typedef HTMLSelectorPredicate HTMLSelectorPredicateGen;
1212

13+
static HTMLSelectorPredicate ScanSelectorPredicate(NSScanner *scanner, NSError **error);
1314
static HTMLSelectorPredicate SelectorFunctionForString(NSString *selectorString, NSError **error);
1415

1516
static NSError * ParseError(NSString *reason, NSString *string, NSUInteger position)
@@ -67,6 +68,18 @@ HTMLSelectorPredicateGen bothCombinatorPredicate(HTMLSelectorPredicate a, HTMLSe
6768
};
6869
}
6970

71+
/// Builds the predicate for the "," combinator: an element matches when it
/// satisfies either of the two operand predicates.
///
/// @param a Predicate for the selector on the left of the comma.
/// @param b Predicate for the selector on the right of the comma.
/// @return A block evaluating `a(node) || b(node)`, or nil when either operand
///         is nil.
HTMLSelectorPredicateGen eitherCombinatorPredicate(HTMLSelectorPredicate a, HTMLSelectorPredicate b)
{
    // A nil operand means there was probably an error somewhere else in
    // parsing. Both operands must be checked: invoking a nil block crashes,
    // and the returned block calls `b` whenever `a` does not match.
    if (!a || !b) return nil;

    return ^BOOL(HTMLElement *node) {
        return a(node) || b(node);
    };
}
82+
7083
HTMLSelectorPredicateGen andCombinatorPredicate(NSArray *predicates)
7184
{
7285
return ^(HTMLElement *node) {
@@ -316,6 +329,73 @@ HTMLSelectorPredicateGen isLastChildOfTypePredicate(HTMLSelectorPredicate typePr
316329
return isNthChildOfTypePredicate(HTMLNthExpressionMake(0, 1), typePredicate, YES);
317330
}
318331

332+
/// Builds a predicate that matches elements whose text content contains `str`.
///
/// @param str The substring to look for in each element's `textContent`.
/// @return A block returning the result of `-containsString:` on the element's
///         text content.
HTMLSelectorPredicateGen isContainsStringPredicate(NSString* str)
{
    HTMLSelectorPredicate containsText = ^BOOL(HTMLElement *element) {
        NSString *text = [element textContent];
        return [text containsString:str];
    };
    return containsText;
}
338+
339+
/// Builds a predicate that matches elements having at least one direct child
/// element accepted by `insideAfter`.
///
/// This only tests for the presence of such a child; it does not select the
/// nodes positioned after (or before) that child.
///
/// @param insideAfter Predicate applied to each entry of `childElementNodes`.
/// @return The matching block, or nil when `insideAfter` is nil.
HTMLSelectorPredicateGen isAfterTagPredicate(HTMLSelectorPredicate insideAfter)
{
    // The inner selector failed to parse; returning nil avoids building a
    // block that would crash by invoking a nil predicate.
    if (!insideAfter) return nil;

    return ^BOOL(HTMLElement *node) {
        for (HTMLElement *child in node.childElementNodes) {
            if (insideAfter(child)) {
                return YES;
            }
        }
        return NO;
    };
}
352+
353+
/// Builds a predicate that matches elements having a child element accepted by
/// `left` followed, later in `childElementNodes`, by a child accepted by `right`.
///
/// A child that satisfies `left` is never tested against `right`, so a single
/// child cannot serve as both delimiters.
///
/// @param left  Predicate identifying the opening delimiter child.
/// @param right Predicate identifying the closing delimiter child.
/// @return The matching block, or nil when either predicate is nil.
HTMLSelectorPredicateGen isBetweenTagPredicate(HTMLSelectorPredicate left, HTMLSelectorPredicate right)
{
    // One of the inner selectors failed to parse; invoking a nil block would crash.
    if (!left || !right) return nil;

    return ^BOOL(HTMLElement *node) {
        // Must start as NO: a `right` match only counts once a `left` match
        // has been seen earlier among the children.
        BOOL seenLeft = NO;
        for (HTMLElement *child in node.childElementNodes) {
            if (left(child)) {
                seenLeft = YES;
                continue;
            }
            if (seenLeft && right(child)) {
                return YES;
            }
        }
        return NO;
    };
}
370+
371+
/// Builds a predicate that matches elements having any descendant element
/// accepted by `childPredicate`.
///
/// Descendants are visited breadth-first, starting from the element's
/// `childElementNodes`; the subtree below a matching descendant is not explored
/// because the search stops at the first match.
///
/// @param childPredicate Predicate applied to each visited descendant.
/// @return The matching block, or nil when `childPredicate` is nil.
HTMLSelectorPredicateGen hasPredicate(HTMLSelectorPredicate childPredicate)
{
    if (!childPredicate) return nil;

    return ^BOOL(HTMLElement *node) {
        // `pending` only grows; `cursor` marks the next element to visit, which
        // yields the same first-in, first-out order as dequeuing from the front.
        NSMutableArray *pending = [node.childElementNodes mutableCopy];

        for (NSUInteger cursor = 0; cursor < pending.count; cursor++) {
            HTMLElement *candidate = [pending objectAtIndex:cursor];
            if (childPredicate(candidate)) {
                return YES;
            }
            for (HTMLElement *grandchild in candidate.childElementNodes) {
                [pending addObject:grandchild];
            }
        }

        return NO;
    };
}
398+
319399
#pragma mark Attribute Helpers
320400

321401
HTMLSelectorPredicateGen isKindOfClassPredicate(NSString *classname)
@@ -457,6 +537,20 @@ HTMLSelectorPredicateGen isRootPredicate(void)
457537
return nil;
458538
}
459539

540+
//TODO: Handle nested brackets more robustly.
541+
// Get the nested interior if there are nested brackets ..(..(..)..)..
542+
// Count the number of ( between the first ( and the first ) and scan the same number of )
543+
NSInteger times = [[interior componentsSeparatedByString:@"("] count]-1;
544+
while (times > 0 && scanner.scanLocation)
545+
{
546+
[scanner scanString:@")" intoString:nil];
547+
NSString *interior2 = nil;
548+
[scanner scanUpToString:@")" intoString:&interior2];
549+
interior = [interior stringByAppendingString:@")"];
550+
interior = interior2 != nil ? [interior stringByAppendingString:interior2] : interior;
551+
times--;
552+
}
553+
460554
[scanner scanString:@")" intoString:nil];
461555
return interior;
462556
}
@@ -541,11 +635,51 @@ static HTMLSelectorPredicateGen scanPredicateFromPseudoClass(NSScanner *scanner,
541635
return isNthChildOfTypePredicate(nth, typePredicate, YES);
542636
}
543637
}
638+
else if ([pseudo isEqualToString:@"contains"]){
639+
NSString *interior = scanFunctionInterior(scanner, error);
640+
641+
if (!interior) return nil;
642+
NSString *str = [interior stringByTrimmingCharactersInSet:[NSCharacterSet characterSetWithCharactersInString:@"'"]];
643+
return isContainsStringPredicate(str);
644+
}
544645
else if ([pseudo isEqualToString:@"not"]) {
545646
NSString *toNegateString = scanFunctionInterior(scanner, error);
546647
HTMLSelectorPredicate toNegate = SelectorFunctionForString(toNegateString, error);
547648
return negatePredicate(toNegate);
548649
}
650+
else if ([pseudo isEqualToString:@"has"]) {
651+
NSString *interior = scanFunctionInterior(scanner, error);
652+
if (!interior) return nil;
653+
HTMLSelectorPredicate insideHas = SelectorFunctionForString(interior, error);
654+
655+
return hasPredicate(insideHas);
656+
}
657+
else if ([pseudo isEqualToString:@"after"] || [pseudo isEqualToString:@"before"]) {
658+
NSString *interior = scanFunctionInterior(scanner, error);
659+
if (!interior) return nil;
660+
HTMLSelectorPredicate insideAfter = SelectorFunctionForString(interior, error);
661+
662+
return isAfterTagPredicate(insideAfter);
663+
}
664+
else if ([pseudo isEqualToString:@"between"]) {
665+
NSString *interior = scanFunctionInterior(scanner, error);
666+
if (!interior) return nil;
667+
NSCharacterSet *whitespace = [NSCharacterSet whitespaceAndNewlineCharacterSet];
668+
NSArray *valueSplit = [[interior stringByTrimmingCharactersInSet:whitespace] componentsSeparatedByCharactersInSet:[NSCharacterSet characterSetWithCharactersInString:@";"]];
669+
670+
if (valueSplit.count != 2)
671+
{
672+
673+
NSLog(@"INFO: Number of strings after split by ';' is not 2 : %lu", (unsigned long)valueSplit.count);
674+
return neverPredicate();
675+
}
676+
677+
HTMLSelectorPredicate left = SelectorFunctionForString(valueSplit[0] , error);
678+
679+
HTMLSelectorPredicate right = SelectorFunctionForString(valueSplit[1] , error);
680+
681+
return isBetweenTagPredicate(left, right);
682+
}
549683

550684
*error = ParseError(@"Unrecognized pseudo class", scanner.string, scanner.scanLocation);
551685
return nil;
@@ -577,7 +711,7 @@ static HTMLSelectorPredicateGen scanPredicateFromPseudoClass(NSScanner *scanner,
577711
static dispatch_once_t onceToken;
578712
dispatch_once(&onceToken, ^{
579713
// Combinators are: whitespace, "greater-than sign" (U+003E, >), "plus sign" (U+002B, +) and "tilde" (U+007E, ~)
580-
NSMutableCharacterSet *set = [NSMutableCharacterSet characterSetWithCharactersInString:@">+~"];
714+
NSMutableCharacterSet *set = [NSMutableCharacterSet characterSetWithCharactersInString:@",>+~"];
581715
[set formUnionWithCharacterSet:HTMLSelectorWhitespaceCharacterSet()];
582716
frozenSet = [set copy];
583717
});
@@ -733,7 +867,9 @@ HTMLSelectorPredicateGen scanPredicate(NSScanner *scanner, HTMLSelectorPredicate
733867
//Whitespace combinator
734868
//y descendant of an x
735869
return descendantOfPredicate(inputPredicate);
736-
} else if ([combinator isEqualToString:@">"]) {
870+
} else if ([combinator isEqualToString:@","]) {
871+
return eitherCombinatorPredicate(inputPredicate, ScanSelectorPredicate(scanner, error));
872+
} else if ([combinator isEqualToString:@">"]) {
737873
return childOfOtherPredicatePredicate(inputPredicate);
738874
} else if ([combinator isEqualToString:@"+"]) {
739875
return adjacentSiblingPredicate(inputPredicate);
@@ -750,6 +886,20 @@ HTMLSelectorPredicateGen scanPredicate(NSScanner *scanner, HTMLSelectorPredicate
750886
}
751887
}
752888

889+
/// Scans predicate parts from `scanner` and folds them into a single predicate.
///
/// Each call to `scanPredicate` receives the predicate built so far and returns
/// the combined result; scanning stops when a part yields nil, the scanner
/// reaches the end of its string, or an error has been reported.
///
/// @param scanner Scanner positioned at the start of the selector text to parse.
/// @param error   Receives the parse error, if any. May be NULL.
/// @return The combined predicate, or nil when scanning failed.
static HTMLSelectorPredicate ScanSelectorPredicate(NSScanner *scanner, NSError **error)
{
    // The loop condition and the assertion below dereference `error`, so give
    // a NULL out-parameter somewhere to land instead of crashing.
    __autoreleasing NSError *localError = nil;
    if (!error) error = &localError;

    // Scan out predicate parts and combine them
    HTMLSelectorPredicate lastPredicate = nil;

    do {
        lastPredicate = scanPredicate(scanner, lastPredicate, error);
    } while (lastPredicate && ![scanner isAtEnd] && !*error);

    NSCAssert(lastPredicate || *error, @"Need either a predicate or error at this point");

    return lastPredicate;
}
902+
753903
static HTMLSelectorPredicate SelectorFunctionForString(NSString *selectorString, NSError **error)
754904
{
755905
//Trim non-functional whitespace
@@ -765,16 +915,7 @@ static HTMLSelectorPredicate SelectorFunctionForString(NSString *selectorString,
765915
scanner.caseSensitive = NO; //Section 3 states that in HTML parsing, selectors are case-insensitive
766916
scanner.charactersToBeSkipped = nil;
767917

768-
//Scan out predicate parts and combine them
769-
HTMLSelectorPredicate lastPredicate = nil;
770-
771-
do{
772-
lastPredicate = scanPredicate(scanner, lastPredicate, error);
773-
} while (lastPredicate && ![scanner isAtEnd] && !*error);
774-
775-
NSCAssert(lastPredicate || *error, @"Need either a predicate or error at this point");
776-
777-
return lastPredicate;
918+
return ScanSelectorPredicate(scanner, error);
778919
}
779920

780921
@interface HTMLSelector ()
@@ -858,12 +999,87 @@ - (HTMLElement *)firstNodeMatchingParsedSelector:(HTMLSelector *)selector
858999

8591000
for (HTMLElement *node in self.treeEnumerator) {
8601001
if ([node isKindOfClass:[HTMLElement class]] && [selector matchesElement:node]) {
1002+
//Return children before the predicate inside before(...)
1003+
if([selector.string containsString:@":before("]) {
1004+
NSString *interior = [self stringBetweenString:@":before(" andString:@")" withString:selector.string];
1005+
NSError *error;
1006+
HTMLSelectorPredicate predicate = SelectorFunctionForString(interior, &error);
1007+
HTMLElement *mutableNode = [node copy];
1008+
for (HTMLNode *mNode in node.children) {
1009+
if ([mNode isKindOfClass:[HTMLElement class]] && predicate((HTMLElement *)mNode)) {
1010+
break;
1011+
}
1012+
[mutableNode.mutableChildren addObject:mNode];
1013+
}
1014+
return mutableNode;
1015+
}
1016+
1017+
//Return children after the predicate inside after(...)
1018+
if([selector.string containsString:@":after("]) {
1019+
NSString *interior = [self stringBetweenString:@":after(" andString:@")" withString:selector.string];
1020+
NSError *error;
1021+
HTMLSelectorPredicate predicate = SelectorFunctionForString(interior, &error);
1022+
HTMLElement *mutableNode = [node copy];
1023+
BOOL shouldAdd = NO;
1024+
for (HTMLNode *mNode in node.children) {
1025+
if (shouldAdd) {
1026+
[mutableNode.mutableChildren addObject:mNode];
1027+
continue;
1028+
}
1029+
if ([mNode isKindOfClass:[HTMLElement class]] && predicate((HTMLElement *)mNode)) {
1030+
shouldAdd = YES;
1031+
}
1032+
}
1033+
return mutableNode;
1034+
}
1035+
1036+
//Return children between the predicate inside between(...)
1037+
if([selector.string containsString:@":between("]) {
1038+
NSString *interior = [self stringBetweenString:@":between(" andString:@")" withString:selector.string];
1039+
NSError *error;
1040+
1041+
NSCharacterSet *whitespace = [NSCharacterSet whitespaceAndNewlineCharacterSet];
1042+
NSArray *valueSplit = [[interior stringByTrimmingCharactersInSet:whitespace] componentsSeparatedByCharactersInSet:[NSCharacterSet characterSetWithCharactersInString:@";"]];
1043+
1044+
HTMLSelectorPredicate left = SelectorFunctionForString(valueSplit[0] , &error);
1045+
HTMLSelectorPredicate right = SelectorFunctionForString(valueSplit[1] , &error);
1046+
HTMLElement *mutableNode = [node copy];
1047+
BOOL shouldAdd = NO;
1048+
for (HTMLNode *mNode in node.children) {
1049+
if (shouldAdd && [mNode isKindOfClass:[HTMLElement class]] && right((HTMLElement *)mNode)) {
1050+
break;
1051+
}
1052+
if (shouldAdd) {
1053+
[mutableNode.mutableChildren addObject:mNode];
1054+
continue;
1055+
}
1056+
if ([mNode isKindOfClass:[HTMLElement class]] && left((HTMLElement *)mNode)) {
1057+
shouldAdd = YES;
1058+
}
1059+
}
1060+
return mutableNode;
1061+
}
1062+
8611063
return node;
8621064
}
8631065
}
8641066
return nil;
8651067
}
8661068

1069+
/// Extracts the text of `str` that follows the first occurrence of `start`,
/// up to the next occurrence of `end`.
///
/// @param start Opening delimiter to search for.
/// @param end   Closing delimiter that terminates the extracted text.
/// @param str   The string to search.
/// @return The scanned text, or nil when `start` is not found or when no
///         characters are scanned after it.
- (NSString *)stringBetweenString:(NSString *)start andString:(NSString *)end withString:(NSString *)str
{
    NSScanner *reader = [NSScanner scannerWithString:str];
    [reader setCharactersToBeSkipped:nil];

    // Move up to the opening delimiter. The result is ignored on purpose:
    // nothing is scanned when `start` sits at the very beginning of `str`.
    [reader scanUpToString:start intoString:NULL];

    if (![reader scanString:start intoString:NULL]) {
        return nil;
    }

    NSString *between = nil;
    BOOL scannedSomething = [reader scanUpToString:end intoString:&between];
    return scannedSomething ? between : nil;
}
1082+
8671083
@end
8681084

8691085
HTMLNthExpression HTMLNthExpressionMake(NSInteger n, NSInteger c)

0 commit comments

Comments
 (0)