I want to strip image URLs from lots of differently formed HTML.
I have this already:
// Captures the text between the quotes of an `img src="..."` attribute.
// The look-behind is the literal `img src="`, so it only matches when `src`
// directly follows `img` (separated by a single space).
NSString *srcValuePattern = @"(?<=img src=\").*?(?=\")";
NSRegularExpression *regex =
    [[NSRegularExpression alloc] initWithPattern:srcValuePattern
                                         options:NSRegularExpressionCaseInsensitive
                                           error:nil];
This works fine if the HTML is formed like <img src="someurl.jpg" alt="" .../>,
but this isn't always the case: sometimes there are other attributes before src,
which it doesn't pick up.
It's a difficult thing to do with regular expressions. You are generally better off using an XMLParser and XPath. However, if the HTML isn't very valid (even if you use TidyHTML), you can find that XPath just won't work very well.
If you must look for images using regular expressions, I would suggest something like:
<\\s*?img\\s+[^>]*?\\s*src\\s*=\\s*([\"\'])((\\\\?+.)*?)\\1[^>]*?>
So assuming you have rawHTML in a string with the same name, use:
// Match every <img ...> tag that carries a quoted src attribute, regardless of
// which other attributes precede it inside the tag.
NSString *imgTagPattern = @"<\\s*?img\\s+[^>]*?\\s*src\\s*=\\s*([\"\'])((\\\\?+.)*?)\\1[^>]*?>";
NSRegularExpression *regex =
    [[NSRegularExpression alloc] initWithPattern:imgTagPattern
                                         options:NSRegularExpressionCaseInsensitive
                                           error:nil];
// Search the entire document.
NSRange wholeDocument = NSMakeRange(0, [rawHTML length]);
NSArray *imagesHTML = [regex matchesInString:rawHTML options:0 range:wholeDocument];
[regex release];
If you want to get out the actual image URL from the source then I'd use something like (run over the output from previous regex):
(?i)\\b((?:[a-z][\\w-]+:(?:/{1,3}|[a-z0-9%])|www\\d{0,3}[.]|[a-z0-9.\\-]+[.][a-z]{2,4}/)(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’]))
Yeah, I know, crazy! But you did ask :-)
Credit: That final regex is from John Gruber/Daring Fireball.
This is some code I've written in the past that returns an array of NSURL objects, one per image found. I use it when trying (as a last resort) to get image URLs from very broken HTML:
/**
 * Extracts image URLs from raw (possibly malformed) HTML using regular expressions.
 *
 * Every <img ...> tag that has a quoted src attribute is located first. Within
 * each tag, the first URL that starts with "http" and contains a ".jpg", ".jpeg"
 * or ".png" extension (case-insensitive) is selected. Anything following the
 * last occurrence of that extension (query string, fragment, ...) is cut off.
 *
 * @param rawHTML The HTML to scan. May be nil or empty.
 * @return An autoreleased array of NSURL objects, at most one per <img> tag;
 *         empty when nothing suitable is found.
 */
- (NSArray *)extractSuitableImagesFromRawHTMLEntry:(NSString *)rawHTML {
    NSMutableArray *images = [[NSMutableArray alloc] init];

    // Messaging nil returns 0, so this covers both nil and the empty string.
    if ([rawHTML length] == 0) {
        return [images autorelease];
    }

    // Matches a whole <img ...> tag containing a quoted src attribute.
    NSRegularExpression *tagRegex = [[NSRegularExpression alloc] initWithPattern:@"<\\s*?img\\s+[^>]*?\\s*src\\s*=\\s*([\"\'])((\\\\?+.)*?)\\1[^>]*?>" options:NSRegularExpressionCaseInsensitive error:nil];

    // Matches URLs inside a tag (John Gruber's URL pattern). Compiled once here
    // rather than once per tag, because it does not depend on the tag.
    NSRegularExpression *urlRegex = [[NSRegularExpression alloc] initWithPattern:@"(?i)\\b((?:[a-z][\\w-]+:(?:/{1,3}|[a-z0-9%])|www\\d{0,3}[.]|[a-z0-9.\\-]+[.][a-z]{2,4}/)(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’]))" options:NSRegularExpressionCaseInsensitive error:nil];

    // Extensions are tried in this order for each candidate URL.
    NSArray *imageExtensions = [NSArray arrayWithObjects:@".jpg", @".jpeg", @".png", nil];

    // Option flags are a bitmask and must be combined with bitwise OR. A logical
    // && collapses them to 1, which silently drops NSBackwardsSearch.
    NSStringCompareOptions searchOptions = NSBackwardsSearch | NSCaseInsensitiveSearch;

    NSArray *tagMatches = [tagRegex matchesInString:rawHTML
                                            options:0
                                              range:NSMakeRange(0, [rawHTML length])];

    for (NSTextCheckingResult *tagMatch in tagMatches) {
        NSString *tagHTML = [rawHTML substringWithRange:tagMatch.range];
        NSArray *urlMatches = [urlRegex matchesInString:tagHTML
                                                options:0
                                                  range:NSMakeRange(0, [tagHTML length])];

        NSString *imageSourceURLString = nil;
        for (NSTextCheckingResult *urlMatch in urlMatches) {
            NSString *candidate = [tagHTML substringWithRange:urlMatch.range];
            if (![candidate hasPrefix:@"http"]) {
                continue;
            }

            // Strip off anything after the file extension.
            for (NSString *extension in imageExtensions) {
                NSRange found = [candidate rangeOfString:extension options:searchOptions];
                if (found.location != NSNotFound) {
                    imageSourceURLString = [candidate substringToIndex:NSMaxRange(found)];
                    break;
                }
            }

            // The first URL with a recognised extension wins. URLs without one
            // are skipped, so later attributes in the same tag still get a chance.
            if (imageSourceURLString != nil) {
                break;
            }
        }

        if (imageSourceURLString == nil) {
            //DebugLog(@"No image found.");
            continue;
        }

        DebugLog(@"*** image found: %@", imageSourceURLString);
        // URLWithString: returns nil for strings it cannot parse; skip those.
        NSURL *imageURL = [NSURL URLWithString:imageSourceURLString];
        if (imageURL != nil) {
            [images addObject:imageURL];
        }
    }

    [tagRegex release];
    [urlRegex release];
    return [images autorelease];
}
精彩评论