iphone - italiano - ios html app



Decodifica di caratteri HTML in Objective-C/Cocoa Touch (9)

A partire da iOS 7, è possibile decodificare i caratteri HTML in modo nativo utilizzando un NSAttributedString con l'attributo NSHTMLTextDocumentType :

NSString *htmlString = @" & & < > ™ © ♥ ♣ ♠ ♦";
NSData *stringData = [htmlString dataUsingEncoding:NSUTF8StringEncoding];

NSDictionary *options = @{NSDocumentTypeDocumentAttribute:NSHTMLTextDocumentType};
NSAttributedString *decodedString;
decodedString = [[NSAttributedString alloc] initWithData:stringData
                                                 options:options
                                      documentAttributes:NULL
                                                   error:NULL];

La stringa attribuita decodificata verrà ora visualizzata come:  & & <> ™ © ♥ ♣ ♠ ♦.

Nota: funziona solo se chiamato sul thread principale.

https://ffff65535.com

Prima di tutto, ho trovato questo: , ma non funziona per me.

I miei caratteri codificati (provengono da un feed RSS, btw) assomigliano a questo: &#038;

Ho cercato su tutta la rete e ho trovato discussioni correlate, ma nessuna correzione per la mia particolare codifica, penso che siano chiamate caratteri esadecimali.


Come se avessi bisogno di un'altra soluzione! Questo è abbastanza semplice e abbastanza efficace:

@interface NSString (NSStringCategory)
- (NSString *) stringByReplacingISO8859Codes;
@end


@implementation NSString (NSStringCategory)
- (NSString *) stringByReplacingISO8859Codes
{
    NSString *dataString = self;
    do {
        //*** See if string contains &# prefix
        NSRange range = [dataString rangeOfString: @"&#" options: NSRegularExpressionSearch];
        if (range.location == NSNotFound) {
            break;
        }
        //*** Get the next three charaters after the prefix
        NSString *isoHex = [dataString substringWithRange: NSMakeRange(range.location + 2, 3)];
        //*** Create the full code for replacement
        NSString *isoString = [NSString stringWithFormat: @"&#%@;", isoHex];
        //*** Convert to decimal integer
        unsigned decimal = 0;
        NSScanner *scanner = [NSScanner scannerWithString: [NSString stringWithFormat: @"0%@", isoHex]];
        [scanner scanHexInt: &decimal];
        //*** Use decimal code to get unicode character
        NSString *unicode = [NSString stringWithFormat:@"%C", decimal];
        //*** Replace all occurences of this code in the string
        dataString = [dataString stringByReplacingOccurrencesOfString: isoString withString: unicode];
    } while (TRUE); //*** Loop until we hit the NSNotFound

    return dataString;
}
@end

Dovrei postare questo su GitHub o qualcosa del genere. Questo va in una categoria di NSString, utilizza NSScanner per l'implementazione e gestisce sia le entità di carattere numerico esadecimale che quelle numeriche decimali, nonché le solite simboliche.

Inoltre, gestisce le stringhe malformate (quando si ha una sequenza di caratteri non seguita da una sequenza non valida) con relativa grazia, che si è rivelata fondamentale nella mia app rilasciata che utilizza questo codice.

- (NSString *)stringByDecodingXMLEntities {
    NSUInteger myLength = [self length];
    NSUInteger ampIndex = [self rangeOfString:@"&" options:NSLiteralSearch].location;

    // Short-circuit if there are no ampersands.
    if (ampIndex == NSNotFound) {
        return self;
    }
    // Make result string with some extra capacity.
    NSMutableString *result = [NSMutableString stringWithCapacity:(myLength * 1.25)];

    // First iteration doesn't need to scan to & since we did that already, but for code simplicity's sake we'll do it again with the scanner.
    NSScanner *scanner = [NSScanner scannerWithString:self];
    do {
        // Scan up to the next entity or the end of the string.
        NSString *nonEntityString;
        if ([scanner scanUpToString:@"&" intoString:&nonEntityString]) {
            [result appendString:nonEntityString];
        }
        if ([scanner isAtEnd]) {
            goto finish;
        }
        // Scan either a HTML or numeric character entity reference.
        if ([scanner scanString:@"&amp;" intoString:NULL])
            [result appendString:@"&"];
        else if ([scanner scanString:@"&apos;" intoString:NULL])
            [result appendString:@"'"];
        else if ([scanner scanString:@"&quot;" intoString:NULL])
            [result appendString:@"\""];
        else if ([scanner scanString:@"&lt;" intoString:NULL])
            [result appendString:@"<"];
        else if ([scanner scanString:@"&gt;" intoString:NULL])
            [result appendString:@">"];
        else if ([scanner scanString:@"&#" intoString:NULL]) {
            BOOL gotNumber;
            unsigned charCode;
            NSString *xForHex = @"";

            // Is it hex or decimal?
            if ([scanner scanString:@"x" intoString:&xForHex]) {
                gotNumber = [scanner scanHexInt:&charCode];
            }
            else {
                gotNumber = [scanner scanInt:(int*)&charCode];
            }
            if (gotNumber) {
                [result appendFormat:@"%C", charCode];
            }
            else {
                NSString *unknownEntity = @"";
                [scanner scanUpToString:@";" intoString:&unknownEntity];
                [result appendFormat:@"&#%@%@;", xForHex, unknownEntity];
                NSLog(@"Expected numeric character entity but got &#%@%@;", xForHex, unknownEntity);
            }
            [scanner scanString:@";" intoString:NULL];
        }
        else {
            NSString *unknownEntity = @"";
            [scanner scanUpToString:@";" intoString:&unknownEntity];
            NSString *semicolon = @"";
            [scanner scanString:@";" intoString:&semicolon];
            [result appendFormat:@"%@%@", unknownEntity, semicolon];
            NSLog(@"Unsupported XML character entity %@%@", unknownEntity, semicolon);
        }
    }
    while (![scanner isAtEnd]);

finish:
    return result;
}

Ecco una versione di Swift della risposta di Walty Yeung :

extension String {
    static private let mappings = ["&quot;" : "\"","&amp;" : "&", "&lt;" : "<", "&gt;" : ">","&nbsp;" : " ","&iexcl;" : "¡","&cent;" : "¢","&pound;" : " £","&curren;" : "¤","&yen;" : "¥","&brvbar;" : "¦","&sect;" : "§","&uml;" : "¨","&copy;" : "©","&ordf;" : " ª","&laquo" : "«","&not" : "¬","&reg" : "®","&macr" : "¯","&deg" : "°","&plusmn" : "±","&sup2; " : "²","&sup3" : "³","&acute" : "´","&micro" : "µ","&para" : "¶","&middot" : "·","&cedil" : "¸","&sup1" : "¹","&ordm" : "º","&raquo" : "»&","frac14" : "¼","&frac12" : "½","&frac34" : "¾","&iquest" : "¿","&times" : "×","&divide" : "÷","&ETH" : "Ð","&eth" : "ð","&THORN" : "Þ","&thorn" : "þ","&AElig" : "Æ","&aelig" : "æ","&OElig" : "Œ","&oelig" : "œ","&Aring" : "Å","&Oslash" : "Ø","&Ccedil" : "Ç","&ccedil" : "ç","&szlig" : "ß","&Ntilde;" : "Ñ","&ntilde;":"ñ",]

    func stringByDecodingXMLEntities() -> String {

        guard let _ = self.rangeOfString("&", options: [.LiteralSearch]) else {
            return self
        }

        var result = ""

        let scanner = NSScanner(string: self)
        scanner.charactersToBeSkipped = nil

        let boundaryCharacterSet = NSCharacterSet(charactersInString: " \t\n\r;")

        repeat {
            var nonEntityString: NSString? = nil

            if scanner.scanUpToString("&", intoString: &nonEntityString) {
                if let s = nonEntityString as? String {
                    result.appendContentsOf(s)
                }
            }

            if scanner.atEnd {
                break
            }

            var didBreak = false
            for (k,v) in String.mappings {
                if scanner.scanString(k, intoString: nil) {
                    result.appendContentsOf(v)
                    didBreak = true
                    break
                }
            }

            if !didBreak {

                if scanner.scanString("&#", intoString: nil) {

                    var gotNumber = false
                    var charCodeUInt: UInt32 = 0
                    var charCodeInt: Int32 = -1
                    var xForHex: NSString? = nil

                    if scanner.scanString("x", intoString: &xForHex) {
                        gotNumber = scanner.scanHexInt(&charCodeUInt)
                    }
                    else {
                        gotNumber = scanner.scanInt(&charCodeInt)
                    }

                    if gotNumber {
                        let newChar = String(format: "%C", (charCodeInt > -1) ? charCodeInt : charCodeUInt)
                        result.appendContentsOf(newChar)
                        scanner.scanString(";", intoString: nil)
                    }
                    else {
                        var unknownEntity: NSString? = nil
                        scanner.scanUpToCharactersFromSet(boundaryCharacterSet, intoString: &unknownEntity)
                        let h = xForHex ?? ""
                        let u = unknownEntity ?? ""
                        result.appendContentsOf("&#\(h)\(u)")
                    }
                }
                else {
                    scanner.scanString("&", intoString: nil)
                    result.appendContentsOf("&")
                }
            }

        } while (!scanner.atEnd)

        return result
    }
}

Nessuno sembra menzionare una delle opzioni più semplici: Google Toolbox per Mac
(Nonostante il nome, funziona anche su iOS.)

https://github.com/google/google-toolbox-for-mac/blob/master/Foundation/GTMNSString%2BHTML.h

/// Get a string where internal characters that are escaped for HTML are unescaped 
//
///  For example, '&amp;' becomes '&'
///  Handles &#32; and &#x32; cases as well
///
//  Returns:
//    Autoreleased NSString
//
- (NSString *)gtm_stringByUnescapingFromHTML;

E ho dovuto includere solo tre file nel progetto: header, implementation e GTMDefines.h .


Quello di Daniel è fondamentalmente molto carino, e ho risolto alcuni problemi lì:

  1. rimosso il carattere di salto per NSSCanner (altrimenti gli spazi tra due entità continue verrebbero ignorati

    [scanner setCharactersToBeSkipped: nil];

  2. corretto l'analisi quando ci sono simboli '&' isolati (non sono sicuro di quale sia l'output 'corretto' per questo, l'ho appena confrontato con firefox):

per esempio

    &#ABC DF & B&#39;  & C&#39; Items (288)

ecco il codice modificato:

- (NSString *)stringByDecodingXMLEntities {
    NSUInteger myLength = [self length];
    NSUInteger ampIndex = [self rangeOfString:@"&" options:NSLiteralSearch].location;

    // Short-circuit if there are no ampersands.
    if (ampIndex == NSNotFound) {
        return self;
    }
    // Make result string with some extra capacity.
    NSMutableString *result = [NSMutableString stringWithCapacity:(myLength * 1.25)];

    // First iteration doesn't need to scan to & since we did that already, but for code simplicity's sake we'll do it again with the scanner.
    NSScanner *scanner = [NSScanner scannerWithString:self];

    [scanner setCharactersToBeSkipped:nil];

    NSCharacterSet *boundaryCharacterSet = [NSCharacterSet characterSetWithCharactersInString:@" \t\n\r;"];

    do {
        // Scan up to the next entity or the end of the string.
        NSString *nonEntityString;
        if ([scanner scanUpToString:@"&" intoString:&nonEntityString]) {
            [result appendString:nonEntityString];
        }
        if ([scanner isAtEnd]) {
            goto finish;
        }
        // Scan either a HTML or numeric character entity reference.
        if ([scanner scanString:@"&amp;" intoString:NULL])
            [result appendString:@"&"];
        else if ([scanner scanString:@"&apos;" intoString:NULL])
            [result appendString:@"'"];
        else if ([scanner scanString:@"&quot;" intoString:NULL])
            [result appendString:@"\""];
        else if ([scanner scanString:@"&lt;" intoString:NULL])
            [result appendString:@"<"];
        else if ([scanner scanString:@"&gt;" intoString:NULL])
            [result appendString:@">"];
        else if ([scanner scanString:@"&#" intoString:NULL]) {
            BOOL gotNumber;
            unsigned charCode;
            NSString *xForHex = @"";

            // Is it hex or decimal?
            if ([scanner scanString:@"x" intoString:&xForHex]) {
                gotNumber = [scanner scanHexInt:&charCode];
            }
            else {
                gotNumber = [scanner scanInt:(int*)&charCode];
            }

            if (gotNumber) {
                [result appendFormat:@"%C", (unichar)charCode];

                [scanner scanString:@";" intoString:NULL];
            }
            else {
                NSString *unknownEntity = @"";

                [scanner scanUpToCharactersFromSet:boundaryCharacterSet intoString:&unknownEntity];


                [result appendFormat:@"&#%@%@", xForHex, unknownEntity];

                //[scanner scanUpToString:@";" intoString:&unknownEntity];
                //[result appendFormat:@"&#%@%@;", xForHex, unknownEntity];
                NSLog(@"Expected numeric character entity but got &#%@%@;", xForHex, unknownEntity);

            }

        }
        else {
            NSString *amp;

            [scanner scanString:@"&" intoString:&amp];  //an isolated & symbol
            [result appendString:amp];

            /*
            NSString *unknownEntity = @"";
            [scanner scanUpToString:@";" intoString:&unknownEntity];
            NSString *semicolon = @"";
            [scanner scanString:@";" intoString:&semicolon];
            [result appendFormat:@"%@%@", unknownEntity, semicolon];
            NSLog(@"Unsupported XML character entity %@%@", unknownEntity, semicolon);
             */
        }

    }
    while (![scanner isAtEnd]);

finish:
    return result;
}

Questo è il modo in cui lo faccio utilizzando il framework RegexKitLite :

-(NSString*) decodeHtmlUnicodeCharacters: (NSString*) html {
NSString* result = [html copy];
NSArray* matches = [result arrayOfCaptureComponentsMatchedByRegex: @"\\&#([\\d]+);"];

if (![matches count]) 
    return result;

for (int i=0; i<[matches count]; i++) {
    NSArray* array = [matches objectAtIndex: i];
    NSString* charCode = [array objectAtIndex: 1];
    int code = [charCode intValue];
    NSString* character = [NSString stringWithFormat:@"%C", code];
    result = [result stringByReplacingOccurrencesOfString: [array objectAtIndex: 0]
                                               withString: character];      
}   
return result;  

}

Spero che questo aiuti qualcuno.


Se si ha il riferimento di entità di carattere come una stringa, ad esempio @"2318" , è possibile estrarre una NSString ricodificata con il carattere Unicode corretto utilizzando strtoul ;

NSString *unicodePoint = @"2318"
unichar iconChar = (unichar) strtoul(unicodePoint.UTF8String, NULL, 16);
NSString *recoded = [NSString stringWithFormat:@"%C", iconChar];
NSLog(@"recoded: %@", recoded");
// prints out "recoded: ⌘"

Versione Swift 3 della risposta di Jugale

extension String {
    static private let mappings = ["&quot;" : "\"","&amp;" : "&", "&lt;" : "<", "&gt;" : ">","&nbsp;" : " ","&iexcl;" : "¡","&cent;" : "¢","&pound;" : " £","&curren;" : "¤","&yen;" : "¥","&brvbar;" : "¦","&sect;" : "§","&uml;" : "¨","&copy;" : "©","&ordf;" : " ª","&laquo" : "«","&not" : "¬","&reg" : "®","&macr" : "¯","&deg" : "°","&plusmn" : "±","&sup2; " : "²","&sup3" : "³","&acute" : "´","&micro" : "µ","&para" : "¶","&middot" : "·","&cedil" : "¸","&sup1" : "¹","&ordm" : "º","&raquo" : "»&","frac14" : "¼","&frac12" : "½","&frac34" : "¾","&iquest" : "¿","&times" : "×","&divide" : "÷","&ETH" : "Ð","&eth" : "ð","&THORN" : "Þ","&thorn" : "þ","&AElig" : "Æ","&aelig" : "æ","&OElig" : "Œ","&oelig" : "œ","&Aring" : "Å","&Oslash" : "Ø","&Ccedil" : "Ç","&ccedil" : "ç","&szlig" : "ß","&Ntilde;" : "Ñ","&ntilde;":"ñ",]

    func stringByDecodingXMLEntities() -> String {

        guard let _ = self.range(of: "&", options: [.literal]) else {
            return self
        }

        var result = ""

        let scanner = Scanner(string: self)
        scanner.charactersToBeSkipped = nil

        let boundaryCharacterSet = CharacterSet(charactersIn: " \t\n\r;")

        repeat {
            var nonEntityString: NSString? = nil

            if scanner.scanUpTo("&", into: &nonEntityString) {
                if let s = nonEntityString as? String {
                    result.append(s)
                }
            }

            if scanner.isAtEnd {
                break
            }

            var didBreak = false
            for (k,v) in String.mappings {
                if scanner.scanString(k, into: nil) {
                    result.append(v)
                    didBreak = true
                    break
                }
            }

            if !didBreak {

                if scanner.scanString("&#", into: nil) {

                    var gotNumber = false
                    var charCodeUInt: UInt32 = 0
                    var charCodeInt: Int32 = -1
                    var xForHex: NSString? = nil

                    if scanner.scanString("x", into: &xForHex) {
                        gotNumber = scanner.scanHexInt32(&charCodeUInt)
                    }
                    else {
                        gotNumber = scanner.scanInt32(&charCodeInt)
                    }

                    if gotNumber {
                        let newChar = String(format: "%C", (charCodeInt > -1) ? charCodeInt : charCodeUInt)
                        result.append(newChar)
                        scanner.scanString(";", into: nil)
                    }
                    else {
                        var unknownEntity: NSString? = nil
                        scanner.scanUpToCharacters(from: boundaryCharacterSet, into: &unknownEntity)
                        let h = xForHex ?? ""
                        let u = unknownEntity ?? ""
                        result.append("&#\(h)\(u)")
                    }
                }
                else {
                    scanner.scanString("&", into: nil)
                    result.append("&")
                }
            }

        } while (!scanner.isAtEnd)

        return result
    }
}




cocoa-touch