skip to Main Content

I am using some XS modules that are expecting latin1 string data (and ignoring perl’s UTF8 flag). In some cases, I am passing the result of JSON decoding, which should only include latin1 characters, but in some cases has them escaped (e.g. ["cou00f6perative"]).

Is there a JSON decoding module that offers an option to return strings downgraded (at least where possible)? I’m not finding such an option in JSON, JSON::XS, or Cpanel::JSON::XS.

use strict;
use warnings;
use Cpanel::JSON::XS;
use Devel::Peek;
my $got = Cpanel::JSON::XS->new->decode('["cou00f6perative"]')->[0];
Dump $got;
my $wanted = $got;
utf8::downgrade($wanted);
Dump $wanted;

output:

SV = PV(0xd6cbf0) at 0xd8a460
  REFCNT = 1
  FLAGS = (POK,IsCOW,pPOK,UTF8)
  PV = 0xd83b40 "co303266perative" [UTF8 "cox{f6}perative"]
  CUR = 12
  LEN = 14
  COW_REFCNT = 0
SV = PV(0xd6cb20) at 0xd977f0
  REFCNT = 1
  FLAGS = (POK,pPOK)
  PV = 0xe0d120 "co366perative"
  CUR = 11
  LEN = 14

2

Answers


  1. The safest approach is to fix the data structure after the fact.

    # The following apply to each of decode_struct_inplace, encode_struct_inplace, downgrade_struct_inplace and upgrade_struct_inplace:
    # - Errors are silently ignored. The scalar is left unchanged.
    # - Recognizes references to arrays, hashes and scalars. More esoteric references won't processed, and a warning will be issued.
    # - Overloaded objects and magical variables are not supported. They may induce incorrect behaviour.
    # - The structure is changed in-place. You can use Storable::dclone to make a copy first if need be.
    # - For convenience, returns its argument.
    
    # Decodes all strings in a data structure from UTF-8 to Unicode Code Points.
    sub decode_struct_inplace { _convert_struct_inplace($_[0], &utf8::decode) }
    
    # Encodes all strings in a data structure from Unicode Code Points to UTF-8.
    sub encode_struct_inplace { _convert_struct_inplace($_[0], &utf8::encode) }
    
    # "Downgrades" the string storage format of all scalars containing strings in
    # a data structure to the UTF8=0 format if they aren't already in that format.
    sub downgrade_struct_inplace { _convert_struct_inplace($_[0], &utf8::downgrade) }
    
    # "Upgrades" the string storage format of all scalars containing strings in
    # a data structure to the UTF8=1 format if they aren't already in that format.
    sub upgrade_struct_inplace { _convert_struct_inplace($_[0], &utf8::upgrade) }
    

    sub _convert_struct_inplace {
        # Make $arg an alias to $_[0]. Changes to $arg (like changes to $_[0]) will be reflected in the parent.
        our $arg; local *arg = shift;
        my $converter        =  shift;
    
        my $caller = (caller(1))[3];
        $caller =~ s/^.*:://;    # /
    
        my %seen;    # Only decode each variable once.
        my %warned;  # Only emit each warning once.
    
        # Using "my" would introduce a memory cycle we'd have to work to break to avoid a memory leak.
        local *_visitor = sub {
            # Make $arg an alias to $_[0]. Changes to $arg (like changes to $_[0]) will be reflected in the parent.
            our $arg; local *arg = $_[0];
    
            # Don't decode the same variable twice.
            # Also detects referential loops.
            return $arg if $seen{refaddr($arg)}++;
    
            my $reftype = reftype($arg);
            if (!defined($reftype)) {
                if (defined($arg)) {
                    my $sv = B::svref_2object($arg);  # Meta object.
                    if ($sv->isa('B::PV') && ($sv->FLAGS & B::SVf_POK)) {  # Can it contain a string? And does it?
                        $converter->($arg);
                    }
                }
            }
            elsif ($reftype eq 'ARRAY') {
                _visitor($_) for @$arg;
            }
            elsif ($reftype eq 'HASH') {
                # Usually, we can avoid converting the keys.
                my $ascii = 1;
                for (keys(%$arg)) {
                    if (/[^x00-x7F]/) {
                        $ascii = 0;
                        last;
                    }
                }
    
                if (!$ascii) {
                    %$arg = map {
                            $converter->( my $new_key = $_ );
                            $new_key => $arg->{$_}
                        } keys(%$arg);
                }
    
                _visitor($_) for values(%$arg);
            }
            elsif ($reftype eq 'SCALAR') {
                _visitor($$arg);
            }
            elsif ($reftype eq 'REF') {
                _visitor($$arg);
            }
            else {
                warn("Reference type $reftype not supported by $callern")
                    if !$warned{$reftype}++;
            }
    
            return $arg;
        };
    
        return _visitor($arg);
    }
    

    This is existing code that can be simplified a little since it handles things not present in data structures created by JSON modules.

    Login or Signup to reply.
  2. You could monkey-patch JSON::PP to produce the desired effect.

    use JSON::PP qw( );
    
    use vars qw( $JSON_PP_DOWNGRADE );
    
    BEGIN {
       $JSON_PP_DOWNGRADE //= 0;
       my $old_string = &JSON::PP::string;
       my $new_string = sub {
          my $s = $old_string->(@_);
          utf8::downgrade($s) if $JSON_PP_DOWNGRADE;
          $s
       };
    
       no warnings qw ( redefine );
       *JSON::PP::string = $new_string;
    }
    

    Where you want JSON::PP to produce a “downgraded structure”, add the following before the call to decode:

    local $JSON_PP_DOWNGRADE = 1;
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search