1 /* Coding system handler (conversion, detection, etc).
    2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
    3                  2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
    4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
    5      2005, 2006, 2007, 2008, 2009, 2010
    6      National Institute of Advanced Industrial Science and Technology (AIST)
    7      Registration Number H14PRO021
    8    Copyright (C) 2003
    9      National Institute of Advanced Industrial Science and Technology (AIST)
   10      Registration Number H13PRO009
   11 
   12 This file is part of GNU Emacs.
   13 
   14 GNU Emacs is free software: you can redistribute it and/or modify
   15 it under the terms of the GNU General Public License as published by
   16 the Free Software Foundation, either version 3 of the License, or
   17 (at your option) any later version.
   18 
   19 GNU Emacs is distributed in the hope that it will be useful,
   20 but WITHOUT ANY WARRANTY; without even the implied warranty of
   21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   22 GNU General Public License for more details.
   23 
   24 You should have received a copy of the GNU General Public License
   25 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
   26 
   27 /*** TABLE OF CONTENTS ***
   28 
   29   0. General comments
   30   1. Preamble
   31   2. Emacs' internal format (emacs-utf-8) handlers
   32   3. UTF-8 handlers
   33   4. UTF-16 handlers
   34   5. Charset-base coding systems handlers
   35   6. emacs-mule (old Emacs' internal format) handlers
   36   7. ISO2022 handlers
   37   8. Shift-JIS and BIG5 handlers
   38   9. CCL handlers
   39   10. C library functions
   40   11. Emacs Lisp library functions
   41   12. Postamble
   42 
   43 */
   44 
   45 /*** 0. General comments ***
   46 
   47 
   48 CODING SYSTEM
   49 
   50   A coding system is an object for an encoding mechanism that contains
   51   information about how to convert byte sequences to character
   52   sequences and vice versa.  When we say "decode", it means converting
   53   a byte sequence of a specific coding system into a character
   54   sequence that is represented by Emacs' internal coding system
   55   `emacs-utf-8', and when we say "encode", it means converting a
   56   character sequence of emacs-utf-8 to a byte sequence of a specific
   57   coding system.
   58 
   59   In Emacs Lisp, a coding system is represented by a Lisp symbol.  At
   60   the C level, a coding system is represented by a vector of
   61   attributes stored in the hash table Vcoding_system_hash_table.  The
   62   conversion from coding system symbol to attributes vector is done by
   63   looking up Vcoding_system_hash_table by the symbol.
   64 
   65   Coding systems are classified into the following types depending on
   66   the encoding mechanism.  Here's a brief description of the types.
   67 
   68   o UTF-8
   69 
   70   o UTF-16
   71 
   72   o Charset-base coding system
   73 
   74   A coding system defined by one or more (coded) character sets.
   75   Decoding and encoding are done by a code converter defined for each
   76   character set.
   77 
   78   o Old Emacs internal format (emacs-mule)
   79 
   80   The coding system adopted by old versions of Emacs (20 and 21).
   81 
   82   o ISO2022-base coding system
   83 
   84   The most famous coding system for multiple character sets.  X's
   85   Compound Text, various EUCs (Extended Unix Code), and coding systems
   86   used in the Internet communication such as ISO-2022-JP are all
   87   variants of ISO2022.
   88 
   89   o SJIS (or Shift-JIS or MS-Kanji-Code)
   90 
   91   A coding system to encode character sets: ASCII, JISX0201, and
   92   JISX0208.  Widely used for PC's in Japan.  Details are described in
   93   section 8.
   94 
   95   o BIG5
   96 
   97   A coding system to encode character sets: ASCII and Big5.  Widely
   98   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
   99   described in section 8.  In this file, when we write "big5" (all
  100   lowercase), we mean the coding system, and when we write "Big5"
  101   (capitalized), we mean the character set.
  102 
  103   o CCL
  104 
  105   If a user wants to decode/encode text encoded in a coding system
  106   not listed above, he can supply a decoder and an encoder for it in
  107   CCL (Code Conversion Language) programs.  Emacs executes the CCL
  108   program while decoding/encoding.
  109 
  110   o Raw-text
  111 
  112   A coding system for text containing raw eight-bit data.  Emacs
  113   treats each byte of source text as a character (except for
  114   end-of-line conversion).
  115 
  116   o No-conversion
  117 
  118   Like raw text, but don't do end-of-line conversion.
  119 
  120 
  121 END-OF-LINE FORMAT
  122 
  123   How text end-of-line is encoded depends on operating system.  For
  124   instance, Unix's format is just one byte of LF (line-feed) code,
  125   whereas DOS's format is two-byte sequence of `carriage-return' and
  126   `line-feed' codes.  MacOS's format is usually one byte of
  127   `carriage-return'.
  128 
  129   Since text character encoding and end-of-line encoding are
  130   independent, any coding system described above can take any format
  131   of end-of-line (except for no-conversion).
  132 
  133 STRUCT CODING_SYSTEM
  134 
  135   Before using a coding system for code conversion (i.e. decoding and
  136   encoding), we setup a structure of type `struct coding_system'.
  137   This structure keeps various information about a specific code
  138   conversion (e.g. the location of source and destination data).
  139 
  140 */
  141 
  142 /* COMMON MACROS */
  143 
  144 
  145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
  146 
  147   These functions check if a byte sequence specified as a source in
  148   CODING conforms to the format of XXX, and update the members of
  149   DETECT_INFO.
  150 
  151   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
  152 
  153   Below is the template of these functions.  */
  154 
  155 #if 0
  156 static int
  157 detect_coding_XXX (coding, detect_info)
  158      struct coding_system *coding;
  159      struct coding_detection_info *detect_info;
  160 {
  161   const unsigned char *src = coding->source;
  162   const unsigned char *src_end = coding->source + coding->src_bytes;
  163   int multibytep = coding->src_multibyte;
  164   int consumed_chars = 0;
  165   int found = 0;    /* Category bits backed by positive evidence.  */
  166   ...;
  167 
  168   while (1)
  169     {
  170       /* Get one byte from the source.  If the source is exhausted, jump
  171          to no_more_source:.  */
  172       ONE_MORE_BYTE (c);
  173 
  174       if (! __C_conforms_to_XXX___ (c))
  175         break;
  176       if (__C_strongly_suggests_XXX__ (c))
  177         found = CATEGORY_MASK_XXX;
  178     }
  179   /* The byte sequence is invalid for XXX.  */
  180   detect_info->rejected |= CATEGORY_MASK_XXX;
  181   return 0;
  182 
  183  no_more_source:
  184   /* The source was exhausted without finding an invalid byte.  */
  185   detect_info->found |= found;
  186   return 1;
  187 }
  188 #endif
  189 
  190 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
  191 
  192   These functions decode a byte sequence specified as a source by
  193   CODING.  The resulting multibyte text goes to a place pointed to by
  194   CODING->charbuf, the length of which should not exceed
  195   CODING->charbuf_size.
  196 
  197   These functions set the information of original and decoded texts in
  198   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
  199   They also set CODING->result to one of CODING_RESULT_XXX indicating
  200   how the decoding is finished.
  201 
  202   Below is the template of these functions.  */
  203 
  204 #if 0
  205 static void
  206 decode_coding_XXXX (coding)
  207      struct coding_system *coding;
  208 {
  209   const unsigned char *src = coding->source + coding->consumed;
  210   const unsigned char *src_end = coding->source + coding->src_bytes;
  211   /* SRC_BASE remembers the start position in source in each loop.
  212      The loop will be exited when there's not enough source code, or
  213      when there's no room in CHARBUF for a decoded character.  */
  214   const unsigned char *src_base;
  215   /* A buffer to produce decoded characters.  */
  216   int *charbuf = coding->charbuf + coding->charbuf_used;
  217   int *charbuf_end = coding->charbuf + coding->charbuf_size;
  218   int multibytep = coding->src_multibyte;
  219 
  220   while (1)
  221     {
  222       src_base = src;
  223       if (charbuf >= charbuf_end)
  224         /* No more room to produce a decoded character.  */
  225         break;
  226       ONE_MORE_BYTE (c);
  227       /* Decode it. */
  228     }
  229 
  230  no_more_source:
  231   if (src_base < src_end
  232       && coding->mode & CODING_MODE_LAST_BLOCK)
  233     /* If the source ends by partial bytes to construct a character,
  234        treat them as eight-bit raw data.  */
  235     while (src_base < src_end && charbuf < charbuf_end)
  236       *charbuf++ = *src_base++;
  237   /* Remember how many bytes and characters we consumed.  If the
  238      source is multibyte, the bytes and chars are not identical.  */
  239   coding->consumed = coding->consumed_char = src_base - coding->source;
  240   /* Remember how many characters we produced.  */
  241   coding->charbuf_used = charbuf - coding->charbuf;
  242 }
  243 #endif
  244 
  245 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
  246 
  247   These functions encode SRC_BYTES length text at SOURCE of Emacs'
  248   internal multibyte format by CODING.  The resulting byte sequence
  249   goes to a place pointed to by DESTINATION, the length of which
  250   should not exceed DST_BYTES.
  251 
  252   These functions set the information of original and encoded texts in
  253   the members produced, produced_char, consumed, and consumed_char of
  254   the structure *CODING.  They also set the member result to one of
  255   CODING_RESULT_XXX indicating how the encoding finished.
  256 
  257   DST_BYTES zero means that source area and destination area are
  258   overlapped, which means that we can produce an encoded text until it
  259   reaches the head of not-yet-encoded source text.
  260 
  261   Below is a template of these functions.  */
  262 #if 0
  263 static void
  264 encode_coding_XXX (coding)
  265      struct coding_system *coding;
  266 {
  267   int multibytep = coding->dst_multibyte;
  268   int *charbuf = coding->charbuf;
  269   int *charbuf_end = coding->charbuf + coding->charbuf_used;
  270   unsigned char *dst = coding->destination + coding->produced;
  271   unsigned char *dst_end = coding->destination + coding->dst_bytes;
  272   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
  273   int produced_chars = 0;
  274 
  275   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
  276     {
  277       int c = *charbuf;
  278       /* Encode C into DST, and increment DST.  */
  279     }
  280  label_no_more_destination:
  281   /* How many chars and bytes we produced.  */
  282   coding->produced_char += produced_chars;
  283   coding->produced = dst - coding->destination;
  284 }
  285 #endif
  286 
  287 
  288 /*** 1. Preamble ***/
  289 
  290 #include <config.h>
  291 #include <stdio.h>
  292 #include <setjmp.h>
  293 
  294 #include "lisp.h"
  295 #include "buffer.h"
  296 #include "character.h"
  297 #include "charset.h"
  298 #include "ccl.h"
  299 #include "composite.h"
  300 #include "coding.h"
  301 #include "window.h"
  302 #include "frame.h"
  303 #include "termhooks.h"
  304 
  305 Lisp_Object Vcoding_system_hash_table;
  306 
  307 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
  308 Lisp_Object Qunix, Qdos;
  309 extern Lisp_Object Qmac;        /* frame.c */
  310 Lisp_Object Qbuffer_file_coding_system;
  311 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
  312 Lisp_Object Qdefault_char;
  313 Lisp_Object Qno_conversion, Qundecided;
  314 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
  315 Lisp_Object Qbig, Qlittle;
  316 Lisp_Object Qcoding_system_history;
  317 Lisp_Object Qvalid_codes;
  318 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
  319 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
  320 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
  321 Lisp_Object QCascii_compatible_p;
  322 
  323 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
  324 Lisp_Object Qcall_process, Qcall_process_region;
  325 Lisp_Object Qstart_process, Qopen_network_stream;
  326 Lisp_Object Qtarget_idx;
  327 
  328 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
  329 Lisp_Object Qinterrupted, Qinsufficient_memory;
  330 
  331 extern Lisp_Object Qcompletion_ignore_case;
  332 
  333 /* If a symbol has this property, evaluate the value to define the
  334    symbol as a coding system.  */
  335 static Lisp_Object Qcoding_system_define_form;
  336 
  337 int coding_system_require_warning;
  338 
  339 Lisp_Object Vselect_safe_coding_system_function;
  340 
  341 /* Mnemonic string for each format of end-of-line.  */
  342 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
  343 /* Mnemonic string to indicate format of end-of-line is not yet
  344    decided.  */
  345 Lisp_Object eol_mnemonic_undecided;
  346 
  347 /* Format of end-of-line decided by system.  This is Qunix on
  348    Unix and Mac, Qdos on DOS/Windows.
  349    This has an effect only for external encoding (i.e. for output to
  350    file and process), not for in-buffer or Lisp string encoding.  */
  351 static Lisp_Object system_eol_type;
  352 
  353 #ifdef emacs
  354 
  355 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
  356 
  357 Lisp_Object Qcoding_system_p, Qcoding_system_error;
  358 
  359 /* Coding system emacs-mule and raw-text are for converting only
  360    end-of-line format.  */
  361 Lisp_Object Qemacs_mule, Qraw_text;
  362 Lisp_Object Qutf_8_emacs;
  363 
  364 /* Coding-systems are handed between Emacs Lisp programs and C internal
  365    routines by the following three variables.  */
  366 /* Coding-system for reading files and receiving data from process.  */
  367 Lisp_Object Vcoding_system_for_read;
  368 /* Coding-system for writing files and sending data to process.  */
  369 Lisp_Object Vcoding_system_for_write;
  370 /* Coding-system actually used in the latest I/O.  */
  371 Lisp_Object Vlast_coding_system_used;
  372 /* Set to non-nil when an error is detected during code conversion.  */
  373 Lisp_Object Vlast_code_conversion_error;
  374 /* A vector of length 256 which contains information about special
  375    Latin codes (especially for dealing with Microsoft codes).  */
  376 Lisp_Object Vlatin_extra_code_table;
  377 
  378 /* Flag to inhibit code conversion of end-of-line format.  */
  379 int inhibit_eol_conversion;
  380 
  381 /* Flag to inhibit ISO2022 escape sequence detection.  */
  382 int inhibit_iso_escape_detection;
  383 
  384 /* Flag to inhibit detection of binary files through null bytes.  */
  385 int inhibit_null_byte_detection;
  386 
  387 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
  388 int inherit_process_coding_system;
  389 
  390 /* Coding system to be used to encode text for terminal display when
  391    terminal coding system is nil.  */
  392 struct coding_system safe_terminal_coding;
  393 
  394 Lisp_Object Vfile_coding_system_alist;
  395 Lisp_Object Vprocess_coding_system_alist;
  396 Lisp_Object Vnetwork_coding_system_alist;
  397 
  398 Lisp_Object Vlocale_coding_system;
  399 
  400 #endif /* emacs */
  401 
  402 /* Flag to tell if we look up translation table on character code
  403    conversion.  */
  404 Lisp_Object Venable_character_translation;
  405 /* Standard translation table to look up on decoding (reading).  */
  406 Lisp_Object Vstandard_translation_table_for_decode;
  407 /* Standard translation table to look up on encoding (writing).  */
  408 Lisp_Object Vstandard_translation_table_for_encode;
  409 
  410 Lisp_Object Qtranslation_table;
  411 Lisp_Object Qtranslation_table_id;
  412 Lisp_Object Qtranslation_table_for_decode;
  413 Lisp_Object Qtranslation_table_for_encode;
  414 
  415 /* Alist of charsets vs revision number.  */
  416 static Lisp_Object Vcharset_revision_table;
  417 
  418 /* Default coding systems used for process I/O.  */
  419 Lisp_Object Vdefault_process_coding_system;
  420 
  421 /* Char table for translating Quail and self-inserting input.  */
  422 Lisp_Object Vtranslation_table_for_input;
  423 
  424 /* Two special coding systems.  */
  425 Lisp_Object Vsjis_coding_system;
  426 Lisp_Object Vbig5_coding_system;
  427 
  428 /* ISO2022 section */
  429 
  430 #define CODING_ISO_INITIAL(coding, reg)                 \
  431   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
  432                      coding_attr_iso_initial),          \
  433                reg)))
  434 /* Value of (CODING)->safe_charsets[CHARSET_ID], or -1 if CHARSET_ID
  435    exceeds (CODING)->max_charset_id or the stored value is 255.  */
  436 #define CODING_ISO_REQUEST(coding, charset_id)          \
  437   (((charset_id) <= (coding)->max_charset_id            \
  438     ? ((coding)->safe_charsets[charset_id] != 255       \
  439        ? (coding)->safe_charsets[charset_id]            \
  440        : -1)                                            \
  441     : -1))
  442 
  443 
  444 #define CODING_ISO_FLAGS(coding)        \
  445   ((coding)->spec.iso_2022.flags)
  446 #define CODING_ISO_DESIGNATION(coding, reg)     \
  447   ((coding)->spec.iso_2022.current_designation[reg])
  448 #define CODING_ISO_INVOCATION(coding, plane)    \
  449   ((coding)->spec.iso_2022.current_invocation[plane])
  450 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
  451   ((coding)->spec.iso_2022.single_shifting)
  452 #define CODING_ISO_BOL(coding)  \
  453   ((coding)->spec.iso_2022.bol)
  454 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
  455   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
  456 #define CODING_ISO_CMP_STATUS(coding)   \
  457   (&(coding)->spec.iso_2022.cmp_status)
  458 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
  459   ((coding)->spec.iso_2022.ctext_extended_segment_len)
  460 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
  461   ((coding)->spec.iso_2022.embedded_utf_8)
  462 
  463 /* Control characters of ISO2022.  */
  464                         /* code */      /* function */
  465 #define ISO_CODE_LF     0x0A            /* line-feed */
  466 #define ISO_CODE_CR     0x0D            /* carriage-return */
  467 #define ISO_CODE_SO     0x0E            /* shift-out */
  468 #define ISO_CODE_SI     0x0F            /* shift-in */
  469 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
  470 #define ISO_CODE_ESC    0x1B            /* escape */
  471 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
  472 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
  473 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
  474 
  475 /* Every code (1-byte) of ISO2022 is classified into one of the
  476    following classes.  */
  477 enum iso_code_class_type
  478   {
  479     ISO_control_0,              /* Control codes in the range
  480                                    0x00..0x1F and 0x7F, except for the
  481                                    following 4 codes.  */
  482     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
  483     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
  484     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
  485     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
  486     ISO_control_1,              /* Control codes in the range
  487                                    0x80..0x9F, except for the
  488                                    following 3 codes.  */
  489     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
  490     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
  491     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
  492     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
  493     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
  494     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
  495     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  496   };
  497 
  498 /** The macros CODING_ISO_FLAG_XXX define the flag bits of the
  499     `iso-flags' attribute of an iso2022 coding system.  */
  500 
  501 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
  502    instead of the correct short-form sequence (e.g. ESC $ A).  */
  503 #define CODING_ISO_FLAG_LONG_FORM       0x0001
  504 
  505 /* If set, reset graphic planes and registers at end-of-line to the
  506    initial state.  */
  507 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
  508 
  509 /* If set, reset graphic planes and registers before any control
  510    characters to the initial state.  */
  511 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
  512 
  513 /* If set, encode by 7-bit environment.  */
  514 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
  515 
  516 /* If set, use locking-shift function.  */
  517 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
  518 
  519 /* If set, use single-shift function.  Overwrite
  520    CODING_ISO_FLAG_LOCKING_SHIFT.  */
  521 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
  522 
  523 /* If set, use designation escape sequence.  */
  524 #define CODING_ISO_FLAG_DESIGNATION     0x0040
  525 
  526 /* If set, produce revision number sequence.  */
  527 #define CODING_ISO_FLAG_REVISION        0x0080
  528 
  529 /* If set, produce ISO6429's direction specifying sequence.  */
  530 #define CODING_ISO_FLAG_DIRECTION       0x0100
  531 
  532 /* If set, assume designation states are reset at beginning of line on
  533    output.  */
  534 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
  535 
  536 /* If set, designation sequence should be placed at beginning of line
  537    on output.  */
  538 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
  539 
  540 /* If set, do not encode unsafe characters on output.  */
  541 #define CODING_ISO_FLAG_SAFE            0x0800
  542 
  543 /* If set, extra latin codes (128..159) are accepted as a valid code
  544    on input.  */
  545 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
  546 
  547 #define CODING_ISO_FLAG_COMPOSITION     0x2000
  548 
  549 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
  550 
  551 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
  552 
  553 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
  554 
  555 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
  556 
  557 /* A character to be produced on output if encoding of the original
  558    character is prohibited by CODING_ISO_FLAG_SAFE.  */
  559 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
  560 
  561 /* UTF-8 section */
  562 #define CODING_UTF_8_BOM(coding)        \
  563   ((coding)->spec.utf_8_bom)
  564 
  565 /* UTF-16 section */
  566 #define CODING_UTF_16_BOM(coding)       \
  567   ((coding)->spec.utf_16.bom)
  568 
  569 #define CODING_UTF_16_ENDIAN(coding)    \
  570   ((coding)->spec.utf_16.endian)
  571 
  572 #define CODING_UTF_16_SURROGATE(coding) \
  573   ((coding)->spec.utf_16.surrogate)
  574 
  575 
  576 /* CCL section */
  577 #define CODING_CCL_DECODER(coding)      \
  578   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
  579 #define CODING_CCL_ENCODER(coding)      \
  580   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
  581 #define CODING_CCL_VALIDS(coding)                                          \
  582   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
  583 
  584 /* Index for each coding category in `coding_categories'.  Values up to
  585    coding_category_raw_text also give the CATEGORY_MASK_XXX bit number.  */
  586 enum coding_category
  587   {
  588     coding_category_iso_7,
  589     coding_category_iso_7_tight,
  590     coding_category_iso_8_1,
  591     coding_category_iso_8_2,
  592     coding_category_iso_7_else,
  593     coding_category_iso_8_else,
  594     coding_category_utf_8_auto,
  595     coding_category_utf_8_nosig,
  596     coding_category_utf_8_sig,
  597     coding_category_utf_16_auto,
  598     coding_category_utf_16_be,
  599     coding_category_utf_16_le,
  600     coding_category_utf_16_be_nosig,
  601     coding_category_utf_16_le_nosig,
  602     coding_category_charset,
  603     coding_category_sjis,
  604     coding_category_big5,
  605     coding_category_ccl,
  606     coding_category_emacs_mule,
  607     /* All above are targets of code detection.  */
  608     coding_category_raw_text,
  609     coding_category_undecided,
  610     coding_category_max
  611   };
  612 
  613 /* Definitions of flag bits used in detect_coding_XXXX.  */
  614 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
  615 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
  616 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
  617 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
  618 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
  619 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
  620 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
  621 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
  622 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
  623 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
  624 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
  625 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
  626 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
  627 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
  628 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
  629 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
  630 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
  631 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
  632 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
  633 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
  634 
  635 /* This value is returned if detect_coding_mask () finds nothing other
  636    than ASCII characters.  */
  637 #define CATEGORY_MASK_ANY               \
  638   (CATEGORY_MASK_ISO_7                  \
  639    | CATEGORY_MASK_ISO_7_TIGHT          \
  640    | CATEGORY_MASK_ISO_8_1              \
  641    | CATEGORY_MASK_ISO_8_2              \
  642    | CATEGORY_MASK_ISO_7_ELSE           \
  643    | CATEGORY_MASK_ISO_8_ELSE           \
  644    | CATEGORY_MASK_UTF_8_AUTO           \
  645    | CATEGORY_MASK_UTF_8_NOSIG          \
  646    | CATEGORY_MASK_UTF_8_SIG            \
  647    | CATEGORY_MASK_UTF_16_AUTO          \
  648    | CATEGORY_MASK_UTF_16_BE            \
  649    | CATEGORY_MASK_UTF_16_LE            \
  650    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  651    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
  652    | CATEGORY_MASK_CHARSET              \
  653    | CATEGORY_MASK_SJIS                 \
  654    | CATEGORY_MASK_BIG5                 \
  655    | CATEGORY_MASK_CCL                  \
  656    | CATEGORY_MASK_EMACS_MULE)
  657 
  658 
  659 #define CATEGORY_MASK_ISO_7BIT \
  660   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
  661 
  662 #define CATEGORY_MASK_ISO_8BIT \
  663   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
  664 
  665 #define CATEGORY_MASK_ISO_ELSE \
  666   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
  667 
  668 #define CATEGORY_MASK_ISO_ESCAPE        \
  669   (CATEGORY_MASK_ISO_7                  \
  670    | CATEGORY_MASK_ISO_7_TIGHT          \
  671    | CATEGORY_MASK_ISO_7_ELSE           \
  672    | CATEGORY_MASK_ISO_8_ELSE)
  673 
  674 #define CATEGORY_MASK_ISO       \
  675   (  CATEGORY_MASK_ISO_7BIT     \
  676      | CATEGORY_MASK_ISO_8BIT   \
  677      | CATEGORY_MASK_ISO_ELSE)
  678 
  679 #define CATEGORY_MASK_UTF_16            \
  680   (CATEGORY_MASK_UTF_16_AUTO            \
  681    | CATEGORY_MASK_UTF_16_BE            \
  682    | CATEGORY_MASK_UTF_16_LE            \
  683    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  684    | CATEGORY_MASK_UTF_16_LE_NOSIG)
  685 
  686 #define CATEGORY_MASK_UTF_8     \
  687   (CATEGORY_MASK_UTF_8_AUTO     \
  688    | CATEGORY_MASK_UTF_8_NOSIG  \
  689    | CATEGORY_MASK_UTF_8_SIG)
  690 
  691 /* List of symbols `coding-category-xxx' ordered by priority.  This
  692    variable is exposed to Emacs Lisp.  */
  693 static Lisp_Object Vcoding_category_list;
  694 
  695 /* Table of coding categories (Lisp symbols).  This variable is for
  696    internal use only.  */
  697 static Lisp_Object Vcoding_category_table;
  698 
  699 /* Table of coding-categories ordered by priority.  */
  700 static enum coding_category coding_priorities[coding_category_max];
  701 
  702 /* Nth element is a coding context for the coding system bound to the
  703    Nth coding category.  */
  704 static struct coding_system coding_categories[coding_category_max];
  705 
  706 /*** Commonly used macros and functions ***/
  707 
  708 #ifndef min  /* Keep any definition that is already in effect.  */
  709 #define min(a, b) ((a) < (b) ? (a) : (b))  /* NB: evaluates an argument twice.  */
  710 #endif
  711 #ifndef max  /* Keep any definition that is already in effect.  */
  712 #define max(a, b) ((a) > (b) ? (a) : (b))  /* NB: evaluates an argument twice.  */
  713 #endif
  714 /* Set ATTRS and CHARSET_LIST from the attribute vector of CODING.  */
  715 #define CODING_GET_INFO(coding, attrs, charset_list)    \
  716   do {                                                  \
  717     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
  718     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
  719   } while (0)
  720 
  721 
  722 /* Get one byte from SRC (which ends at SRC_END) into C and increment
  723    consumed_chars.  At SRC_END, jump to `no_more_source' instead, after
  724    recording CODING_RESULT_INSUFFICIENT_SRC if src_base < src.  If
  725    multibytep is nonzero and the byte has bit 0x80 set: a 0xC0/0xC1
  726    lead byte is combined with the next byte into one value; otherwise
  727    C is set to the negated result of string_char at that position and
  728    CODING_RESULT_INVALID_SRC is recorded.  Needs in scope: coding, src,
  729    src_base, src_end, multibytep, consumed_chars, `no_more_source'.  */
  730 #define ONE_MORE_BYTE(c)                                \
  731   do {                                                  \
  732     if (src == src_end)                                 \
  733       {                                                 \
  734         if (src_base < src)                             \
  735           record_conversion_result                      \
  736             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
  737         goto no_more_source;                            \
  738       }                                                 \
  739     c = *src++;                                         \
  740     if (multibytep && (c & 0x80))                       \
  741       {                                                 \
  742         if ((c & 0xFE) == 0xC0)                         \
  743           c = ((c & 1) << 6) | *src++;                  \
  744         else                                            \
  745           {                                             \
  746             src--;                                      \
  747             c = - string_char (src, &src, NULL);        \
  748             record_conversion_result                    \
  749               (coding, CODING_RESULT_INVALID_SRC);      \
  750           }                                             \
  751       }                                                 \
  752     consumed_chars++;                                   \
  753   } while (0)
  754 
/* Safely get two bytes from the source text pointed by SRC which ends
   at SRC_END, and set C1 and C2 to those bytes while skipping the
   heading multibyte characters.  If there are not enough bytes in the
   source, it jumps to `no_more_source'.

   When multibytep is nonzero, a byte with its high bit set is the
   head of a multibyte sequence.  A head of 0xC0 or 0xC1 is folded
   with the next byte into the single value
   ((head & 1) << 6) | next-byte; that next byte is read without
   comparing SRC against SRC_END.  Other heads are handled
   differently for the two outputs:

     - for C1, the whole character is skipped (SRC advances by
       BYTES_BY_CHAR_HEAD (C1) in total) and reading of C1 starts
       over;

     - for C2, C2 is set to -1 and SRC is left just after the head
       byte.

   Unlike ONE_MORE_BYTE, this macro neither updates consumed_chars
   nor records a conversion result.  C1 and C2 are evaluated several
   times, so they must be plain lvalues.  The caller should declare
   and set these variables appropriately in advance:
        src, src_end, multibytep
   and must provide the label `no_more_source'.
   It is intended that this macro is used in detect_coding_utf_16.  */

#define TWO_MORE_BYTES(c1, c2)                          \
  do {                                                  \
    do {                                                \
      if (src == src_end)                               \
        goto no_more_source;                            \
      c1 = *src++;                                      \
      if (multibytep && (c1 & 0x80))                    \
        {                                               \
          if ((c1 & 0xFE) == 0xC0)                      \
            c1 = ((c1 & 1) << 6) | *src++;              \
          else                                          \
            {                                           \
              src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
              c1 = -1;                                  \
            }                                           \
        }                                               \
    } while (c1 < 0);                                   \
    if (src == src_end)                                 \
      goto no_more_source;                              \
    c2 = *src++;                                        \
    if (multibytep && (c2 & 0x80))                      \
      {                                                 \
        if ((c2 & 0xFE) == 0xC0)                        \
          c2 = ((c2 & 1) << 6) | *src++;                \
        else                                            \
          c2 = -1;                                      \
      }                                                 \
  } while (0)
  793 
  794 
/* Like ONE_MORE_BYTE, but without the test of SRC against SRC_END:
   the byte at SRC is read unconditionally and the macro never jumps
   to `no_more_source'.  The multibyte handling is the same: a head of
   0xC0 or 0xC1 is folded with the next byte into
   ((C & 1) << 6) | next-byte, and any other multibyte character makes
   C the negative value of its character code and records
   CODING_RESULT_INVALID_SRC.  consumed_chars is incremented once.

   C is evaluated several times, so it must be a plain lvalue.  The
   caller should declare and set these variables appropriately in
   advance:
        src, multibytep, consumed_chars, coding  */

#define ONE_MORE_BYTE_NO_CHECK(c)                       \
  do {                                                  \
    c = *src++;                                         \
    if (multibytep && (c & 0x80))                       \
      {                                                 \
        if ((c & 0xFE) == 0xC0)                         \
          c = ((c & 1) << 6) | *src++;                  \
        else                                            \
          {                                             \
            src--;                                      \
            c = - string_char (src, &src, NULL);        \
            record_conversion_result                    \
              (coding, CODING_RESULT_INVALID_SRC);      \
          }                                             \
      }                                                 \
    consumed_chars++;                                   \
  } while (0)
  812 
  813 
  814 /* Store a byte C in the place pointed by DST and increment DST to the
  815    next free point, and increment PRODUCED_CHARS.  The caller should
  816    assure that C is 0..127, and declare and set the variable `dst'
  817    appropriately in advance.
  818 */
  819 
  820 
  821 #define EMIT_ONE_ASCII_BYTE(c)  \
  822   do {                          \
  823     produced_chars++;           \
  824     *dst++ = (c);               \
  825   } while (0)
  826 
  827 
  828 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
  829 
  830 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
  831   do {                                  \
  832     produced_chars += 2;                \
  833     *dst++ = (c1), *dst++ = (c2);       \
  834   } while (0)
  835 
  836 
  837 /* Store a byte C in the place pointed by DST and increment DST to the
  838    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
  839    nonzero, store in an appropriate multibyte from.  The caller should
  840    declare and set the variables `dst' and `multibytep' appropriately
  841    in advance.  */
  842 
  843 #define EMIT_ONE_BYTE(c)                \
  844   do {                                  \
  845     produced_chars++;                   \
  846     if (multibytep)                     \
  847       {                                 \
  848         int ch = (c);                   \
  849         if (ch >= 0x80)                 \
  850           ch = BYTE8_TO_CHAR (ch);      \
  851         CHAR_STRING_ADVANCE (ch, dst);  \
  852       }                                 \
  853     else                                \
  854       *dst++ = (c);                     \
  855   } while (0)
  856 
  857 
  858 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
  859 
  860 #define EMIT_TWO_BYTES(c1, c2)          \
  861   do {                                  \
  862     produced_chars += 2;                \
  863     if (multibytep)                     \
  864       {                                 \
  865         int ch;                         \
  866                                         \
  867         ch = (c1);                      \
  868         if (ch >= 0x80)                 \
  869           ch = BYTE8_TO_CHAR (ch);      \
  870         CHAR_STRING_ADVANCE (ch, dst);  \
  871         ch = (c2);                      \
  872         if (ch >= 0x80)                 \
  873           ch = BYTE8_TO_CHAR (ch);      \
  874         CHAR_STRING_ADVANCE (ch, dst);  \
  875       }                                 \
  876     else                                \
  877       {                                 \
  878         *dst++ = (c1);                  \
  879         *dst++ = (c2);                  \
  880       }                                 \
  881   } while (0)
  882 
  883 
/* Emit three bytes, C1, C2 and C3, in that order: C1 through
   EMIT_ONE_BYTE and then C2 and C3 through EMIT_TWO_BYTES.  The
   caller's requirements are those of the two macros used.  */

#define EMIT_THREE_BYTES(c1, c2, c3)    \
  do {                                  \
    EMIT_ONE_BYTE (c1);                 \
    EMIT_TWO_BYTES (c2, c3);            \
  } while (0)
  889 
  890 
/* Emit four bytes, C1, C2, C3 and C4, in that order, as two
   successive uses of EMIT_TWO_BYTES.  The caller's requirements are
   those of EMIT_TWO_BYTES.  */

#define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
  do {                                          \
    EMIT_TWO_BYTES (c1, c2);                    \
    EMIT_TWO_BYTES (c3, c4);                    \
  } while (0)
  896 
  897 
/* Prototypes for static functions.  */

/* Recording of the result code of a conversion.  */
static void record_conversion_result P_ ((struct coding_system *coding,
                                          enum coding_result_code result));

/* UTF-8: detector, decoder and encoder.  */
static int detect_coding_utf_8 P_ ((struct coding_system *,
                                    struct coding_detection_info *info));
static void decode_coding_utf_8 P_ ((struct coding_system *));
static int encode_coding_utf_8 P_ ((struct coding_system *));

/* UTF-16: detector, decoder and encoder.  */
static int detect_coding_utf_16 P_ ((struct coding_system *,
                                     struct coding_detection_info *info));
static void decode_coding_utf_16 P_ ((struct coding_system *));
static int encode_coding_utf_16 P_ ((struct coding_system *));

/* ISO2022: detector, decoder and encoder.  */
static int detect_coding_iso_2022 P_ ((struct coding_system *,
                                       struct coding_detection_info *info));
static void decode_coding_iso_2022 P_ ((struct coding_system *));
static int encode_coding_iso_2022 P_ ((struct coding_system *));

/* emacs-mule: detector, decoder and encoder.  */
static int detect_coding_emacs_mule P_ ((struct coding_system *,
                                         struct coding_detection_info *info));
static void decode_coding_emacs_mule P_ ((struct coding_system *));
static int encode_coding_emacs_mule P_ ((struct coding_system *));

/* Shift-JIS: detector, decoder and encoder.  */
static int detect_coding_sjis P_ ((struct coding_system *,
                                   struct coding_detection_info *info));
static void decode_coding_sjis P_ ((struct coding_system *));
static int encode_coding_sjis P_ ((struct coding_system *));

/* BIG5: detector, decoder and encoder.  */
static int detect_coding_big5 P_ ((struct coding_system *,
                                   struct coding_detection_info *info));
static void decode_coding_big5 P_ ((struct coding_system *));
static int encode_coding_big5 P_ ((struct coding_system *));

/* CCL: detector, decoder and encoder.  */
static int detect_coding_ccl P_ ((struct coding_system *,
                                  struct coding_detection_info *info));
static void decode_coding_ccl P_ ((struct coding_system *));
static int encode_coding_ccl P_ ((struct coding_system *));

/* Raw text: decoder and encoder only.  */
static void decode_coding_raw_text P_ ((struct coding_system *));
static int encode_coding_raw_text P_ ((struct coding_system *));

/* Source/destination pointer maintenance and destination growth.  */
static void coding_set_source P_ ((struct coding_system *));
static void coding_set_destination P_ ((struct coding_system *));
static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
static void coding_alloc_by_making_gap P_ ((struct coding_system *,
                                            EMACS_INT, EMACS_INT));
static unsigned char *alloc_destination P_ ((struct coding_system *,
                                             EMACS_INT, unsigned char *));

/* Remaining helpers, declared here so that they can be used before
   their definitions.  */
static void setup_iso_safe_charsets P_ ((Lisp_Object));
static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
                                                     int *, int *,
                                                     unsigned char *));
static int detect_eol P_ ((const unsigned char *,
                           EMACS_INT, enum coding_category));
static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
static void decode_eol P_ ((struct coding_system *));
static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *));
static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
static INLINE void produce_charset P_ ((struct coding_system *, int *,
                                        EMACS_INT));
static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
static int decode_coding P_ ((struct coding_system *));
static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
                                                      struct coding_system *,
                                                      int *, EMACS_INT *));
static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
                                                  struct coding_system *,
                                                  int *, EMACS_INT *));
static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
static int encode_coding P_ ((struct coding_system *));
static Lisp_Object make_conversion_work_buffer P_ ((int));
static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
static INLINE int char_encodable_p P_ ((int, Lisp_Object));
static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
  973 
  974 static void
  975 record_conversion_result (struct coding_system *coding,
  976                           enum coding_result_code result)
  977 {
  978   coding->result = result;
  979   switch (result)
  980     {
  981     case CODING_RESULT_INSUFFICIENT_SRC:
  982       Vlast_code_conversion_error = Qinsufficient_source;
  983       break;
  984     case CODING_RESULT_INCONSISTENT_EOL:
  985       Vlast_code_conversion_error = Qinconsistent_eol;
  986       break;
  987     case CODING_RESULT_INVALID_SRC:
  988       Vlast_code_conversion_error = Qinvalid_source;
  989       break;
  990     case CODING_RESULT_INTERRUPT:
  991       Vlast_code_conversion_error = Qinterrupted;
  992       break;
  993     case CODING_RESULT_INSUFFICIENT_MEM:
  994       Vlast_code_conversion_error = Qinsufficient_memory;
  995       break;
  996     case CODING_RESULT_INSUFFICIENT_DST:
  997       /* Don't record this error in Vlast_code_conversion_error
  998          because it happens just temporarily and is resolved when the
  999          whole conversion is finished.  */
 1000       break;
 1001     case CODING_RESULT_SUCCESS:
 1002       break;
 1003     default:
 1004       Vlast_code_conversion_error = intern ("Unknown error");
 1005     }
 1006 }
 1007 
/* This wrapper macro is used to preserve validity of pointers into
   buffer text across calls to decode_char, which could cause
   relocation of buffers if it loads a charset map, because loading a
   charset map allocates large structures.

   Decode CODE in CHARSET with DECODE_CHAR and store the result in C.
   charset_map_loaded is cleared beforehand; if it is nonzero
   afterwards, CODING->source is recomputed with coding_set_source and
   SRC, SRC_BASE and SRC_END are each shifted by the distance that
   CODING->source moved.

   SRC, SRC_BASE, SRC_END and C must be lvalues, and CODING must be a
   plain identifier because it is used as `coding->source' without
   parentheses.  The expansion declares block-local variables named
   `orig' and `offset'.  */
#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                       \
    charset_map_loaded = 0;                                                  \
    c = DECODE_CHAR (charset, code);                                         \
    if (charset_map_loaded)                                                  \
      {                                                                      \
        const unsigned char *orig = coding->source;                          \
        EMACS_INT offset;                                                    \
                                                                             \
        coding_set_source (coding);                                          \
        offset = coding->source - orig;                                      \
        src += offset;                                                       \
        src_base += offset;                                                  \
        src_end += offset;                                                   \
      }                                                                      \
  } while (0)
 1028 
 1029 
 1030 /* If there are at least BYTES length of room at dst, allocate memory
 1031    for coding->destination and update dst and dst_end.  We don't have
 1032    to take care of coding->source which will be relocated.  It is
 1033    handled by calling coding_set_source in encode_coding.  */
 1034 
 1035 #define ASSURE_DESTINATION(bytes)                               \
 1036   do {                                                          \
 1037     if (dst + (bytes) >= dst_end)                               \
 1038       {                                                         \
 1039         int more_bytes = charbuf_end - charbuf + (bytes);       \
 1040                                                                 \
 1041         dst = alloc_destination (coding, more_bytes, dst);      \
 1042         dst_end = coding->destination + coding->dst_bytes;      \
 1043       }                                                         \
 1044   } while (0)
 1045 
 1046 
 1047 /* Store multibyte form of the character C in P, and advance P to the
 1048    end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
 1049    never calls MAYBE_UNIFY_CHAR.  */
 1050 
 1051 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)      \
 1052   do {                                          \
 1053     if ((c) <= MAX_1_BYTE_CHAR)                 \
 1054       *(p)++ = (c);                             \
 1055     else if ((c) <= MAX_2_BYTE_CHAR)            \
 1056       *(p)++ = (0xC0 | ((c) >> 6)),             \
 1057         *(p)++ = (0x80 | ((c) & 0x3F));         \
 1058     else if ((c) <= MAX_3_BYTE_CHAR)            \
 1059       *(p)++ = (0xE0 | ((c) >> 12)),            \
 1060         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
 1061         *(p)++ = (0x80 | ((c) & 0x3F));         \
 1062     else if ((c) <= MAX_4_BYTE_CHAR)            \
 1063       *(p)++ = (0xF0 | (c >> 18)),              \
 1064         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 1065         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 1066         *(p)++ = (0x80 | (c & 0x3F));           \
 1067     else if ((c) <= MAX_5_BYTE_CHAR)            \
 1068       *(p)++ = 0xF8,                            \
 1069         *(p)++ = (0x80 | ((c >> 18) & 0x0F)),   \
 1070         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 1071         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 1072         *(p)++ = (0x80 | (c & 0x3F));           \
 1073     else                                        \
 1074       (p) += BYTE8_STRING ((c) - 0x3FFF80, p);  \
 1075   } while (0)
 1076 
 1077 
/* Return the character code of character whose multibyte form is at
   P, and advance P to the end of the multibyte form.  This is like
   STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.

   The length of the form is decided from the first byte alone:
     bit 0x80 clear                 -> 1 byte, returned as is;
     else bit 0x20 clear            -> 2 bytes;
     else bit 0x10 clear            -> 3 bytes;
     else bit 0x08 clear            -> 4 bytes;
     otherwise                      -> 5 bytes.
   In the 2-byte case 0x3FFF80 is OR'ed into the result when the
   first byte is below 0xC2.  In the 5-byte case the first byte
   contributes no bits to the result.  The trailing bytes are not
   validated.

   P is evaluated many times, so it must be a modifiable pointer
   lvalue without side effects.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
  (!((p)[0] & 0x80)                                             \
   ? *(p)++                                                     \
   : ! ((p)[0] & 0x20)                                          \
   ? ((p) += 2,                                                 \
      ((((p)[-2] & 0x1F) << 6)                                  \
       | ((p)[-1] & 0x3F)                                       \
       | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
   : ! ((p)[0] & 0x10)                                          \
   ? ((p) += 3,                                                 \
      ((((p)[-3] & 0x0F) << 12)                                 \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ! ((p)[0] & 0x08)                                          \
   ? ((p) += 4,                                                 \
      ((((p)[-4] & 0xF) << 18)                                  \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p) += 5,                                                 \
      ((((p)[-4] & 0x3F) << 18)                                 \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F))))
 1106 
 1107 
 1108 static void
 1109 coding_set_source (coding)
 1110      struct coding_system *coding;
 1111 {
 1112   if (BUFFERP (coding->src_object))
 1113     {
 1114       struct buffer *buf = XBUFFER (coding->src_object);
 1115 
 1116       if (coding->src_pos < 0)
 1117         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 1118       else
 1119         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 1120     }
 1121   else if (STRINGP (coding->src_object))
 1122     {
 1123       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 1124     }
 1125   else
 1126     /* Otherwise, the source is C string and is never relocated
 1127        automatically.  Thus we don't have to update anything.  */
 1128     ;
 1129 }
 1130 
 1131 static void
 1132 coding_set_destination (coding)
 1133      struct coding_system *coding;
 1134 {
 1135   if (BUFFERP (coding->dst_object))
 1136     {
 1137       if (coding->src_pos < 0)
 1138         {
 1139           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 1140           coding->dst_bytes = (GAP_END_ADDR
 1141                                - (coding->src_bytes - coding->consumed)
 1142                                - coding->destination);
 1143         }
 1144       else
 1145         {
 1146           /* We are sure that coding->dst_pos_byte is before the gap
 1147              of the buffer. */
 1148           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 1149                                  + coding->dst_pos_byte - BEG_BYTE);
 1150           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 1151                                - coding->destination);
 1152         }
 1153     }
 1154   else
 1155     /* Otherwise, the destination is C string and is never relocated
 1156        automatically.  Thus we don't have to update anything.  */
 1157     ;
 1158 }
 1159 
 1160 
 1161 static void
 1162 coding_alloc_by_realloc (coding, bytes)
 1163      struct coding_system *coding;
 1164      EMACS_INT bytes;
 1165 {
 1166   coding->destination = (unsigned char *) xrealloc (coding->destination,
 1167                                                     coding->dst_bytes + bytes);
 1168   coding->dst_bytes += bytes;
 1169 }
 1170 
 1171 static void
 1172 coding_alloc_by_making_gap (coding, gap_head_used, bytes)
 1173      struct coding_system *coding;
 1174      EMACS_INT gap_head_used, bytes;
 1175 {
 1176   if (EQ (coding->src_object, coding->dst_object))
 1177     {
 1178       /* The gap may contain the produced data at the head and not-yet
 1179          consumed data at the tail.  To preserve those data, we at
 1180          first make the gap size to zero, then increase the gap
 1181          size.  */
 1182       EMACS_INT add = GAP_SIZE;
 1183 
 1184       GPT += gap_head_used, GPT_BYTE += gap_head_used;
 1185       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
 1186       make_gap (bytes);
 1187       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
 1188       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
 1189     }
 1190   else
 1191     {
 1192       Lisp_Object this_buffer;
 1193 
 1194       this_buffer = Fcurrent_buffer ();
 1195       set_buffer_internal (XBUFFER (coding->dst_object));
 1196       make_gap (bytes);
 1197       set_buffer_internal (XBUFFER (this_buffer));
 1198     }
 1199 }
 1200 
 1201 
 1202 static unsigned char *
 1203 alloc_destination (coding, nbytes, dst)
 1204      struct coding_system *coding;
 1205      EMACS_INT nbytes;
 1206      unsigned char *dst;
 1207 {
 1208   EMACS_INT offset = dst - coding->destination;
 1209 
 1210   if (BUFFERP (coding->dst_object))
 1211     {
 1212       struct buffer *buf = XBUFFER (coding->dst_object);
 1213 
 1214       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
 1215     }
 1216   else
 1217     coding_alloc_by_realloc (coding, nbytes);
 1218   coding_set_destination (coding);
 1219   dst = coding->destination + offset;
 1220   return dst;
 1221 }
 1222 
 1223 /** Macros for annotations.  */
 1224 
 1225 /* An annotation data is stored in the array coding->charbuf in this
 1226    format:
 1227      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
 1228    LENGTH is the number of elements in the annotation.
 1229    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
 1230    NCHARS is the number of characters in the text annotated.
 1231 
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
 1236      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
 1237 
 1238    NBYTES is the number of bytes specified in the header part of
 1239    old-style emacs-mule encoding, or 0 for the other kind of
 1240    composition.
 1241 
 1242    METHOD is one of enum composition_method.
 1243 
   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.
 1246 
 1247    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
 1248    follows.
 1249 
 1250    If ANNOTATION_MASK is 0, this annotation is just a space holder to
 1251    recover from an invalid annotation, and should be skipped by
 1252    produce_annotation.  */
 1253 
/* Maximum length of the header of annotation data.  The header
   stored by ADD_COMPOSITION_DATA has 5 elements (-LENGTH,
   ANNOTATION_MASK, NCHARS, NBYTES, METHOD); the one stored by
   ADD_CHARSET_DATA has 4 (-LENGTH, ANNOTATION_MASK, NCHARS,
   CHARSET-ID).  */
#define MAX_ANNOTATION_LENGTH 5
 1256 
 1257 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
 1258   do {                                                  \
 1259     *(buf)++ = -(len);                                  \
 1260     *(buf)++ = (mask);                                  \
 1261     *(buf)++ = (nchars);                                \
 1262     coding->annotated = 1;                              \
 1263   } while (0);
 1264 
 1265 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
 1266   do {                                                                      \
 1267     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
 1268     *buf++ = nbytes;                                                        \
 1269     *buf++ = method;                                                        \
 1270   } while (0)
 1271 
 1272 
 1273 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
 1274   do {                                                                  \
 1275     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
 1276     *buf++ = id;                                                        \
 1277   } while (0)
 1278 
 1279 
 1280 /*** 2. Emacs' internal format (emacs-utf-8) ***/
 1281 
 1282 
 1283 
 1284 
 1285 /*** 3. UTF-8 ***/
 1286 
 1287 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 1288    Check if a text is encoded in UTF-8.  If it is, return 1, else
 1289    return 0.  */
 1290 
/* Predicates classifying one octet C of a UTF-8 sequence by its high
   bits.  Each of them evaluates C exactly once.  */

/* C is below 0x80.  Note that this also holds for a negative C.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* The top two bits of C are 10: a trailing octet.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* The top bits of C are 110, 1110, 11110 and 111110 respectively:
   the leading octet of a 2-, 3-, 4- and 5-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The byte order mark as a character code, and the three octets
   compared against the start of the text when looking for a UTF-8
   signature.  */
#define UTF_BOM 0xFEFF
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
 1302 
/* Check whether the source text of CODING can be UTF-8, and record
   the outcome in DETECT_INFO.

   CATEGORY_MASK_UTF_8 is always added to DETECT_INFO->checked.  The
   text is scanned from CODING->source + CODING->head_ascii.  The scan
   accepts sequences of 1 to 5 octets whose leading and trailing
   octets have the proper high bits; it does not check for overlong
   forms.

   Return 0, with CATEGORY_MASK_UTF_8 added to DETECT_INFO->rejected,
   if an ill-formed sequence is met, or if the text ends in the middle
   of a sequence while CODING_MODE_LAST_BLOCK is set in CODING->mode.
   Otherwise return 1 after updating DETECT_INFO->found and
   DETECT_INFO->rejected according to whether the text starts with
   the three BOM octets and whether any multi-octet sequence was
   seen.  */
static int
detect_coding_utf_8 (coding, detect_info)
     struct coding_system *coding;
     struct coding_detection_info *detect_info;
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  int bom_found = 0;
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  /* Each iteration reads one sequence.  ONE_MORE_BYTE jumps to
     `no_more_source' when the source is exhausted; `break' leaves the
     loop on an ill-formed sequence.  */
  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where this sequence starts, so that a truncated
         sequence can be recognized at `no_more_source'.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* The signature counts only at the very start of the
             source, which also requires head_ascii to be zero.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      break;
    }
  /* An ill-formed sequence was met.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* src_base < src means the source ended inside a sequence; that is
     an error only if no further block will follow.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* The first character 0xFEFF doesn't necessarily mean a BOM.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
 1386 
 1387 
 1388 static void
 1389 decode_coding_utf_8 (coding)
 1390      struct coding_system *coding;
 1391 {
 1392   const unsigned char *src = coding->source + coding->consumed;
 1393   const unsigned char *src_end = coding->source + coding->src_bytes;
 1394   const unsigned char *src_base;
 1395   int *charbuf = coding->charbuf + coding->charbuf_used;
 1396   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 1397   int consumed_chars = 0, consumed_chars_base = 0;
 1398   int multibytep = coding->src_multibyte;
 1399   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
 1400   Lisp_Object attr, charset_list;
 1401   int eol_crlf =
 1402     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
 1403   int byte_after_cr = -1;
 1404 
 1405   CODING_GET_INFO (coding, attr, charset_list);
 1406 
 1407   if (bom != utf_without_bom)
 1408     {
 1409       int c1, c2, c3;
 1410 
 1411       src_base = src;
 1412       ONE_MORE_BYTE (c1);
 1413       if (! UTF_8_3_OCTET_LEADING_P (c1))
 1414         src = src_base;
 1415       else
 1416         {
 1417           ONE_MORE_BYTE (c2);
 1418           if (! UTF_8_EXTRA_OCTET_P (c2))
 1419             src = src_base;
 1420           else
 1421             {
 1422               ONE_MORE_BYTE (c3);
 1423               if (! UTF_8_EXTRA_OCTET_P (c3))
 1424                 src = src_base;
 1425               else
 1426                 {
 1427                   if ((c1 != UTF_8_BOM_1)
 1428                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
 1429                     src = src_base;
 1430                   else
 1431                     CODING_UTF_8_BOM (coding) = utf_without_bom;
 1432                 }
 1433             }
 1434         }
 1435     }
 1436   CODING_UTF_8_BOM (coding) = utf_without_bom;
 1437 
 1438 
 1439 
 1440   while (1)
 1441     {
 1442       int c, c1, c2, c3, c4, c5;
 1443 
 1444       src_base = src;
 1445       consumed_chars_base = consumed_chars;
 1446 
 1447       if (charbuf >= charbuf_end)
 1448         {
 1449           if (byte_after_cr >= 0)
 1450             src_base--;
 1451           break;
 1452         }
 1453 
 1454       if (byte_after_cr >= 0)
 1455         c1 = byte_after_cr, byte_after_cr = -1;
 1456       else
 1457         ONE_MORE_BYTE (c1);
 1458       if (c1 < 0)
 1459         {
 1460           c = - c1;
 1461         }
 1462       else if (UTF_8_1_OCTET_P(c1))
 1463         {
 1464           if (eol_crlf && c1 == '\r')
 1465             ONE_MORE_BYTE (byte_after_cr);
 1466           c = c1;
 1467         }
 1468       else
 1469         {
 1470           ONE_MORE_BYTE (c2);
 1471           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
 1472             goto invalid_code;
 1473           if (UTF_8_2_OCTET_LEADING_P (c1))
 1474             {
 1475               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
 1476               /* Reject overlong sequences here and below.  Encoders
 1477                  producing them are incorrect, they can be misleading,
 1478                  and they mess up read/write invariance.  */
 1479               if (c < 128)
 1480                 goto invalid_code;
 1481             }
 1482           else
 1483             {
 1484               ONE_MORE_BYTE (c3);
 1485               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
 1486                 goto invalid_code;
 1487               if (UTF_8_3_OCTET_LEADING_P (c1))
 1488                 {
 1489                   c = (((c1 & 0xF) << 12)
 1490                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
 1491                   if (c < 0x800
 1492                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
 1493                     goto invalid_code;
 1494                 }
 1495               else
 1496                 {
 1497                   ONE_MORE_BYTE (c4);
 1498                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
 1499                     goto invalid_code;
 1500                   if (UTF_8_4_OCTET_LEADING_P (c1))
 1501                     {
 1502                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
 1503                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
 1504                     if (c < 0x10000)
 1505                       goto invalid_code;
 1506                     }
 1507                   else
 1508                     {
 1509                       ONE_MORE_BYTE (c5);
 1510                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
 1511                         goto invalid_code;
 1512                       if (UTF_8_5_OCTET_LEADING_P (c1))
 1513                         {
 1514                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
 1515                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
 1516                                | (c5 & 0x3F));
 1517                           if ((c > MAX_CHAR) || (c < 0x200000))
 1518                             goto invalid_code;
 1519                         }
 1520                       else
 1521                         goto invalid_code;
 1522                     }
 1523                 }
 1524             }
 1525         }
 1526 
 1527       *charbuf++ = c;
 1528       continue;
 1529 
 1530     invalid_code:
 1531       src = src_base;
 1532       consumed_chars = consumed_chars_base;
 1533       ONE_MORE_BYTE (c);
 1534       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
 1535       coding->errors++;
 1536     }
 1537 
 1538  no_more_source:
 1539   coding->consumed_char += consumed_chars_base;
 1540   coding->consumed = src_base - coding->source;
 1541   coding->charbuf_used = charbuf - coding->charbuf;
 1542 }
 1543 
 1544 
 1545 static int
 1546 encode_coding_utf_8 (coding)
 1547      struct coding_system *coding;
 1548 {
 1549   int multibytep = coding->dst_multibyte;
 1550   int *charbuf = coding->charbuf;
 1551   int *charbuf_end = charbuf + coding->charbuf_used;
 1552   unsigned char *dst = coding->destination + coding->produced;
 1553   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 1554   int produced_chars = 0;
 1555   int c;
 1556 
 1557   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
 1558     {
 1559       ASSURE_DESTINATION (3);
 1560       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
 1561       CODING_UTF_8_BOM (coding) = utf_without_bom;
 1562     }
 1563 
 1564   if (multibytep)
 1565     {
 1566       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
 1567 
 1568       while (charbuf < charbuf_end)
 1569         {
 1570           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
 1571 
 1572           ASSURE_DESTINATION (safe_room);
 1573           c = *charbuf++;
 1574           if (CHAR_BYTE8_P (c))
 1575             {
 1576               c = CHAR_TO_BYTE8 (c);
 1577               EMIT_ONE_BYTE (c);
 1578             }
 1579           else
 1580             {
 1581               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
 1582               for (p = str; p < pend; p++)
 1583                 EMIT_ONE_BYTE (*p);
 1584             }
 1585         }
 1586     }
 1587   else
 1588     {
 1589       int safe_room = MAX_MULTIBYTE_LENGTH;
 1590 
 1591       while (charbuf < charbuf_end)
 1592         {
 1593           ASSURE_DESTINATION (safe_room);
 1594           c = *charbuf++;
 1595           if (CHAR_BYTE8_P (c))
 1596             *dst++ = CHAR_TO_BYTE8 (c);
 1597           else
 1598             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
 1599           produced_chars++;
 1600         }
 1601     }
 1602   record_conversion_result (coding, CODING_RESULT_SUCCESS);
 1603   coding->produced_char += produced_chars;
 1604   coding->produced = dst - coding->destination;
 1605   return 0;
 1606 }
 1607 
 1608 
 1609 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 1610    Check if a text is encoded in one of UTF-16 based coding systems.
 1611    If it is, return 1, else return 0.  */
 1612 
 1613 #define UTF_16_HIGH_SURROGATE_P(val) \
 1614   (((val) & 0xFC00) == 0xD800)
 1615 
 1616 #define UTF_16_LOW_SURROGATE_P(val) \
 1617   (((val) & 0xFC00) == 0xDC00)
 1618 
 1619 #define UTF_16_INVALID_P(val)   \
 1620   (((val) == 0xFFFE)            \
 1621    || ((val) == 0xFFFF)         \
 1622    || UTF_16_LOW_SURROGATE_P (val))
 1623 
 1624 
 1625 static int
 1626 detect_coding_utf_16 (coding, detect_info)
 1627      struct coding_system *coding;
 1628      struct coding_detection_info *detect_info;
 1629 {
 1630   const unsigned char *src = coding->source, *src_base = src;
 1631   const unsigned char *src_end = coding->source + coding->src_bytes;
 1632   int multibytep = coding->src_multibyte;
 1633   int consumed_chars = 0;
 1634   int c1, c2;
 1635 
 1636   detect_info->checked |= CATEGORY_MASK_UTF_16;
 1637   if (coding->mode & CODING_MODE_LAST_BLOCK
 1638       && (coding->src_chars & 1))
 1639     {
 1640       detect_info->rejected |= CATEGORY_MASK_UTF_16;
 1641       return 0;
 1642     }
 1643 
 1644   TWO_MORE_BYTES (c1, c2);
 1645   if ((c1 == 0xFF) && (c2 == 0xFE))
 1646     {
 1647       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
 1648                              | CATEGORY_MASK_UTF_16_AUTO);
 1649       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
 1650                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
 1651                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
 1652     }
 1653   else if ((c1 == 0xFE) && (c2 == 0xFF))
 1654     {
 1655       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
 1656                              | CATEGORY_MASK_UTF_16_AUTO);
 1657       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
 1658                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
 1659                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
 1660     }
 1661   else if (c2 < 0)
 1662     {
 1663       detect_info->rejected |= CATEGORY_MASK_UTF_16;
 1664       return 0;
 1665     }
 1666   else
 1667     {
 1668       /* We check the dispersion of Eth and Oth bytes where E is even and
 1669          O is odd.  If both are high, we assume binary data.*/
 1670       unsigned char e[256], o[256];
 1671       unsigned e_num = 1, o_num = 1;
 1672 
 1673       memset (e, 0, 256);
 1674       memset (o, 0, 256);
 1675       e[c1] = 1;
 1676       o[c2] = 1;
 1677 
 1678       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
 1679                                 |CATEGORY_MASK_UTF_16_BE
 1680                                 | CATEGORY_MASK_UTF_16_LE);
 1681 
 1682       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
 1683              != CATEGORY_MASK_UTF_16)
 1684         {
 1685           TWO_MORE_BYTES (c1, c2);
 1686           if (c2 < 0)
 1687             break;
 1688           if (! e[c1])
 1689             {
 1690               e[c1] = 1;
 1691               e_num++;
 1692               if (e_num >= 128)
 1693                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
 1694             }
 1695           if (! o[c2])
 1696             {
 1697               o[c2] = 1;
 1698               o_num++;
 1699               if (o_num >= 128)
 1700                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
 1701             }
 1702         }
 1703       return 0;
 1704     }
 1705 
 1706  no_more_source:
 1707   return 1;
 1708 }
 1709 
 1710 static void
 1711 decode_coding_utf_16 (coding)
 1712      struct coding_system *coding;
 1713 {
 1714   const unsigned char *src = coding->source + coding->consumed;
 1715   const unsigned char *src_end = coding->source + coding->src_bytes;
 1716   const unsigned char *src_base;
 1717   int *charbuf = coding->charbuf + coding->charbuf_used;
 1718   /* We may produces at most 3 chars in one loop.  */
 1719   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
 1720   int consumed_chars = 0, consumed_chars_base = 0;
 1721   int multibytep = coding->src_multibyte;
 1722   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
 1723   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
 1724   int surrogate = CODING_UTF_16_SURROGATE (coding);
 1725   Lisp_Object attr, charset_list;
 1726   int eol_crlf =
 1727     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
 1728   int byte_after_cr1 = -1, byte_after_cr2 = -1;
 1729 
 1730   CODING_GET_INFO (coding, attr, charset_list);
 1731 
 1732   if (bom == utf_with_bom)
 1733     {
 1734       int c, c1, c2;
 1735 
 1736       src_base = src;
 1737       ONE_MORE_BYTE (c1);
 1738       ONE_MORE_BYTE (c2);
 1739       c = (c1 << 8) | c2;
 1740 
 1741       if (endian == utf_16_big_endian
 1742           ? c != 0xFEFF : c != 0xFFFE)
 1743         {
 1744           /* The first two bytes are not BOM.  Treat them as bytes
 1745              for a normal character.  */
 1746           src = src_base;
 1747           coding->errors++;
 1748         }
 1749       CODING_UTF_16_BOM (coding) = utf_without_bom;
 1750     }
 1751   else if (bom == utf_detect_bom)
 1752     {
 1753       /* We have already tried to detect BOM and failed in
 1754          detect_coding.  */
 1755       CODING_UTF_16_BOM (coding) = utf_without_bom;
 1756     }
 1757 
 1758   while (1)
 1759     {
 1760       int c, c1, c2;
 1761 
 1762       src_base = src;
 1763       consumed_chars_base = consumed_chars;
 1764 
 1765       if (charbuf >= charbuf_end)
 1766         {
 1767           if (byte_after_cr1 >= 0)
 1768             src_base -= 2;
 1769           break;
 1770         }
 1771 
 1772       if (byte_after_cr1 >= 0)
 1773         c1 = byte_after_cr1, byte_after_cr1 = -1;
 1774       else
 1775         ONE_MORE_BYTE (c1);
 1776       if (c1 < 0)
 1777         {
 1778           *charbuf++ = -c1;
 1779           continue;
 1780         }
 1781       if (byte_after_cr2 >= 0)
 1782         c2 = byte_after_cr2, byte_after_cr2 = -1;
 1783       else
 1784         ONE_MORE_BYTE (c2);
 1785       if (c2 < 0)
 1786         {
 1787           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
 1788           *charbuf++ = -c2;
 1789           continue;
 1790         }
 1791       c = (endian == utf_16_big_endian
 1792            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
 1793 
 1794       if (surrogate)
 1795         {
 1796           if (! UTF_16_LOW_SURROGATE_P (c))
 1797             {
 1798               if (endian == utf_16_big_endian)
 1799                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
 1800               else
 1801                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
 1802               *charbuf++ = c1;
 1803               *charbuf++ = c2;
 1804               coding->errors++;
 1805               if (UTF_16_HIGH_SURROGATE_P (c))
 1806                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
 1807               else
 1808                 *charbuf++ = c;
 1809             }
 1810           else
 1811             {
 1812               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
 1813               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
 1814               *charbuf++ = 0x10000 + c;
 1815             }
 1816         }
 1817       else
 1818         {
 1819           if (UTF_16_HIGH_SURROGATE_P (c))
 1820             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
 1821           else
 1822             {
 1823               if (eol_crlf && c == '\r')
 1824                 {
 1825                   ONE_MORE_BYTE (byte_after_cr1);
 1826                   ONE_MORE_BYTE (byte_after_cr2);
 1827                 }
 1828               *charbuf++ = c;
 1829             }
 1830         }
 1831     }
 1832 
 1833  no_more_source:
 1834   coding->consumed_char += consumed_chars_base;
 1835   coding->consumed = src_base - coding->source;
 1836   coding->charbuf_used = charbuf - coding->charbuf;
 1837 }
 1838 
 1839 static int
 1840 encode_coding_utf_16 (coding)
 1841      struct coding_system *coding;
 1842 {
 1843   int multibytep = coding->dst_multibyte;
 1844   int *charbuf = coding->charbuf;
 1845   int *charbuf_end = charbuf + coding->charbuf_used;
 1846   unsigned char *dst = coding->destination + coding->produced;
 1847   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 1848   int safe_room = 8;
 1849   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
 1850   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
 1851   int produced_chars = 0;
 1852   Lisp_Object attrs, charset_list;
 1853   int c;
 1854 
 1855   CODING_GET_INFO (coding, attrs, charset_list);
 1856 
 1857   if (bom != utf_without_bom)
 1858     {
 1859       ASSURE_DESTINATION (safe_room);
 1860       if (big_endian)
 1861         EMIT_TWO_BYTES (0xFE, 0xFF);
 1862       else
 1863         EMIT_TWO_BYTES (0xFF, 0xFE);
 1864       CODING_UTF_16_BOM (coding) = utf_without_bom;
 1865     }
 1866 
 1867   while (charbuf < charbuf_end)
 1868     {
 1869       ASSURE_DESTINATION (safe_room);
 1870       c = *charbuf++;
 1871       if (c > MAX_UNICODE_CHAR)
 1872         c = coding->default_char;
 1873 
 1874       if (c < 0x10000)
 1875         {
 1876           if (big_endian)
 1877             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
 1878           else
 1879             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
 1880         }
 1881       else
 1882         {
 1883           int c1, c2;
 1884 
 1885           c -= 0x10000;
 1886           c1 = (c >> 10) + 0xD800;
 1887           c2 = (c & 0x3FF) + 0xDC00;
 1888           if (big_endian)
 1889             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
 1890           else
 1891             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
 1892         }
 1893     }
 1894   record_conversion_result (coding, CODING_RESULT_SUCCESS);
 1895   coding->produced = dst - coding->destination;
 1896   coding->produced_char += produced_chars;
 1897   return 0;
 1898 }
 1899 
 1900 
 1901 /*** 6. Old Emacs' internal format (emacs-mule) ***/
 1902 
 1903 /* Emacs' internal format for representation of multiple character
 1904    sets is a kind of multi-byte encoding, i.e. characters are
 1905    represented by variable-length sequences of one-byte codes.
 1906 
 1907    ASCII characters and control characters (e.g. `tab', `newline') are
 1908    represented by one-byte sequences which are their ASCII codes, in
 1909    the range 0x00 through 0x7F.
 1910 
 1911    8-bit characters of the range 0x80..0x9F are represented by
 1912    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 1913    code + 0x20).
 1914 
 1915    8-bit characters of the range 0xA0..0xFF are represented by
 1916    one-byte sequences which are their 8-bit code.
 1917 
 1918    The other characters are represented by a sequence of `base
 1919    leading-code', optional `extended leading-code', and one or two
 1920    `position-code's.  The length of the sequence is determined by the
 1921    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 1922    whereas extended leading-code and position-code take the range 0xA0
 1923    through 0xFF.  See `charset.h' for more details about leading-code
 1924    and position-code.
 1925 
 1926    --- CODE RANGE of Emacs' internal format ---
 1927    character set        range
 1928    -------------        -----
 1929    ascii                0x00..0x7F
 1930    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 1931    eight-bit-graphic    0xA0..0xBF
 1932    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 1933    ---------------------------------------------
 1934 
 1935    As this is the internal character representation, the format is
 1936    usually not used externally (i.e. in a file or in a data sent to a
 1937    process).  But, it is possible to have a text externally in this
 1938    format (i.e. by encoding by the coding system `emacs-mule').
 1939 
 1940    In that case, a sequence of one-byte codes has a slightly different
 1941    form.
 1942 
 1943    At first, all characters in eight-bit-control are represented by
 1944    one-byte sequences which are their 8-bit code.
 1945 
 1946    Next, character composition data are represented by the byte
 1947    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 1948    where,
 1949         METHOD is 0xF2 plus one of composition method (enum
 1950         composition_method),
 1951 
 1952         BYTES is 0xA0 plus a byte length of this composition data,
 1953 
 1954         CHARS is 0xA0 plus a number of characters composed by this
 1955         data,
 1956 
        COMPONENTs are characters in multibyte form or composition
        rules encoded as two bytes of ASCII codes.
 1959 
 1960    In addition, for backward compatibility, the following formats are
 1961    also recognized as composition data on decoding.
 1962 
 1963    0x80 MSEQ ...
 1964    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 1965 
 1966    Here,
        MSEQ is a multibyte form but in one of these special formats:
 1968           ASCII: 0xA0 ASCII_CODE+0x80,
 1969           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 1970         RULE is a one byte code of the range 0xA0..0xF0 that
 1971         represents a composition rule.
 1972   */
 1973 
/* Table indexed by a byte value.  The code in this section uses the
   entry for a leading byte as the total length in bytes (1 through
   4) of the emacs-mule sequence that starts with it.  The table is
   filled in elsewhere.  */
char emacs_mule_bytes[256];
 1975 
 1976 
 1977 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 1978    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
 1979    else return 0.  */
 1980 
 1981 static int
 1982 detect_coding_emacs_mule (coding, detect_info)
 1983      struct coding_system *coding;
 1984      struct coding_detection_info *detect_info;
 1985 {
 1986   const unsigned char *src = coding->source, *src_base;
 1987   const unsigned char *src_end = coding->source + coding->src_bytes;
 1988   int multibytep = coding->src_multibyte;
 1989   int consumed_chars = 0;
 1990   int c;
 1991   int found = 0;
 1992 
 1993   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
 1994   /* A coding system of this category is always ASCII compatible.  */
 1995   src += coding->head_ascii;
 1996 
 1997   while (1)
 1998     {
 1999       src_base = src;
 2000       ONE_MORE_BYTE (c);
 2001       if (c < 0)
 2002         continue;
 2003       if (c == 0x80)
 2004         {
 2005           /* Perhaps the start of composite character.  We simply skip
 2006              it because analyzing it is too heavy for detecting.  But,
 2007              at least, we check that the composite character
 2008              constitutes of more than 4 bytes.  */
 2009           const unsigned char *src_base;
 2010 
 2011         repeat:
 2012           src_base = src;
 2013           do
 2014             {
 2015               ONE_MORE_BYTE (c);
 2016             }
 2017           while (c >= 0xA0);
 2018 
 2019           if (src - src_base <= 4)
 2020             break;
 2021           found = CATEGORY_MASK_EMACS_MULE;
 2022           if (c == 0x80)
 2023             goto repeat;
 2024         }
 2025 
 2026       if (c < 0x80)
 2027         {
 2028           if (c < 0x20
 2029               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
 2030             break;
 2031         }
 2032       else
 2033         {
 2034           int more_bytes = emacs_mule_bytes[*src_base] - 1;
 2035 
 2036           while (more_bytes > 0)
 2037             {
 2038               ONE_MORE_BYTE (c);
 2039               if (c < 0xA0)
 2040                 {
 2041                   src--;        /* Unread the last byte.  */
 2042                   break;
 2043                 }
 2044               more_bytes--;
 2045             }
 2046           if (more_bytes != 0)
 2047             break;
 2048           found = CATEGORY_MASK_EMACS_MULE;
 2049         }
 2050     }
 2051   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
 2052   return 0;
 2053 
 2054  no_more_source:
 2055   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
 2056     {
 2057       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
 2058       return 0;
 2059     }
 2060   detect_info->found |= found;
 2061   return 1;
 2062 }
 2063 
 2064 
 2065 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
 2066    character.  If CMP_STATUS indicates that we must expect MSEQ or
 2067    RULE described above, decode it and return the negative value of
 2068    the decoded character or rule.  If an invalid byte is found, return
 2069    -1.  If SRC is too short, return -2.  */
 2070 
 2071 int
 2072 emacs_mule_char (coding, src, nbytes, nchars, id, cmp_status)
 2073      struct coding_system *coding;
 2074      const unsigned char *src;
 2075      int *nbytes, *nchars, *id;
 2076      struct composition_status *cmp_status;
 2077 {
 2078   const unsigned char *src_end = coding->source + coding->src_bytes;
 2079   const unsigned char *src_base = src;
 2080   int multibytep = coding->src_multibyte;
 2081   struct charset *charset;
 2082   unsigned code;
 2083   int c;
 2084   int consumed_chars = 0;
 2085   int mseq_found = 0;
 2086 
 2087   ONE_MORE_BYTE (c);
 2088   if (c < 0)
 2089     {
 2090       c = -c;
 2091       charset = emacs_mule_charset[0];
 2092     }
 2093   else
 2094     {
 2095       if (c >= 0xA0)
 2096         {
 2097           if (cmp_status->state != COMPOSING_NO
 2098               && cmp_status->old_form)
 2099             {
 2100               if (cmp_status->state == COMPOSING_CHAR)
 2101                 {
 2102                   if (c == 0xA0)
 2103                     {
 2104                       ONE_MORE_BYTE (c);
 2105                       c -= 0x80;
 2106                       if (c < 0)
 2107                         goto invalid_code;
 2108                     }
 2109                   else
 2110                     c -= 0x20;
 2111                   mseq_found = 1;
 2112                 }
 2113               else
 2114                 {
 2115                   *nbytes = src - src_base;
 2116                   *nchars = consumed_chars;
 2117                   return -c;
 2118                 }
 2119             }
 2120           else
 2121             goto invalid_code;
 2122         }
 2123 
 2124       switch (emacs_mule_bytes[c])
 2125         {
 2126         case 2:
 2127           if (! (charset = emacs_mule_charset[c]))
 2128             goto invalid_code;
 2129           ONE_MORE_BYTE (c);
 2130           if (c < 0xA0)
 2131             goto invalid_code;
 2132           code = c & 0x7F;
 2133           break;
 2134 
 2135         case 3:
 2136           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
 2137               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
 2138             {
 2139               ONE_MORE_BYTE (c);
 2140               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
 2141                 goto invalid_code;
 2142               ONE_MORE_BYTE (c);
 2143               if (c < 0xA0)
 2144                 goto invalid_code;
 2145               code = c & 0x7F;
 2146             }
 2147           else
 2148             {
 2149               if (! (charset = emacs_mule_charset[c]))
 2150                 goto invalid_code;
 2151               ONE_MORE_BYTE (c);
 2152               if (c < 0xA0)
 2153                 goto invalid_code;
 2154               code = (c & 0x7F) << 8;
 2155               ONE_MORE_BYTE (c);
 2156               if (c < 0xA0)
 2157                 goto invalid_code;
 2158               code |= c & 0x7F;
 2159             }
 2160           break;
 2161 
 2162         case 4:
 2163           ONE_MORE_BYTE (c);
 2164           if (c < 0 || ! (charset = emacs_mule_charset[c]))
 2165             goto invalid_code;
 2166           ONE_MORE_BYTE (c);
 2167           if (c < 0xA0)
 2168             goto invalid_code;
 2169           code = (c & 0x7F) << 8;
 2170           ONE_MORE_BYTE (c);
 2171           if (c < 0xA0)
 2172             goto invalid_code;
 2173           code |= c & 0x7F;
 2174           break;
 2175 
 2176         case 1:
 2177           code = c;
 2178           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
 2179                                      ? charset_ascii : charset_eight_bit);
 2180           break;
 2181 
 2182         default:
 2183           abort ();
 2184         }
 2185       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, code, c);
 2186       if (c < 0)
 2187         goto invalid_code;
 2188     }
 2189   *nbytes = src - src_base;
 2190   *nchars = consumed_chars;
 2191   if (id)
 2192     *id = charset->id;
 2193   return (mseq_found ? -c : c);
 2194 
 2195  no_more_source:
 2196   return -2;
 2197 
 2198  invalid_code:
 2199   return -1;
 2200 }
 2201 
 2202 
 2203 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 2204 
 2205 /* Handle these composition sequence ('|': the end of header elements,
 2206    BYTES and CHARS >= 0xA0):
 2207 
 2208    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
 2209    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
 2210    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
 2211 
   and these old forms:
 2213   
 2214    (4) relative composition: 0x80 | MSEQ ... MSEQ
 2215    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
 2216 
 2217    When the starter 0x80 and the following header elements are found,
 2218    this annotation header is produced.
 2219 
 2220         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
 2221 
 2222    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
 2223    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
 2224 
 2225    Then, upon reading the following elements, these codes are produced
 2226    until the composition end is found:
 2227 
 2228    (1) CHAR ... CHAR
 2229    (2) ALT ... ALT CHAR ... CHAR
 2230    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
 2231    (4) CHAR ... CHAR
 2232    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
 2233 
   When the composition end is found, LENGTH and NCHARS in the
   annotation header are updated as below:
 2236 
 2237    (1) LENGTH: unchanged, NCHARS: unchanged
 2238    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
 2239    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
 2240    (4) LENGTH: unchanged,  NCHARS: number of CHARs
 2241    (5) LENGTH: unchanged,  NCHARS: number of CHARs
 2242 
 2243    If an error is found while composing, the annotation header is
 2244    changed to the original composition header (plus filler -1s) as
 2245    below:
 2246 
 2247    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
   (5)          [ 0x80 0xFF -1 -1 -1 ]
 2249 
 2250    and the sequence [ -2 DECODED-RULE ] is changed to the original
 2251    byte sequence as below:
 2252         o the original byte sequence is B: [ B -1 ]
 2253         o the original byte sequence is B1 B2: [ B1 B2 ]
 2254 
 2255    Most of the routines are implemented by macros because many
 2256    variables and labels in the caller decode_coding_emacs_mule must be
 2257    accessible, and they are usually called just once (thus doesn't
 2258    increase the size of compiled object).  */
 2259 
 2260 /* Decode a composition rule represented by C as a component of
 2261    composition sequence of Emacs 20 style.  Set RULE to the decoded
 2262    rule. */
 2263 
 2264 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
 2265   do {                                                  \
 2266     int gref, nref;                                     \
 2267                                                         \
 2268     c -= 0xA0;                                          \
 2269     if (c < 0 || c >= 81)                               \
 2270       goto invalid_code;                                \
 2271     gref = c / 9, nref = c % 9;                         \
 2272     if (gref == 4) gref = 10;                           \
 2273     if (nref == 4) nref = 10;                           \
 2274     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
 2275   } while (0)
 2276 
 2277 
 2278 /* Decode a composition rule represented by C and the following byte
 2279    at SRC as a component of composition sequence of Emacs 21 style.
 2280    Set RULE to the decoded rule.  */
 2281 
 2282 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
 2283   do {                                                  \
 2284     int gref, nref;                                     \
 2285                                                         \
 2286     gref = c - 0x20;                                    \
 2287     if (gref < 0 || gref >= 81)                         \
 2288       goto invalid_code;                                \
 2289     ONE_MORE_BYTE (c);                                  \
 2290     nref = c - 0x20;                                    \
 2291     if (nref < 0 || nref >= 81)                         \
 2292       goto invalid_code;                                \
 2293     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
 2294   } while (0)
 2295 
 2296 
 2297 /* Start of Emacs 21 style format.  The first three bytes at SRC are
 2298    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
 2299    byte length of this composition information, CHARS is the number of
 2300    characters composed by this composition.  */
 2301 
 2302 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
 2303   do {                                                                  \
 2304     enum composition_method method = c - 0xF2;                          \
 2305     int *charbuf_base = charbuf;                                        \
 2306     int nbytes, nchars;                                                 \
 2307                                                                         \
 2308     ONE_MORE_BYTE (c);                                                  \
 2309     if (c < 0)                                                          \
 2310       goto invalid_code;                                                \
 2311     nbytes = c - 0xA0;                                                  \
 2312     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
 2313       goto invalid_code;                                                \
 2314     ONE_MORE_BYTE (c);                                                  \
 2315     nchars = c - 0xA0;                                                  \
 2316     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
 2317       goto invalid_code;                                                \
 2318     cmp_status->old_form = 0;                                           \
 2319     cmp_status->method = method;                                        \
 2320     if (method == COMPOSITION_RELATIVE)                                 \
 2321       cmp_status->state = COMPOSING_CHAR;                               \
 2322     else                                                                \
 2323       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
 2324     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
 2325     cmp_status->nchars = nchars;                                        \
 2326     cmp_status->ncomps = nbytes - 4;                                    \
 2327     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
 2328   } while (0)
 2329 
 2330 
 2331 /* Start of Emacs 20 style format for relative composition.  */
 2332 
 2333 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
 2334   do {                                                          \
 2335     cmp_status->old_form = 1;                                   \
 2336     cmp_status->method = COMPOSITION_RELATIVE;                  \
 2337     cmp_status->state = COMPOSING_CHAR;                         \
 2338     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
 2339     cmp_status->nchars = cmp_status->ncomps = 0;                \
 2340     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
 2341   } while (0)
 2342 
 2343 
 2344 /* Start of Emacs 20 style format for rule-base composition.  */
 2345 
 2346 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
 2347   do {                                                          \
 2348     cmp_status->old_form = 1;                                   \
 2349     cmp_status->method = COMPOSITION_WITH_RULE;                 \
 2350     cmp_status->state = COMPOSING_CHAR;                         \
 2351     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
 2352     cmp_status->nchars = cmp_status->ncomps = 0;                \
 2353     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
 2354   } while (0)
 2355 
 2356 
 2357 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
 2358   do {                                                  \
 2359     const unsigned char *current_src = src;             \
 2360                                                         \
 2361     ONE_MORE_BYTE (c);                                  \
 2362     if (c < 0)                                          \
 2363       goto invalid_code;                                \
 2364     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
 2365         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
 2366       DECODE_EMACS_MULE_21_COMPOSITION ();              \
 2367     else if (c < 0xA0)                                  \
 2368       goto invalid_code;                                \
 2369     else if (c < 0xC0)                                  \
 2370       {                                                 \
 2371         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
 2372         /* Re-read C as a composition component.  */    \
 2373         src = current_src;                              \
 2374       }                                                 \
 2375     else if (c == 0xFF)                                 \
 2376       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
 2377     else                                                \
 2378       goto invalid_code;                                \
 2379   } while (0)
 2380 
 2381 #define EMACS_MULE_COMPOSITION_END()                            \
 2382   do {                                                          \
 2383     int idx = - cmp_status->length;                             \
 2384                                                                 \
 2385     if (cmp_status->old_form)                                   \
 2386       charbuf[idx + 2] = cmp_status->nchars;                    \
 2387     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
 2388       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
 2389     cmp_status->state = COMPOSING_NO;                           \
 2390   } while (0)
 2391 
 2392 
 2393 static int
 2394 emacs_mule_finish_composition (charbuf, cmp_status)
 2395      int *charbuf;
 2396      struct composition_status *cmp_status;
 2397 {
 2398   int idx = - cmp_status->length;
 2399   int new_chars;
 2400 
 2401   if (cmp_status->old_form && cmp_status->nchars > 0)
 2402     {
 2403       charbuf[idx + 2] = cmp_status->nchars;
 2404       new_chars = 0;
 2405       if (cmp_status->method == COMPOSITION_WITH_RULE
 2406           && cmp_status->state == COMPOSING_CHAR)
 2407         {
 2408           /* The last rule was invalid.  */
 2409           int rule = charbuf[-1] + 0xA0;
 2410 
 2411           charbuf[-2] = BYTE8_TO_CHAR (rule);
 2412           charbuf[-1] = -1;
 2413           new_chars = 1;
 2414         }
 2415     }
 2416   else
 2417     {
 2418       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
 2419 
 2420       if (cmp_status->method == COMPOSITION_WITH_RULE)
 2421         {
 2422           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
 2423           charbuf[idx++] = -3;
 2424           charbuf[idx++] = 0;
 2425           new_chars = 1;
 2426         }
 2427       else
 2428         {
 2429           int nchars = charbuf[idx + 1] + 0xA0;
 2430           int nbytes = charbuf[idx + 2] + 0xA0;
 2431 
 2432           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
 2433           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
 2434           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
 2435           charbuf[idx++] = -1;
 2436           new_chars = 4;
 2437         }
 2438     }
 2439   cmp_status->state = COMPOSING_NO;
 2440   return new_chars;
 2441 }
 2442 
 2443 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
 2444   do {                                                                    \
 2445     if (cmp_status->state != COMPOSING_NO)                                \
 2446       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
 2447   } while (0)
 2448 
 2449 
/* Decode the source text of CODING, which is in emacs-mule format.

   Bytes are read starting at CODING->source + CODING->consumed, and
   the decoded characters and annotations are appended at
   CODING->charbuf + CODING->charbuf_used.  Before returning,
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   are updated to reflect the last completely handled unit.

   A composition that is still in progress when the source runs out
   is either closed (if CODING_MODE_LAST_BLOCK is set in CODING->mode)
   or saved in the carryover area of the composition status, from
   where the next call restores it.  */
static void
decode_coding_emacs_mule (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the unit handled by the current loop iteration.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce two annotations (charset and composition) in one
     loop and one more charset annotation at the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  Lisp_Object attrs, charset_list;
  int char_offset = coding->produced_char;
  /* Character offset at which the current run of LAST_ID started.  */
  int last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte read ahead after a CR, or -1 if there is none pending.  */
  int byte_after_cr = -1;
  struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;

  CODING_GET_INFO (coding, attrs, charset_list);

  /* A composition was left unfinished by the previous call: put the
     saved data back at the head of the output.  */
  if (cmp_status->state != COMPOSING_NO)
    {
      int i;

      for (i = 0; i < cmp_status->length; i++)
        *charbuf++ = cmp_status->carryover[i];
      coding->annotated = 1;
    }

  while (1)
    {
      int c, id;

      /* Remember where this iteration starts so that it can be
         rewound on invalid input or when the source runs out.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No room left.  A byte read ahead after CR has not been
             processed yet, so do not report it as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);

      /* A negative C or the byte 0x80 closes any composition in
         progress.  A negative C is then stored negated as a
         character, while 0x80 starts a new composition.
         NOTE(review): presumably ONE_MORE_BYTE yields a negative C
         for a multibyte source character that is not a raw byte --
         confirm against its definition.  */
      if (c < 0 || c == 0x80)
        {
          EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
          if (c < 0)
            {
              *charbuf++ = -c;
              char_offset++;
            }
          else
            DECODE_EMACS_MULE_COMPOSITION_START ();
          continue;
        }

      if (c < 0x80)
        {
          /* With DOS end-of-line handling, read the byte after a CR
             ahead; it becomes C on the next iteration.  */
          if (eol_crlf && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          id = charset_ascii;
          if (cmp_status->state != COMPOSING_NO)
            {
              /* An ASCII byte terminates an old form composition; in
                 a new form composition it counts as one component
                 while components are being read.  */
              if (cmp_status->old_form)
                EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
              else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
                cmp_status->ncomps--;
            }
        }
      else
        {
          int nchars, nbytes;
          /* emacs_mule_char can load a charset map from a file, which
             allocates a large structure and might cause buffer text
             to be relocated as result.  Thus, we need to remember the
             original pointer to buffer text, and fixup all related
             pointers after the call.  */
          const unsigned char *orig = coding->source;
          EMACS_INT offset;

          c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
                               cmp_status);
          offset = coding->source - orig;
          if (offset)
            {
              src += offset;
              src_base += offset;
              src_end += offset;
            }
          /* -1 is handled as invalid input and -2 stops decoding;
             other negative values fall through to the composition
             handling below.
             NOTE(review): -2 presumably means the source ended in the
             middle of a character -- confirm against emacs_mule_char.  */
          if (c < 0)
            {
              if (c == -1)
                goto invalid_code;
              if (c == -2)
                break;
            }
          src = src_base + nbytes;
          consumed_chars = consumed_chars_base + nchars;
          if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
            cmp_status->ncomps -= nchars;
        }

      /* Now if C >= 0, we found a normally encoded character, if C <
         0, we found an old-style composition component character or
         rule.  */

      if (cmp_status->state == COMPOSING_NO)
        {
          /* Plain text.  When the charset changes, close the run of
             the previous non-ASCII charset with a charset
             annotation.  */
          if (last_id != id)
            {
              if (last_id != charset_ascii)
                ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
                                  last_id);
              last_id = id;
              last_offset = char_offset;
            }
          *charbuf++ = c;
          char_offset++;
        }
      else if (cmp_status->state == COMPOSING_CHAR)
        {
          if (cmp_status->old_form)
            {
              if (c >= 0)
                {
                  /* A normally encoded character ends an old form
                     composition.  */
                  EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
                  *charbuf++ = c;
                  char_offset++;
                }
              else
                {
                  /* One more composed character.  */
                  *charbuf++ = -c;
                  cmp_status->nchars++;
                  cmp_status->length++;
                  if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
                    EMACS_MULE_COMPOSITION_END ();
                  else if (cmp_status->method == COMPOSITION_WITH_RULE)
                    cmp_status->state = COMPOSING_RULE;
                }
            }
          else
            {
              /* New form: NCHARS counts down the characters that are
                 still expected.  */
              *charbuf++ = c;
              cmp_status->length++;
              cmp_status->nchars--;
              if (cmp_status->nchars == 0)
                EMACS_MULE_COMPOSITION_END ();
            }
        }
      else if (cmp_status->state == COMPOSING_RULE)
        {
          int rule;

          if (c >= 0)
            {
              /* A normally encoded character where a rule was
                 expected ends the composition.  */
              EMACS_MULE_COMPOSITION_END ();
              *charbuf++ = c;
              char_offset++;
            }
          else
            {
              /* Store the rule as the pair (-2, RULE) and expect a
                 character next.  */
              c = -c;
              DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
              if (rule < 0)
                goto invalid_code;
              *charbuf++ = -2;
              *charbuf++ = rule;
              cmp_status->length += 2;
              cmp_status->state = COMPOSING_CHAR;
            }
        }
      else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
        {
          /* NCOMPS was already decremented for this component above.
             Zero means that the composed characters follow; a
             negative value closes the composition early.  */
          *charbuf++ = c;
          cmp_status->length++;
          if (cmp_status->ncomps == 0)
            cmp_status->state = COMPOSING_CHAR;
          else if (cmp_status->ncomps > 0)
            {
              if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
                cmp_status->state = COMPOSING_COMPONENT_RULE;
            }
          else
            EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
        }
      else                      /* COMPOSING_COMPONENT_RULE */
        {
          int rule;

          /* Store the rule as the pair (-2, RULE); a component
             character must follow, otherwise the composition is
             closed.  */
          DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
          if (rule < 0)
            goto invalid_code;
          *charbuf++ = -2;
          *charbuf++ = rule;
          cmp_status->length += 2;
          cmp_status->ncomps--;
          if (cmp_status->ncomps > 0)
            cmp_status->state = COMPOSING_COMPONENT_CHAR;
          else
            EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
        }
      continue;

      /* Rewind to the start of this iteration and try again.
         NOTE(review): no goto visible in this function targets this
         label; confirm whether one of the macros used here does.  */
    retry:
      src = src_base;
      consumed_chars = consumed_chars_base;
      continue;

      /* Invalid input: close any composition, rewind to the start of
         this iteration, and pass its first byte through, either as is
         (ASCII) or as a raw-byte character, counting one error.  */
    invalid_code:
      EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  if (cmp_status->state != COMPOSING_NO)
    {
      if (coding->mode & CODING_MODE_LAST_BLOCK)
        EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
      else
        {
          int i;

          /* Take the unfinished composition data back out of the
             output and keep it for the next call.  */
          charbuf -= cmp_status->length;
          for (i = 0; i < cmp_status->length; i++)
            cmp_status->carryover[i] = charbuf[i];
        }
    }
  /* Close the final run of a non-ASCII charset.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
 2698 
 2699 
 2700 #define EMACS_MULE_LEADING_CODES(id, codes)     \
 2701   do {                                          \
 2702     if (id < 0xA0)                              \
 2703       codes[0] = id, codes[1] = 0;              \
 2704     else if (id < 0xE0)                         \
 2705       codes[0] = 0x9A, codes[1] = id;           \
 2706     else if (id < 0xF0)                         \
 2707       codes[0] = 0x9B, codes[1] = id;           \
 2708     else if (id < 0xF5)                         \
 2709       codes[0] = 0x9C, codes[1] = id;           \
 2710     else                                        \
 2711       codes[0] = 0x9D, codes[1] = id;           \
 2712   } while (0);
 2713 
 2714 
 2715 static int
 2716 encode_coding_emacs_mule (coding)
 2717      struct coding_system *coding;
 2718 {
 2719   int multibytep = coding->dst_multibyte;
 2720   int *charbuf = coding->charbuf;
 2721   int *charbuf_end = charbuf + coding->charbuf_used;
 2722   unsigned char *dst = coding->destination + coding->produced;
 2723   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 2724   int safe_room = 8;
 2725   int produced_chars = 0;
 2726   Lisp_Object attrs, charset_list;
 2727   int c;
 2728   int preferred_charset_id = -1;
 2729 
 2730   CODING_GET_INFO (coding, attrs, charset_list);
 2731   if (! EQ (charset_list, Vemacs_mule_charset_list))
 2732     {
 2733       CODING_ATTR_CHARSET_LIST (attrs)
 2734         = charset_list = Vemacs_mule_charset_list;
 2735     }
 2736 
 2737   while (charbuf < charbuf_end)
 2738     {
 2739       ASSURE_DESTINATION (safe_room);
 2740       c = *charbuf++;
 2741 
 2742       if (c < 0)
 2743         {
 2744           /* Handle an annotation.  */
 2745           switch (*charbuf)
 2746             {
 2747             case CODING_ANNOTATE_COMPOSITION_MASK:
 2748               /* Not yet implemented.  */
 2749               break;
 2750             case CODING_ANNOTATE_CHARSET_MASK:
 2751               preferred_charset_id = charbuf[3];
 2752               if (preferred_charset_id >= 0
 2753                   && NILP (Fmemq (make_number (preferred_charset_id),
 2754                                   charset_list)))
 2755                 preferred_charset_id = -1;
 2756               break;
 2757             default:
 2758               abort ();
 2759             }
 2760           charbuf += -c - 1;
 2761           continue;
 2762         }
 2763 
 2764       if (ASCII_CHAR_P (c))
 2765         EMIT_ONE_ASCII_BYTE (c);
 2766       else if (CHAR_BYTE8_P (c))
 2767         {
 2768           c = CHAR_TO_BYTE8 (c);
 2769           EMIT_ONE_BYTE (c);
 2770         }
 2771       else
 2772         {
 2773           struct charset *charset;
 2774           unsigned code;
 2775           int dimension;
 2776           int emacs_mule_id;
 2777           unsigned char leading_codes[2];
 2778 
 2779           if (preferred_charset_id >= 0)
 2780             {
 2781               charset = CHARSET_FROM_ID (preferred_charset_id);
 2782               if (CHAR_CHARSET_P (c, charset))
 2783                 code = ENCODE_CHAR (charset, c);
 2784               else
 2785                 charset = char_charset (c, charset_list, &code);
 2786             }
 2787           else
 2788             charset = char_charset (c, charset_list, &code);
 2789           if (! charset)
 2790             {
 2791               c = coding->default_char;
 2792               if (ASCII_CHAR_P (c))
 2793                 {
 2794                   EMIT_ONE_ASCII_BYTE (c);
 2795                   continue;
 2796                 }
 2797               charset = char_charset (c, charset_list, &code);
 2798             }
 2799           dimension = CHARSET_DIMENSION (charset);
 2800           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
 2801           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
 2802           EMIT_ONE_BYTE (leading_codes[0]);
 2803           if (leading_codes[1])
 2804             EMIT_ONE_BYTE (leading_codes[1]);
 2805           if (dimension == 1)
 2806             EMIT_ONE_BYTE (code | 0x80);
 2807           else
 2808             {
 2809               code |= 0x8080;
 2810               EMIT_ONE_BYTE (code >> 8);
 2811               EMIT_ONE_BYTE (code & 0xFF);
 2812             }
 2813         }
 2814     }
 2815   record_conversion_result (coding, CODING_RESULT_SUCCESS);
 2816   coding->produced_char += produced_chars;
 2817   coding->produced = dst - coding->destination;
 2818   return 0;
 2819 }
 2820 
 2821 
 2822 /*** 7. ISO2022 handlers ***/
 2823 
 2824 /* The following note describes the coding system ISO2022 briefly.
 2825    Since the intention of this note is to help understand the
 2826    functions in this file, some parts are NOT ACCURATE or are OVERLY
 2827    SIMPLIFIED.  For thorough understanding, please refer to the
 2828    original document of ISO2022.  This is equivalent to the standard
 2829    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
 2830 
 2831    ISO2022 provides many mechanisms to encode several character sets
 2832    in 7-bit and 8-bit environments.  For 7-bit environments, all text
 2833    is encoded using bytes less than 128.  This may make the encoded
 2834    text a little bit longer, but the text passes more easily through
 2835    several types of gateway, some of which strip off the MSB (Most
 2836    Significant Bit).
 2837 
 2838    There are two kinds of character sets: control character sets and
 2839    graphic character sets.  The former contain control characters such
 2840    as `newline' and `escape' to provide control functions (control
 2841    functions are also provided by escape sequences).  The latter
 2842    contain graphic characters such as 'A' and '-'.  Emacs recognizes
 2843    two control character sets and many graphic character sets.
 2844 
 2845    Graphic character sets are classified into one of the following
 2846    four classes, according to the number of bytes (DIMENSION) and
 2847    number of characters in one dimension (CHARS) of the set:
 2848    - DIMENSION1_CHARS94
 2849    - DIMENSION1_CHARS96
 2850    - DIMENSION2_CHARS94
 2851    - DIMENSION2_CHARS96
 2852 
 2853    In addition, each character set is assigned an identification tag,
 2854    unique for each set, called the "final character" (denoted as <F>
 2855    hereafter).  The <F> of each character set is decided by ECMA(*)
 2856    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
 2857    (0x30..0x3F are for private use only).
 2858 
 2859    Note (*): ECMA = European Computer Manufacturers Association
 2860 
 2861    Here are examples of graphic character sets [NAME(<F>)]:
 2862         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 2863         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 2864         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 2865         o DIMENSION2_CHARS96 -- none for the moment
 2866 
 2867    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
 2868         C0 [0x00..0x1F] -- control character plane 0
 2869         GL [0x20..0x7F] -- graphic character plane 0
 2870         C1 [0x80..0x9F] -- control character plane 1
 2871         GR [0xA0..0xFF] -- graphic character plane 1
 2872 
 2873    A control character set is directly designated and invoked to C0 or
 2874    C1 by an escape sequence.  The most common case is that:
 2875    - ISO646's  control character set is designated/invoked to C0, and
 2876    - ISO6429's control character set is designated/invoked to C1,
 2877    and usually these designations/invocations are omitted in encoded
 2878    text.  In a 7-bit environment, only C0 can be used, and a control
 2879    character for C1 is encoded by an appropriate escape sequence to
 2880    fit into the environment.  All control characters for C1 are
 2881    defined to have corresponding escape sequences.
 2882 
 2883    A graphic character set is at first designated to one of four
 2884    graphic registers (G0 through G3), then these graphic registers are
 2885    invoked to GL or GR.  These designations and invocations can be
 2886    done independently.  The most common case is that G0 is invoked to
 2887    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
 2888    these invocations and designations are omitted in encoded text.
 2889    In a 7-bit environment, only GL can be used.
 2890 
 2891    When a graphic character set of CHARS94 is invoked to GL, codes
 2892    0x20 and 0x7F of the GL area work as control characters SPACE and
 2893    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
 2894    be used.
 2895 
 2896    There are two ways of invocation: locking-shift and single-shift.
 2897    With locking-shift, the invocation lasts until the next different
 2898    invocation, whereas with single-shift, the invocation affects the
 2899    following character only and doesn't affect the locking-shift
 2900    state.  Invocations are done by the following control characters or
 2901    escape sequences:
 2902 
 2903    ----------------------------------------------------------------------
 2904    abbrev  function                  cntrl escape seq   description
 2905    ----------------------------------------------------------------------
 2906    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
 2907    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
 2908    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
 2909    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
 2910    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
 2911    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
 2912    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
 2913    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
 2914    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
 2915    ----------------------------------------------------------------------
 2916    (*) These are not used by any known coding system.
 2917 
 2918    Control characters for these functions are defined by macros
 2919    ISO_CODE_XXX in `coding.h'.
 2920 
 2921    Designations are done by the following escape sequences:
 2922    ----------------------------------------------------------------------
 2923    escape sequence      description
 2924    ----------------------------------------------------------------------
 2925    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 2926    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 2927    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 2928    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 2929    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 2930    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 2931    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 2932    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 2933    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 2934    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 2935    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 2936    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 2937    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 2938    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 2939    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 2940    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 2941    ----------------------------------------------------------------------
 2942 
 2943    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 2944    of dimension 1, chars 94, and final character <F>, etc...
 2945 
 2946    Note (*): Although these designations are not allowed in ISO2022,
 2947    Emacs accepts them on decoding, and produces them on encoding
 2948    CHARS96 character sets in a coding system which is characterized as
 2949    7-bit environment, non-locking-shift, and non-single-shift.
 2950 
 2951    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 2952    '(' must be omitted.  We refer to this as "short-form" hereafter.
 2953 
 2954    Now you may notice that there are a lot of ways of encoding the
 2955    same multilingual text in ISO2022.  Actually, there exist many
   coding systems such as Compound Text (used in X11's inter-client
   communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 2958    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 2959    localized platforms), and all of these are variants of ISO2022.
 2960 
 2961    In addition to the above, Emacs handles two more kinds of escape
 2962    sequences: ISO6429's direction specification and Emacs' private
 2963    sequence for specifying character composition.
 2964 
 2965    ISO6429's direction specification takes the following form:
 2966         o CSI ']'      -- end of the current direction
 2967         o CSI '0' ']'  -- end of the current direction
 2968         o CSI '1' ']'  -- start of left-to-right text
 2969         o CSI '2' ']'  -- start of right-to-left text
 2970    The control character CSI (0x9B: control sequence introducer) is
 2971    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
 2972 
 2973    Character composition specification takes the following form:
 2974         o ESC '0' -- start relative composition
 2975         o ESC '1' -- end composition
 2976         o ESC '2' -- start rule-base composition (*)
 2977         o ESC '3' -- start relative composition with alternate chars  (**)
 2978         o ESC '4' -- start rule-base composition with alternate chars  (**)
 2979   Since these are not standard escape sequences of any ISO standard,
 2980   the use of them with these meanings is restricted to Emacs only.
 2981 
 2982   (*) This form is used only in Emacs 20.7 and older versions,
 2983   but newer versions can safely decode it.
 2984   (**) This form is used only in Emacs 21.1 and newer versions,
 2985   and older versions can't decode it.
 2986 
 2987   Here's a list of example usages of these composition escape
 2988   sequences (categorized by `enum composition_method').
 2989 
 2990   COMPOSITION_RELATIVE:
 2991         ESC 0 CHAR [ CHAR ] ESC 1
 2992   COMPOSITION_WITH_RULE:
 2993         ESC 2 CHAR [ RULE CHAR ] ESC 1
 2994   COMPOSITION_WITH_ALTCHARS:
 2995         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
 2996   COMPOSITION_WITH_RULE_ALTCHARS:
 2997         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
 2998 
/* Table of ISO code classes with 256 entries.
   NOTE(review): presumably indexed by byte value -- confirm against
   the code that fills it.  */
enum iso_code_class_type iso_code_class[256];
 3000 
 3001 #define SAFE_CHARSET_P(coding, id)      \
 3002   ((id) <= (coding)->max_charset_id     \
 3003    && (coding)->safe_charsets[id] != 255)
 3004 
 3005 
 3006 #define SHIFT_OUT_OK(category)  \
 3007   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
 3008 
 3009 static void
 3010 setup_iso_safe_charsets (attrs)
 3011      Lisp_Object attrs;
 3012 {
 3013   Lisp_Object charset_list, safe_charsets;
 3014   Lisp_Object request;
 3015   Lisp_Object reg_usage;
 3016   Lisp_Object tail;
 3017   int reg94, reg96;
 3018   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
 3019   int max_charset_id;
 3020 
 3021   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
 3022   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
 3023       && ! EQ (charset_list, Viso_2022_charset_list))
 3024     {
 3025       CODING_ATTR_CHARSET_LIST (attrs)
 3026         = charset_list = Viso_2022_charset_list;
 3027       ASET (attrs, coding_attr_safe_charsets, Qnil);
 3028     }
 3029 
 3030   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
 3031     return;
 3032 
 3033   max_charset_id = 0;
 3034   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
 3035     {
 3036       int id = XINT (XCAR (tail));
 3037       if (max_charset_id < id)
 3038         max_charset_id = id;
 3039     }
 3040 
 3041   safe_charsets = make_uninit_string (max_charset_id + 1);
 3042   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
 3043   request = AREF (attrs, coding_attr_iso_request);
 3044   reg_usage = AREF (attrs, coding_attr_iso_usage);
 3045   reg94 = XINT (XCAR (reg_usage));
 3046   reg96 = XINT (XCDR (reg_usage));
 3047 
 3048   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
 3049     {
 3050       Lisp_Object id;
 3051       Lisp_Object reg;
 3052       struct charset *charset;
 3053 
 3054       id = XCAR (tail);
 3055       charset = CHARSET_FROM_ID (XINT (id));
 3056       reg = Fcdr (Fassq (id, request));
 3057       if (! NILP (reg))
 3058         SSET (safe_charsets, XINT (id), XINT (reg));
 3059       else if (charset->iso_chars_96)
 3060         {
 3061           if (reg96 < 4)
 3062             SSET (safe_charsets, XINT (id), reg96);
 3063         }
 3064       else
 3065         {
 3066           if (reg94 < 4)
 3067             SSET (safe_charsets, XINT (id), reg94);
 3068         }
 3069     }
 3070   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
 3071 }
 3072 
 3073 
 3074 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 3075    Check if a text is encoded in one of ISO-2022 based codig systems.
 3076    If it is, return 1, else return 0.  */
 3077 
 3078 static int
 3079 detect_coding_iso_2022 (coding, detect_info)
 3080      struct coding_system *coding;
 3081      struct coding_detection_info *detect_info;
 3082 {
 3083   const unsigned char *src = coding->source, *src_base = src;
 3084   const unsigned char *src_end = coding->source + coding->src_bytes;
 3085   int multibytep = coding->src_multibyte;
 3086   int single_shifting = 0;
 3087   int id;
 3088   int c, c1;
 3089   int consumed_chars = 0;
 3090   int i;
 3091   int rejected = 0;
 3092   int found = 0;
 3093   int composition_count = -1;
 3094 
 3095   detect_info->checked |= CATEGORY_MASK_ISO;
 3096 
 3097   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
 3098     {
 3099       struct coding_system *this = &(coding_categories[i]);
 3100       Lisp_Object attrs, val;
 3101 
 3102       if (this->id < 0)
 3103         continue;
 3104       attrs = CODING_ID_ATTRS (this->id);
 3105       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
 3106           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
 3107         setup_iso_safe_charsets (attrs);
 3108       val = CODING_ATTR_SAFE_CHARSETS (attrs);
 3109       this->max_charset_id = SCHARS (val) - 1;
 3110       this->safe_charsets = SDATA (val);
 3111     }
 3112 
 3113   /* A coding system of this category is always ASCII compatible.  */
 3114   src += coding->head_ascii;
 3115 
 3116   while (rejected != CATEGORY_MASK_ISO)
 3117     {
 3118       src_base = src;
 3119       ONE_MORE_BYTE (c);
 3120       switch (c)
 3121         {
 3122         case ISO_CODE_ESC:
 3123           if (inhibit_iso_escape_detection)
 3124             break;
 3125           single_shifting = 0;
 3126           ONE_MORE_BYTE (c);
 3127           if (c >= '(' && c <= '/')
 3128             {
 3129               /* Designation sequence for a charset of dimension 1.  */
 3130               ONE_MORE_BYTE (c1);
 3131               if (c1 < ' ' || c1 >= 0x80
 3132                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
 3133                 /* Invalid designation sequence.  Just ignore.  */
 3134                 break;
 3135             }
 3136           else if (c == '$')
 3137             {
 3138               /* Designation sequence for a charset of dimension 2.  */
 3139               ONE_MORE_BYTE (c);
 3140               if (c >= '@' && c <= 'B')
 3141                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 3142                 id = iso_charset_table[1][0][c];
 3143               else if (c >= '(' && c <= '/')
 3144                 {
 3145                   ONE_MORE_BYTE (c1);
 3146                   if (c1 < ' ' || c1 >= 0x80
 3147                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
 3148                     /* Invalid designation sequence.  Just ignore.  */
 3149                     break;
 3150                 }
 3151               else
 3152                 /* Invalid designation sequence.  Just ignore it.  */
 3153                 break;
 3154             }
 3155           else if (c == 'N' || c == 'O')
 3156             {
 3157               /* ESC <Fe> for SS2 or SS3.  */
 3158               single_shifting = 1;
 3159               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
 3160               break;
 3161             }
 3162           else if (c == '1')
 3163             {
 3164               /* End of composition.  */
 3165               if (composition_count < 0
 3166                   || composition_count > MAX_COMPOSITION_COMPONENTS)
 3167                 /* Invalid */
 3168                 break;
 3169               composition_count = -1;
 3170               found |= CATEGORY_MASK_ISO;
 3171             }
 3172           else if (c >= '0' && c <= '4')
 3173             {
 3174               /* ESC <Fp> for start/end composition.  */
 3175               composition_count = 0;
 3176               break;
 3177             }
 3178           else
 3179             {
 3180               /* Invalid escape sequence.  Just ignore it.  */
 3181               break;
 3182             }
 3183 
 3184           /* We found a valid designation sequence for CHARSET.  */
 3185           rejected |= CATEGORY_MASK_ISO_8BIT;
 3186           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
 3187                               id))
 3188             found |= CATEGORY_MASK_ISO_7;
 3189           else
 3190             rejected |= CATEGORY_MASK_ISO_7;
 3191           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
 3192                               id))
 3193             found |= CATEGORY_MASK_ISO_7_TIGHT;
 3194           else
 3195             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
 3196           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
 3197                               id))
 3198             found |= CATEGORY_MASK_ISO_7_ELSE;
 3199           else
 3200             rejected |= CATEGORY_MASK_ISO_7_ELSE;
 3201           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
 3202                               id))
 3203             found |= CATEGORY_MASK_ISO_8_ELSE;
 3204           else
 3205             rejected |= CATEGORY_MASK_ISO_8_ELSE;
 3206           break;
 3207 
 3208         case ISO_CODE_SO:
 3209         case ISO_CODE_SI:
 3210           /* Locking shift out/in.  */
 3211           if (inhibit_iso_escape_detection)
 3212             break;
 3213           single_shifting = 0;
 3214           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
 3215           break;
 3216 
 3217         case ISO_CODE_CSI:
 3218           /* Control sequence introducer.  */
 3219           single_shifting = 0;
 3220           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
 3221           found |= CATEGORY_MASK_ISO_8_ELSE;
 3222           goto check_extra_latin;
 3223 
 3224         case ISO_CODE_SS2:
 3225         case ISO_CODE_SS3:
 3226           /* Single shift.   */
 3227           if (inhibit_iso_escape_detection)
 3228             break;
 3229           single_shifting = 0;
 3230           rejected |= CATEGORY_MASK_ISO_7BIT;
 3231           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
 3232               & CODING_ISO_FLAG_SINGLE_SHIFT)
 3233             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
 3234           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
 3235               & CODING_ISO_FLAG_SINGLE_SHIFT)
 3236             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
 3237           if (single_shifting)
 3238             break;
 3239           goto check_extra_latin;
 3240 
 3241         default:
 3242           if (c < 0)
 3243             continue;
 3244           if (c < 0x80)
 3245             {
 3246               if (composition_count >= 0)
 3247                 composition_count++;
 3248               single_shifting = 0;
 3249               break;
 3250             }
 3251           if (c >= 0xA0)
 3252             {
 3253               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
 3254               found |= CATEGORY_MASK_ISO_8_1;
 3255               /* Check the length of succeeding codes of the range
 3256                  0xA0..0FF.  If the byte length is even, we include
 3257                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
 3258                  only when we are not single shifting.  */
 3259               if (! single_shifting
 3260                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
 3261                 {
 3262                   int i = 1;
 3263                   while (src < src_end)
 3264                     {
 3265                       src_base = src;
 3266                       ONE_MORE_BYTE (c);
 3267                       if (c < 0xA0)
 3268                         {
 3269                           src = src_base;
 3270                           break;
 3271                         }
 3272                       i++;
 3273                     }
 3274 
 3275                   if (i & 1 && src < src_end)
 3276                     {
 3277                       rejected |= CATEGORY_MASK_ISO_8_2;
 3278                       if (composition_count >= 0)
 3279                         composition_count += i;
 3280                     }
 3281                   else
 3282                     {
 3283                       found |= CATEGORY_MASK_ISO_8_2;
 3284                       if (composition_count >= 0)
 3285                         composition_count += i / 2;
 3286                     }
 3287                 }
 3288               break;
 3289             }
 3290         check_extra_latin:
 3291           single_shifting = 0;
 3292           if (! VECTORP (Vlatin_extra_code_table)
 3293               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 3294             {
 3295               rejected = CATEGORY_MASK_ISO;
 3296               break;
 3297             }
 3298           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
 3299               & CODING_ISO_FLAG_LATIN_EXTRA)
 3300             found |= CATEGORY_MASK_ISO_8_1;
 3301           else
 3302             rejected |= CATEGORY_MASK_ISO_8_1;
 3303           rejected |= CATEGORY_MASK_ISO_8_2;
 3304         }
 3305     }
 3306   detect_info->rejected |= CATEGORY_MASK_ISO;
 3307   return 0;
 3308 
 3309  no_more_source:
 3310   detect_info->rejected |= rejected;
 3311   detect_info->found |= (found & ~rejected);
 3312   return 1;
 3313 }
 3314 
 3315 
/* Set designation state into CODING.  Set CHARS_96 to -1 if the
   escape sequence should be kept.

   The charset is looked up with ISO_CHARSET_TABLE (DIM, CHARS_96,
   FINAL).  If FINAL is outside '0'..127, no charset is found, or the
   charset is not safe for CODING (SAFE_CHARSET_P), the designation
   of REG is set to -2 and CHARS_96 to -1.  Otherwise the charset ID
   is stored as the designation of REG, after replacing
   charset_jisx0201_roman by charset_ascii when
   CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978 by
   charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.

   The macro uses the variable `coding' of the expanding scope,
   CHARS_96 must be an lvalue, and FINAL is evaluated more than
   once.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int id, prev;                                                       \
                                                                        \
    if (final < '0' || final >= 128                                     \
        || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
        || !SAFE_CHARSET_P (coding, id))                                \
      {                                                                 \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
        break;                                                          \
      }                                                                 \
    prev = CODING_ISO_DESIGNATION (coding, reg);                        \
    if (id == charset_jisx0201_roman)                                   \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
          id = charset_ascii;                                           \
      }                                                                 \
    else if (id == charset_jisx0208_1978)                               \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
          id = charset_jisx0208;                                        \
      }                                                                 \
    CODING_ISO_DESIGNATION (coding, reg) = id;                          \
    /* If there was an invalid designation to REG previously, and this  \
       designation is ASCII to REG, we should keep this designation     \
       sequence.  */                                                    \
    if (prev == -2 && id == charset_ascii)                              \
      chars_96 = -1;                                                    \
  } while (0)
 3348 
 3349 
/* Handle these composition sequences (ALT: alternate char):
 3351 
 3352    (1) relative composition: ESC 0 CHAR ... ESC 1
 3353    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
 3354    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
 3355    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
 3356 
 3357    When the start sequence (ESC 0/2/3/4) is found, this annotation
 3358    header is produced.
 3359 
 3360         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
 3361 
 3362    Then, upon reading CHAR or RULE (one or two bytes), these codes are
 3363    produced until the end sequence (ESC 1) is found:
 3364 
 3365    (1) CHAR ... CHAR
 3366    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
 3367    (3) ALT ... ALT -1 -1 CHAR ... CHAR
 3368    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
 3369 
   When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
   annotation header are updated as below:
 3372 
 3373    (1) LENGTH: unchanged,  NCHARS: number of CHARs
 3374    (2) LENGTH: unchanged,  NCHARS: number of CHARs
 3375    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
 3376    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
 3377 
   If an error is found while composing, the annotation header is
   changed to:

        [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
 3382 
 3383    and the sequence [ -2 DECODED-RULE ] is changed to the original
 3384    byte sequence as below:
 3385         o the original byte sequence is B: [ B -1 ]
 3386         o the original byte sequence is B1 B2: [ B1 B2 ]
 3387    and the sequence [ -1 -1 ] is changed to the original byte
 3388    sequence:
 3389         [ ESC '0' ]
 3390 */
 3391 
/* Decode a composition rule C1 and maybe one more byte from the
   source, and set RULE to the encoded composition rule, NBYTES to the
   length of the composition rule.  If the rule is invalid, set RULE
   to some negative value.

   C1 is the variable `c1' of the expanding scope, not an argument.
   When C1 is less than 32, RULE becomes negative and NBYTES is left
   unset.  A value of C1 - 32 below 81 is a one-byte rule in the old
   format; otherwise a second byte is fetched with ONE_MORE_BYTE and
   0x100 is added to a valid result to mark the new format.  */

#define DECODE_COMPOSITION_RULE(rule, nbytes)                           \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule < 0)                                                       \
      break;                                                            \
    if (rule < 81)              /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (rule) / 9;                                          \
        int nref = (rule) % 9;                                          \
        if (gref == 4) gref = 10;                                       \
        if (nref == 4) nref = 10;                                       \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
        nbytes = 1;                                                     \
      }                                                                 \
    else                        /* new format (after ver.21) */         \
      {                                                                 \
        int c;                                                          \
                                                                        \
        ONE_MORE_BYTE (c);                                              \
        rule = COMPOSITION_ENCODE_RULE (rule - 81, c - 32);             \
        if (rule >= 0)                                                  \
          rule += 0x100;   /* to distinguish it from the old format */  \
        nbytes = 2;                                                     \
      }                                                                 \
  } while (0)
 3422 
/* Turn the decoded composition rule RULE back into its original
   byte sequence, storing it in charbuf[idx] and charbuf[idx + 1].

   A rule below 0x100 came from the one-byte old format: the byte is
   stored followed by -1, and new_chars is incremented by one.
   Otherwise the two bytes of the new format are stored and new_chars
   is incremented by two.

   The macro uses `charbuf', `idx' and `new_chars' of the expanding
   scope.  RULE is evaluated more than once, and is read before the
   two slots are overwritten, so it may be charbuf[idx + 1] itself.
   NOTE(review): the division by 12 presumably mirrors how
   COMPOSITION_ENCODE_RULE packs GREF and NREF -- confirm.  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
                                                                \
    if (rule < 0x100)           /* old format */                \
      {                                                         \
        if (gref == 10) gref = 4;                               \
        if (nref == 10) nref = 4;                               \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
    else                                /* new format */        \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
  } while (0)
 3442 
 3443 /* Finish the current composition as invalid.  */
 3444 
 3445 static int finish_composition P_ ((int *, struct composition_status *));
 3446 
 3447 static int
 3448 finish_composition (charbuf, cmp_status)
 3449      int *charbuf;
 3450      struct composition_status *cmp_status;
 3451 {
 3452   int idx = - cmp_status->length;
 3453   int new_chars;
 3454 
 3455   /* Recover the original ESC sequence */
 3456   charbuf[idx++] = ISO_CODE_ESC;
 3457   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
 3458                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
 3459                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
 3460                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
 3461                     : '4');
 3462   charbuf[idx++] = -2;
 3463   charbuf[idx++] = 0;
 3464   charbuf[idx++] = -1;
 3465   new_chars = cmp_status->nchars;
 3466   if (cmp_status->method >= COMPOSITION_WITH_RULE)
 3467     for (; idx < 0; idx++)
 3468       {
 3469         int elt = charbuf[idx];
 3470 
 3471         if (elt == -2)
 3472           {
 3473             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
 3474             idx++;
 3475           }
 3476         else if (elt == -1)
 3477           {
 3478             charbuf[idx++] = ISO_CODE_ESC;
 3479             charbuf[idx] = '0';
 3480             new_chars += 2;
 3481           }
 3482       }
 3483   cmp_status->state = COMPOSING_NO;
 3484   return new_chars;
 3485 }
 3486 
/* If characters are under composition, finish the composition as
   invalid with finish_composition and add the number of characters
   it returns to char_offset.  The macro uses `cmp_status', `charbuf'
   and `char_offset' of the expanding scope.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state != COMPOSING_NO)                      \
      char_offset += finish_composition (charbuf, cmp_status);  \
  } while (0)
 3493 
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   Produce this annotation sequence now:

   [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

   An ESC 0 that arrives while the alternate characters of an ESC 3
   or ESC 4 composition are being read does not start a new
   composition.  It only produces the separator [ -1 -1 ] and
   switches the state to COMPOSING_CHAR.  In any other case a
   composition still in progress is finished as invalid first.

   The macro uses `cmp_status', `charbuf', `char_offset' and `coding'
   of the expanding scope, and evaluates C1 more than once.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (c1 == '0'                                                          \
        && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
             && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
            || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
                && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
      {                                                                    \
        *charbuf++ = -1;                                                   \
        *charbuf++= -1;                                                    \
        cmp_status->state = COMPOSING_CHAR;                                \
        cmp_status->length += 2;                                           \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
                              : c1 == '2' ? COMPOSITION_WITH_RULE          \
                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
                              : COMPOSITION_WITH_RULE_ALTCHARS);           \
        cmp_status->state                                                  \
          = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->nchars = cmp_status->ncomps = 0;                       \
        coding->annotated = 1;                                             \
      }                                                                    \
  } while (0)
 3534 
 3535 
/* Handle composition end sequence ESC 1.

   The sequence is invalid if no CHAR has been stored yet, if a
   rulebase composition (COMPOSITION_WITH_RULE) is in the state
   COMPOSING_CHAR, or if a composition of any other method is not in
   the state COMPOSING_CHAR.  In that case a composition in progress
   is finished as invalid and control jumps to the label
   `invalid_code' of the expanding scope.

   Otherwise the annotation header, which starts at
   charbuf[- cmp_status->length], is updated: its first element is
   decreased by the number of alternate chars plus 2 (altchar
   composition) or by three times that number (alt&rule composition),
   and its third element is set to the number of CHARs.  Then
   char_offset is advanced by that number and the state is reset to
   COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    if (cmp_status->nchars == 0                                         \
        || ((cmp_status->state == COMPOSING_CHAR)                       \
            == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
      charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
      charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
    charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
 3555 
/* Store a composition rule RULE in charbuf, and update cmp_status.

   The rule is stored as the pair [ -2 RULE ], the recorded length
   grows by two, and the state is decremented.
   NOTE(review): the decrement presumably moves from a RULE state
   back to the corresponding CHAR state, relying on the order of the
   state enumerators -- confirm against their definition.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    *charbuf++ = -2;                    \
    *charbuf++ = rule;                  \
    cmp_status->length += 2;            \
    cmp_status->state--;                \
  } while (0)
 3565 
/* Store a composed char or a component char C in charbuf, and update
   cmp_status.

   The recorded length grows by one.  The char counts as a CHAR
   (nchars) when the state is COMPOSING_CHAR, and as a component
   (ncomps) otherwise.  For a rulebase composition, and for the
   component part of an alt&rule composition, the state is then
   incremented.
   NOTE(review): the increment presumably moves to the state that
   expects a RULE next, relying on the order of the state
   enumerators -- confirm against their definition.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf++ = (c);                                                   \
    cmp_status->length++;                                               \
    if (cmp_status->state == COMPOSING_CHAR)                            \
      cmp_status->nchars++;                                             \
    else                                                                \
      cmp_status->ncomps++;                                             \
    if (cmp_status->method == COMPOSITION_WITH_RULE                     \
        || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
            && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
      cmp_status->state++;                                              \
  } while (0)
 3582 
 3583 
 3584 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 3585 
 3586 static void
 3587 decode_coding_iso_2022 (coding)
 3588      struct coding_system *coding;
 3589 {
 3590   const unsigned char *src = coding->source + coding->consumed;
 3591   const unsigned char *src_end = coding->source + coding->src_bytes;
 3592   const unsigned char *src_base;
 3593   int *charbuf = coding->charbuf + coding->charbuf_used;
 3594   /* We may produce two annocations (charset and composition) in one
 3595      loop and one more charset annocation at the end.  */
 3596   int *charbuf_end
 3597     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
 3598   int consumed_chars = 0, consumed_chars_base;
 3599   int multibytep = coding->src_multibyte;
 3600   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 3601   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
 3602   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
 3603   int charset_id_2, charset_id_3;
 3604   struct charset *charset;
 3605   int c;
 3606   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
 3607   Lisp_Object attrs, charset_list;
 3608   int char_offset = coding->produced_char;
 3609   int last_offset = char_offset;
 3610   int last_id = charset_ascii;
 3611   int eol_crlf =
 3612     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
 3613   int byte_after_cr = -1;
 3614   int i;
 3615 
 3616   CODING_GET_INFO (coding, attrs, charset_list);
 3617   setup_iso_safe_charsets (attrs);
 3618   /* Charset list may have been changed.  */
 3619   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
 3620   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
 3621 
 3622   if (cmp_status->state != COMPOSING_NO)
 3623     {
 3624       for (i = 0; i < cmp_status->length; i++)
 3625         *charbuf++ = cmp_status->carryover[i];
 3626       coding->annotated = 1;
 3627     }
 3628 
 3629   while (1)
 3630     {
 3631       int c1, c2, c3;
 3632 
 3633       src_base = src;
 3634       consumed_chars_base = consumed_chars;
 3635 
 3636       if (charbuf >= charbuf_end)
 3637         {
 3638           if (byte_after_cr >= 0)
 3639             src_base--;
 3640           break;
 3641         }
 3642 
 3643       if (byte_after_cr >= 0)
 3644         c1 = byte_after_cr, byte_after_cr = -1;
 3645       else
 3646         ONE_MORE_BYTE (c1);
 3647       if (c1 < 0)
 3648         goto invalid_code;
 3649 
 3650       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
 3651         {
 3652           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
 3653           char_offset++;
 3654           CODING_ISO_EXTSEGMENT_LEN (coding)--;
 3655           continue;
 3656         }
 3657 
 3658       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
 3659         {
 3660           if (c1 == ISO_CODE_ESC)
 3661             {
 3662               if (src + 1 >= src_end)
 3663                 goto no_more_source;
 3664               *charbuf++ = ISO_CODE_ESC;
 3665               char_offset++;
 3666               if (src[0] == '%' && src[1] == '@')
 3667                 {
 3668                   src += 2;
 3669                   consumed_chars += 2;
 3670                   char_offset += 2;
 3671                   /* We are sure charbuf can contain two more chars. */
 3672                   *charbuf++ = '%';
 3673                   *charbuf++ = '@';
 3674                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
 3675                 }
 3676             }
 3677           else
 3678             {
 3679               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
 3680               char_offset++;
 3681             }
 3682           continue;
 3683         }
 3684 
 3685       if ((cmp_status->state == COMPOSING_RULE
 3686            || cmp_status->state == COMPOSING_COMPONENT_RULE)
 3687           && c1 != ISO_CODE_ESC)
 3688         {
 3689           int rule, nbytes;
 3690 
 3691           DECODE_COMPOSITION_RULE (rule, nbytes);
 3692           if (rule < 0)
 3693             goto invalid_code;
 3694           STORE_COMPOSITION_RULE (rule);
 3695           continue;
 3696         }
 3697 
 3698       /* We produce at most one character.  */
 3699       switch (iso_code_class [c1])
 3700         {
 3701         case ISO_0x20_or_0x7F:
 3702           if (charset_id_0 < 0
 3703               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
 3704             /* This is SPACE or DEL.  */
 3705             charset = CHARSET_FROM_ID (charset_ascii);
 3706           else
 3707             charset = CHARSET_FROM_ID (charset_id_0);
 3708           break;
 3709 
 3710         case ISO_graphic_plane_0:
 3711           if (charset_id_0 < 0)
 3712             charset = CHARSET_FROM_ID (charset_ascii);
 3713           else
 3714             charset = CHARSET_FROM_ID (charset_id_0);
 3715           break;
 3716 
 3717         case ISO_0xA0_or_0xFF:
 3718           if (charset_id_1 < 0
 3719               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
 3720               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
 3721             goto invalid_code;
 3722           /* This is a graphic character, we fall down ... */
 3723 
 3724         case ISO_graphic_plane_1:
 3725           if (charset_id_1 < 0)
 3726             goto invalid_code;
 3727           charset = CHARSET_FROM_ID (charset_id_1);
 3728           break;
 3729 
 3730         case ISO_control_0:
 3731           if (eol_crlf && c1 == '\r')
 3732             ONE_MORE_BYTE (byte_after_cr);
 3733           MAYBE_FINISH_COMPOSITION ();
 3734           charset = CHARSET_FROM_ID (charset_ascii);
 3735           break;
 3736 
 3737         case ISO_control_1:
 3738           goto invalid_code;
 3739 
 3740         case ISO_shift_out:
 3741           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
 3742               || CODING_ISO_DESIGNATION (coding, 1) < 0)
 3743             goto invalid_code;
 3744           CODING_ISO_INVOCATION (coding, 0) = 1;
 3745           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
 3746           continue;
 3747 
 3748         case ISO_shift_in:
 3749           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
 3750             goto invalid_code;
 3751           CODING_ISO_INVOCATION (coding, 0) = 0;
 3752           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
 3753           continue;
 3754 
 3755         case ISO_single_shift_2_7:
 3756           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
 3757             goto invalid_code;
 3758         case ISO_single_shift_2:
 3759           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
 3760             goto invalid_code;
 3761           /* SS2 is handled as an escape sequence of ESC 'N' */
 3762           c1 = 'N';
 3763           goto label_escape_sequence;
 3764 
 3765         case ISO_single_shift_3:
 3766           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
 3767             goto invalid_code;
 3768           /* SS2 is handled as an escape sequence of ESC 'O' */
 3769           c1 = 'O';
 3770           goto label_escape_sequence;
 3771 
 3772         case ISO_control_sequence_introducer:
 3773           /* CSI is handled as an escape sequence of ESC '[' ...  */
 3774           c1 = '[';
 3775           goto label_escape_sequence;
 3776 
 3777         case ISO_escape:
 3778           ONE_MORE_BYTE (c1);
 3779         label_escape_sequence:
 3780           /* Escape sequences handled here are invocation,
 3781              designation, direction specification, and character
 3782              composition specification.  */
 3783           switch (c1)
 3784             {
 3785             case '&':           /* revision of following character set */
 3786               ONE_MORE_BYTE (c1);
 3787               if (!(c1 >= '@' && c1 <= '~'))
 3788                 goto invalid_code;
 3789               ONE_MORE_BYTE (c1);
 3790               if (c1 != ISO_CODE_ESC)
 3791                 goto invalid_code;
 3792               ONE_MORE_BYTE (c1);
 3793               goto label_escape_sequence;
 3794 
 3795             case '$':           /* designation of 2-byte character set */
 3796               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
 3797                 goto invalid_code;
 3798               {
 3799                 int reg, chars96;
 3800 
 3801                 ONE_MORE_BYTE (c1);
 3802                 if (c1 >= '@' && c1 <= 'B')
 3803                   {     /* designation of JISX0208.1978, GB2312.1980,
 3804                            or JISX0208.1980 */
 3805                     reg = 0, chars96 = 0;
 3806                   }
 3807                 else if (c1 >= 0x28 && c1 <= 0x2B)
 3808                   { /* designation of DIMENSION2_CHARS94 character set */
 3809                     reg = c1 - 0x28, chars96 = 0;
 3810                     ONE_MORE_BYTE (c1);
 3811                   }
 3812                 else if (c1 >= 0x2C && c1 <= 0x2F)
 3813                   { /* designation of DIMENSION2_CHARS96 character set */
 3814                     reg = c1 - 0x2C, chars96 = 1;
 3815                     ONE_MORE_BYTE (c1);
 3816                   }
 3817                 else
 3818                   goto invalid_code;
 3819                 DECODE_DESIGNATION (reg, 2, chars96, c1);
 3820                 /* We must update these variables now.  */
 3821                 if (reg == 0)
 3822                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
 3823                 else if (reg == 1)
 3824                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
 3825                 if (chars96 < 0)
 3826                   goto invalid_code;
 3827               }
 3828               continue;
 3829 
 3830             case 'n':           /* invocation of locking-shift-2 */
 3831               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
 3832                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
 3833                 goto invalid_code;
 3834               CODING_ISO_INVOCATION (coding, 0) = 2;
 3835               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
 3836               continue;
 3837 
 3838             case 'o':           /* invocation of locking-shift-3 */
 3839               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
 3840                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
 3841                 goto invalid_code;
 3842               CODING_ISO_INVOCATION (coding, 0) = 3;
 3843               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
 3844               continue;
 3845 
 3846             case 'N':           /* invocation of single-shift-2 */
 3847               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
 3848                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
 3849                 goto invalid_code;
 3850               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
 3851               if (charset_id_2 < 0)
 3852                 charset = CHARSET_FROM_ID (charset_ascii);
 3853               else
 3854                 charset = CHARSET_FROM_ID (charset_id_2);
 3855               ONE_MORE_BYTE (c1);
 3856               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
 3857                 goto invalid_code;
 3858               break;
 3859 
 3860             case 'O':           /* invocation of single-shift-3 */
 3861               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
 3862                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
 3863                 goto invalid_code;
 3864               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
 3865               if (charset_id_3 < 0)
 3866                 charset = CHARSET_FROM_ID (charset_ascii);
 3867               else
 3868                 charset = CHARSET_FROM_ID (charset_id_3);
 3869               ONE_MORE_BYTE (c1);
 3870               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
 3871                 goto invalid_code;
 3872               break;
 3873 
 3874             case '0': case '2': case '3': case '4': /* start composition */
 3875               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
 3876                 goto invalid_code;
 3877               if (last_id != charset_ascii)
 3878                 {
 3879                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
 3880                   last_id = charset_ascii;
 3881                   last_offset = char_offset;
 3882                 }
 3883               DECODE_COMPOSITION_START (c1);
 3884               continue;
 3885 
 3886             case '1':           /* end composition */
 3887               if (cmp_status->state == COMPOSING_NO)
 3888                 goto invalid_code;
 3889               DECODE_COMPOSITION_END ();
 3890               continue;
 3891 
 3892             case '[':           /* specification of direction */
 3893               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
 3894                 goto invalid_code;
 3895               /* For the moment, nested direction is not supported.
 3896                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
 3897                  left-to-right, and nozero means right-to-left.  */
 3898               ONE_MORE_BYTE (c1);
 3899               switch (c1)
 3900                 {
 3901                 case ']':       /* end of the current direction */
 3902                   coding->mode &= ~CODING_MODE_DIRECTION;
 3903 
 3904                 case '0':       /* end of the current direction */
 3905                 case '1':       /* start of left-to-right direction */
 3906                   ONE_MORE_BYTE (c1);
 3907                   if (c1 == ']')
 3908                     coding->mode &= ~CODING_MODE_DIRECTION;
 3909                   else
 3910                     goto invalid_code;
 3911                   break;
 3912 
 3913                 case '2':       /* start of right-to-left direction */
 3914                   ONE_MORE_BYTE (c1);
 3915                   if (c1 == ']')
 3916                     coding->mode |= CODING_MODE_DIRECTION;
 3917                   else
 3918                     goto invalid_code;
 3919                   break;
 3920 
 3921                 default:
 3922                   goto invalid_code;
 3923                 }
 3924               continue;
 3925 
 3926             case '%':
 3927               ONE_MORE_BYTE (c1);
 3928               if (c1 == '/')
 3929                 {
 3930                   /* CTEXT extended segment:
 3931                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
 3932                      We keep these bytes as is for the moment.
 3933                      They may be decoded by post-read-conversion.  */
 3934                   int dim, M, L;
 3935                   int size;
 3936 
 3937                   ONE_MORE_BYTE (dim);
 3938                   if (dim < 0 || dim > 4)
 3939                     goto invalid_code;
 3940                   ONE_MORE_BYTE (M);
 3941                   if (M < 128)
 3942                     goto invalid_code;
 3943                   ONE_MORE_BYTE (L);
 3944                   if (L < 128)
 3945                     goto invalid_code;
 3946                   size = ((M - 128) * 128) + (L - 128);
 3947                   if (charbuf + 6 > charbuf_end)
 3948                     goto break_loop;
 3949                   *charbuf++ = ISO_CODE_ESC;
 3950                   *charbuf++ = '%';
 3951                   *charbuf++ = '/';
 3952                   *charbuf++ = dim;
 3953                   *charbuf++ = BYTE8_TO_CHAR (M);
 3954                   *charbuf++ = BYTE8_TO_CHAR (L);
 3955                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
 3956                 }
 3957               else if (c1 == 'G')
 3958                 {
 3959                   /* XFree86 extension for embedding UTF-8 in CTEXT:
 3960                      ESC % G --UTF-8-BYTES-- ESC % @
 3961                      We keep these bytes as is for the moment.
 3962                      They may be decoded by post-read-conversion.  */
 3963                   if (charbuf + 3 > charbuf_end)
 3964                     goto break_loop;
 3965                   *charbuf++ = ISO_CODE_ESC;
 3966                   *charbuf++ = '%';
 3967                   *charbuf++ = 'G';
 3968                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
 3969                 }
 3970               else
 3971                 goto invalid_code;
 3972               continue;
 3973               break;
 3974 
 3975             default:
 3976               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
 3977                 goto invalid_code;
 3978               {
 3979                 int reg, chars96;
 3980 
 3981                 if (c1 >= 0x28 && c1 <= 0x2B)
 3982                   { /* designation of DIMENSION1_CHARS94 character set */
 3983                     reg = c1 - 0x28, chars96 = 0;
 3984                     ONE_MORE_BYTE (c1);
 3985                   }
 3986                 else if (c1 >= 0x2C && c1 <= 0x2F)
 3987                   { /* designation of DIMENSION1_CHARS96 character set */
 3988                     reg = c1 - 0x2C, chars96 = 1;
 3989                     ONE_MORE_BYTE (c1);
 3990                   }
 3991                 else
 3992                   goto invalid_code;
 3993                 DECODE_DESIGNATION (reg, 1, chars96, c1);
 3994                 /* We must update these variables now.  */
 3995                 if (reg == 0)
 3996                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
 3997                 else if (reg == 1)
 3998                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
 3999                 if (chars96 < 0)
 4000                   goto invalid_code;
 4001               }
 4002               continue;
 4003             }
 4004         }
 4005 
 4006       if (cmp_status->state == COMPOSING_NO
 4007           && charset->id != charset_ascii
 4008           && last_id != charset->id)
 4009         {
 4010           if (last_id != charset_ascii)
 4011             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
 4012           last_id = charset->id;
 4013           last_offset = char_offset;
 4014         }
 4015 
 4016       /* Now we know CHARSET and 1st position code C1 of a character.
 4017          Produce a decoded character while getting 2nd and 3rd
 4018          position codes C2, C3 if necessary.  */
 4019       if (CHARSET_DIMENSION (charset) > 1)
 4020         {
 4021           ONE_MORE_BYTE (c2);
 4022           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
 4023               || ((c1 & 0x80) != (c2 & 0x80)))
 4024             /* C2 is not in a valid range.  */
 4025             goto invalid_code;
 4026           if (CHARSET_DIMENSION (charset) == 2)
 4027             c1 = (c1 << 8) | c2;
 4028           else
 4029             {
 4030               ONE_MORE_BYTE (c3);
 4031               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
 4032                   || ((c1 & 0x80) != (c3 & 0x80)))
 4033                 /* C3 is not in a valid range.  */
 4034                 goto invalid_code;
 4035               c1 = (c1 << 16) | (c2 << 8) | c2;
 4036             }
 4037         }
 4038       c1 &= 0x7F7F7F;
 4039       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
 4040       if (c < 0)
 4041         {
 4042           MAYBE_FINISH_COMPOSITION ();
 4043           for (; src_base < src; src_base++, char_offset++)
 4044             {
 4045               if (ASCII_BYTE_P (*src_base))
 4046                 *charbuf++ = *src_base;
 4047               else
 4048                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
 4049             }
 4050         }
 4051       else if (cmp_status->state == COMPOSING_NO)
 4052         {
 4053           *charbuf++ = c;
 4054           char_offset++;
 4055         }
 4056       else if ((cmp_status->state == COMPOSING_CHAR
 4057                 ? cmp_status->nchars
 4058                 : cmp_status->ncomps)
 4059                >= MAX_COMPOSITION_COMPONENTS)
 4060         {
 4061           /* Too long composition.  */
 4062           MAYBE_FINISH_COMPOSITION ();
 4063           *charbuf++ = c;
 4064           char_offset++;
 4065         }
 4066       else
 4067         STORE_COMPOSITION_CHAR (c);
 4068       continue;
 4069 
 4070     invalid_code:
 4071       MAYBE_FINISH_COMPOSITION ();
 4072       src = src_base;
 4073       consumed_chars = consumed_chars_base;
 4074       ONE_MORE_BYTE (c);
 4075       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
 4076       char_offset++;
 4077       coding->errors++;
 4078       continue;
 4079 
 4080     break_loop:
 4081       break;
 4082     }
 4083 
 4084  no_more_source:
 4085   if (cmp_status->state != COMPOSING_NO)
 4086     {
 4087       if (coding->mode & CODING_MODE_LAST_BLOCK)
 4088         MAYBE_FINISH_COMPOSITION ();
 4089       else
 4090         {
 4091           charbuf -= cmp_status->length;
 4092           for (i = 0; i < cmp_status->length; i++)
 4093             cmp_status->carryover[i] = charbuf[i];
 4094         }
 4095     }
 4096   else if (last_id != charset_ascii)
 4097     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
 4098   coding->consumed_char += consumed_chars_base;
 4099   coding->consumed = src_base - coding->source;
 4100   coding->charbuf_used = charbuf - coding->charbuf;
 4101 }
 4102 
 4103 
 4104 /* ISO2022 encoding stuff.  */
 4105 
 4106 /*
 4107    It is not enough to say just "ISO2022" on encoding, we have to
 4108    specify more details.  In Emacs, each coding system of ISO2022
 4109    variant has the following specifications:
 4110         1. Initial designation to G0 thru G3.
 4111         2. Allows short-form designation?
 4112         3. ASCII should be designated to G0 before control characters?
 4113         4. ASCII should be designated to G0 at end of line?
 4114         5. 7-bit environment or 8-bit environment?
 4115         6. Use locking-shift?
 4116         7. Use Single-shift?
 4117    And the following two are only for Japanese:
 4118         8. Use ASCII in place of JIS0201-1976-Roman?
 4119         9. Use JISX0208-1983 in place of JISX0208-1978?
 4120    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
 4121    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
 4122    details.
 4123 */
 4124 
 4125 /* Produce codes (escape sequence) for designating CHARSET to graphic
 4126    register REG at DST, and increment DST.  If <final-char> of CHARSET is
 4127    '@', 'A', or 'B' and the coding system CODING allows, produce
 4128    designation sequence of short-form.  */
 4129 
 4130 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
 4131   do {                                                                  \
 4132     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
 4133     char *intermediate_char_94 = "()*+";                                \
 4134     char *intermediate_char_96 = ",-./";                                \
 4135     int revision = -1;                                                  \
 4136     int c;                                                              \
 4137                                                                         \
 4138     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
 4139       revision = CHARSET_ISO_REVISION (charset);                        \
 4140                                                                         \
 4141     if (revision >= 0)                                                  \
 4142       {                                                                 \
 4143         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
 4144         EMIT_ONE_BYTE ('@' + revision);                                 \
 4145       }                                                                 \
 4146     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
 4147     if (CHARSET_DIMENSION (charset) == 1)                               \
 4148       {                                                                 \
 4149         if (! CHARSET_ISO_CHARS_96 (charset))                           \
 4150           c = intermediate_char_94[reg];                                \
 4151         else                                                            \
 4152           c = intermediate_char_96[reg];                                \
 4153         EMIT_ONE_ASCII_BYTE (c);                                        \
 4154       }                                                                 \
 4155     else                                                                \
 4156       {                                                                 \
 4157         EMIT_ONE_ASCII_BYTE ('$');                                      \
 4158         if (! CHARSET_ISO_CHARS_96 (charset))                           \
 4159           {                                                             \
 4160             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
 4161                 || reg != 0                                             \
 4162                 || final_char < '@' || final_char > 'B')                \
 4163               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
 4164           }                                                             \
 4165         else                                                            \
 4166           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
 4167       }                                                                 \
 4168     EMIT_ONE_ASCII_BYTE (final_char);                                   \
 4169                                                                         \
 4170     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
 4171   } while (0)
 4172 
 4173 
 4174 /* The following two macros produce codes (control character or escape
 4175    sequence) for ISO2022 single-shift functions (single-shift-2 and
 4176    single-shift-3).  */
 4177 
 4178 #define ENCODE_SINGLE_SHIFT_2                                           \
 4179   do {                                                                  \
 4180     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
 4181       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
 4182     else                                                                \
 4183       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
 4184     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
 4185   } while (0)
 4186 
 4187 
 4188 #define ENCODE_SINGLE_SHIFT_3                                           \
 4189   do {                                                                  \
 4190     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
 4191       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
 4192     else                                                                \
 4193       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
 4194     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
 4195   } while (0)
 4196 
 4197 
 4198 /* The following four macros produce codes (control character or
 4199    escape sequence) for ISO2022 locking-shift functions (shift-in,
 4200    shift-out, locking-shift-2, and locking-shift-3).  */
 4201 
 4202 #define ENCODE_SHIFT_IN                                 \
 4203   do {                                                  \
 4204     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
 4205     CODING_ISO_INVOCATION (coding, 0) = 0;              \
 4206   } while (0)
 4207 
 4208 
 4209 #define ENCODE_SHIFT_OUT                                \
 4210   do {                                                  \
 4211     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
 4212     CODING_ISO_INVOCATION (coding, 0) = 1;              \
 4213   } while (0)
 4214 
 4215 
 4216 #define ENCODE_LOCKING_SHIFT_2                          \
 4217   do {                                                  \
 4218     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
 4219     CODING_ISO_INVOCATION (coding, 0) = 2;              \
 4220   } while (0)
 4221 
 4222 
 4223 #define ENCODE_LOCKING_SHIFT_3                          \
 4224   do {                                                  \
 4225     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
 4226     CODING_ISO_INVOCATION (coding, 0) = 3;              \
 4227   } while (0)
 4228 
 4229 
 4230 /* Produce codes for a DIMENSION1 character whose character set is
 4231    CHARSET and whose position-code is C1.  Designation and invocation
 4232    sequences are also produced in advance if necessary.  */
 4233 
 4234 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
 4235   do {                                                                  \
 4236     int id = CHARSET_ID (charset);                                      \
 4237                                                                         \
 4238     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
 4239         && id == charset_ascii)                                         \
 4240       {                                                                 \
 4241         id = charset_jisx0201_roman;                                    \
 4242         charset = CHARSET_FROM_ID (id);                                 \
 4243       }                                                                 \
 4244                                                                         \
 4245     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
 4246       {                                                                 \
 4247         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
 4248           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
 4249         else                                                            \
 4250           EMIT_ONE_BYTE (c1 | 0x80);                                    \
 4251         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
 4252         break;                                                          \
 4253       }                                                                 \
 4254     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
 4255       {                                                                 \
 4256         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
 4257         break;                                                          \
 4258       }                                                                 \
 4259     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
 4260       {                                                                 \
 4261         EMIT_ONE_BYTE (c1 | 0x80);                                      \
 4262         break;                                                          \
 4263       }                                                                 \
 4264     else                                                                \
 4265       /* Since CHARSET is not yet invoked to any graphic planes, we     \
 4266          must invoke it, or, at first, designate it to some graphic     \
 4267          register.  Then repeat the loop to actually produce the        \
 4268          character.  */                                                 \
 4269       dst = encode_invocation_designation (charset, coding, dst,        \
 4270                                            &produced_chars);            \
 4271   } while (1)
 4272 
 4273 
 4274 /* Produce codes for a DIMENSION2 character whose character set is
 4275    CHARSET and whose position-codes are C1 and C2.  Designation and
 4276    invocation codes are also produced in advance if necessary.  */
 4277 
 4278 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
 4279   do {                                                                  \
 4280     int id = CHARSET_ID (charset);                                      \
 4281                                                                         \
 4282     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
 4283         && id == charset_jisx0208)                                      \
 4284       {                                                                 \
 4285         id = charset_jisx0208_1978;                                     \
 4286         charset = CHARSET_FROM_ID (id);                                 \
 4287       }                                                                 \
 4288                                                                         \
 4289     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
 4290       {                                                                 \
 4291         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
 4292           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
 4293         else                                                            \
 4294           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
 4295         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
 4296         break;                                                          \
 4297       }                                                                 \
 4298     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
 4299       {                                                                 \
 4300         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
 4301         break;                                                          \
 4302       }                                                                 \
 4303     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
 4304       {                                                                 \
 4305         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
 4306         break;                                                          \
 4307       }                                                                 \
 4308     else                                                                \
 4309       /* Since CHARSET is not yet invoked to any graphic planes, we     \
 4310          must invoke it, or, at first, designate it to some graphic     \
 4311          register.  Then repeat the loop to actually produce the        \
 4312          character.  */                                                 \
 4313       dst = encode_invocation_designation (charset, coding, dst,        \
 4314                                            &produced_chars);            \
 4315   } while (1)
 4316 
 4317 
 4318 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
 4319   do {                                                                     \
 4320     int code = ENCODE_CHAR ((charset),(c));                                \
 4321                                                                            \
 4322     if (CHARSET_DIMENSION (charset) == 1)                                  \
 4323       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
 4324     else                                                                   \
 4325       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
 4326   } while (0)
 4327 
 4328 
 4329 /* Produce designation and invocation codes at a place pointed by DST
 4330    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
 4331    Return new DST.  */
 4332 
 4333 unsigned char *
 4334 encode_invocation_designation (charset, coding, dst, p_nchars)
 4335      struct charset *charset;
 4336      struct coding_system *coding;
 4337      unsigned char *dst;
 4338      int *p_nchars;
 4339 {
 4340   int multibytep = coding->dst_multibyte;
 4341   int produced_chars = *p_nchars;
 4342   int reg;                      /* graphic register number */
 4343   int id = CHARSET_ID (charset);
 4344 
 4345   /* At first, check designations.  */
 4346   for (reg = 0; reg < 4; reg++)
 4347     if (id == CODING_ISO_DESIGNATION (coding, reg))
 4348       break;
 4349 
 4350   if (reg >= 4)
 4351     {
 4352       /* CHARSET is not yet designated to any graphic registers.  */
 4353       /* At first check the requested designation.  */
 4354       reg = CODING_ISO_REQUEST (coding, id);
 4355       if (reg < 0)
 4356         /* Since CHARSET requests no special designation, designate it
 4357            to graphic register 0.  */
 4358         reg = 0;
 4359 
 4360       ENCODE_DESIGNATION (charset, reg, coding);
 4361     }
 4362 
 4363   if (CODING_ISO_INVOCATION (coding, 0) != reg
 4364       && CODING_ISO_INVOCATION (coding, 1) != reg)
 4365     {
 4366       /* Since the graphic register REG is not invoked to any graphic
 4367          planes, invoke it to graphic plane 0.  */
 4368       switch (reg)
 4369         {
 4370         case 0:                 /* graphic register 0 */
 4371           ENCODE_SHIFT_IN;
 4372           break;
 4373 
 4374         case 1:                 /* graphic register 1 */
 4375           ENCODE_SHIFT_OUT;
 4376           break;
 4377 
 4378         case 2:                 /* graphic register 2 */
 4379           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
 4380             ENCODE_SINGLE_SHIFT_2;
 4381           else
 4382             ENCODE_LOCKING_SHIFT_2;
 4383           break;
 4384 
 4385         case 3:                 /* graphic register 3 */
 4386           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
 4387             ENCODE_SINGLE_SHIFT_3;
 4388           else
 4389             ENCODE_LOCKING_SHIFT_3;
 4390           break;
 4391         }
 4392     }
 4393 
 4394   *p_nchars = produced_chars;
 4395   return dst;
 4396 }
 4397 
 4398 /* The following three macros produce codes for indicating direction
 4399    of text.  */
 4400 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
 4401   do {                                                                  \
 4402     if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS)        \
 4403       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
 4404     else                                                                \
 4405       EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
 4406   } while (0)
 4407 
 4408 
 4409 #define ENCODE_DIRECTION_R2L()                  \
 4410   do {                                          \
 4411     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
 4412     EMIT_TWO_ASCII_BYTES ('2', ']');            \
 4413   } while (0)
 4414 
 4415 
 4416 #define ENCODE_DIRECTION_L2R()                  \
 4417   do {                                          \
 4418     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
 4419     EMIT_TWO_ASCII_BYTES ('0', ']');            \
 4420   } while (0)
 4421 
 4422 
 4423 /* Produce codes for designation and invocation to reset the graphic
 4424    planes and registers to initial state.  */
 4425 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
 4426   do {                                                                  \
 4427     int reg;                                                            \
 4428     struct charset *charset;                                            \
 4429                                                                         \
 4430     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
 4431       ENCODE_SHIFT_IN;                                                  \
 4432     for (reg = 0; reg < 4; reg++)                                       \
 4433       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
 4434           && (CODING_ISO_DESIGNATION (coding, reg)                      \
 4435               != CODING_ISO_INITIAL (coding, reg)))                     \
 4436         {                                                               \
 4437           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
 4438           ENCODE_DESIGNATION (charset, reg, coding);                    \
 4439         }                                                               \
 4440   } while (0)
 4441 
 4442 
 4443 /* Produce designation sequences of charsets in the line started from
 4444    SRC to a place pointed by DST, and return updated DST.
 4445 
 4446    If the current block ends before any end-of-line, we may fail to
 4447    find all the necessary designations.  */
 4448 
 4449 static unsigned char *
 4450 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
 4451      struct coding_system *coding;
 4452      int *charbuf, *charbuf_end;
 4453      unsigned char *dst;
 4454 {
 4455   struct charset *charset;
 4456   /* Table of charsets to be designated to each graphic register.  */
 4457   int r[4];
 4458   int c, found = 0, reg;
 4459   int produced_chars = 0;
 4460   int multibytep = coding->dst_multibyte;
 4461   Lisp_Object attrs;
 4462   Lisp_Object charset_list;
 4463 
 4464   attrs = CODING_ID_ATTRS (coding->id);
 4465   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
 4466   if (EQ (charset_list, Qiso_2022))
 4467     charset_list = Viso_2022_charset_list;
 4468 
 4469   for (reg = 0; reg < 4; reg++)
 4470     r[reg] = -1;
 4471 
 4472   while (found < 4)
 4473     {
 4474       int id;
 4475 
 4476       c = *charbuf++;
 4477       if (c == '\n')
 4478         break;
 4479       charset = char_charset (c, charset_list, NULL);
 4480       id = CHARSET_ID (charset);
 4481       reg = CODING_ISO_REQUEST (coding, id);
 4482       if (reg >= 0 && r[reg] < 0)
 4483         {
 4484           found++;
 4485           r[reg] = id;
 4486         }
 4487     }
 4488 
 4489   if (found)
 4490     {
 4491       for (reg = 0; reg < 4; reg++)
 4492         if (r[reg] >= 0
 4493             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
 4494           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
 4495     }
 4496 
 4497   return dst;
 4498 }
 4499 
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

   Encode the characters held in CODING->charbuf (CODING->charbuf_used
   elements) in an ISO-2022 based format, storing the bytes at
   CODING->destination starting from offset CODING->produced.  On
   return, CODING->produced, CODING->produced_char and the
   beginning-of-line state (CODING_ISO_BOL) have been updated and the
   conversion result is recorded as CODING_RESULT_SUCCESS.  Always
   return 0.  */

static int
encode_coding_iso_2022 (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Room requested from ASSURE_DESTINATION before each character is
     handled; presumably an upper bound on the bytes one iteration
     may produce (TODO confirm).  */
  int safe_room = 16;
  /* Nonzero if designation sequences must be produced before the next
     character, i.e. the coding system designates at beginning of line
     and we currently are at a beginning of line.  */
  int bol_designation
    = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
       && CODING_ISO_BOL (coding));
  int produced_chars = 0;
  Lisp_Object attrs, eol_type, charset_list;
  int ascii_compatible;
  int c;
  /* Charset ID taken from the latest charset annotation, or -1.  */
  int preferred_charset_id = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  /* An EOL type that is a vector is treated like Qunix here.  */
  if (VECTORP (eol_type))
    eol_type = Qunix;

  setup_iso_safe_charsets (attrs);
  /* Charset list may have been changed.  */
  charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));

  ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));

  while (charbuf < charbuf_end)
    {
      ASSURE_DESTINATION (safe_room);

      if (bol_designation)
        {
          unsigned char *dst_prev = dst;

          /* We have to produce designation sequences if any now.  */
          dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
          bol_designation = 0;
          /* We are sure that designation sequences are all ASCII bytes.  */
          produced_chars += dst - dst_prev;
        }

      c = *charbuf++;

      if (c < 0)
        {
          /* Handle an annotation.  A negative element introduces one;
             the element after it tells the kind of annotation.  */
          switch (*charbuf)
            {
            case CODING_ANNOTATE_COMPOSITION_MASK:
              /* Not yet implemented.  */
              break;
            case CODING_ANNOTATE_CHARSET_MASK:
              /* Remember the annotated charset as the preferred one,
                 but only if it is a member of the charset list.  */
              preferred_charset_id = charbuf[2];
              if (preferred_charset_id >= 0
                  && NILP (Fmemq (make_number (preferred_charset_id),
                                  charset_list)))
                preferred_charset_id = -1;
              break;
            default:
              abort ();
            }
          /* Skip the remaining -C - 1 elements of the annotation.  */
          charbuf += -c - 1;
          continue;
        }

      /* Now encode the character C.  */
      if (c < 0x20 || c == 0x7F)
        {
          /* C is a control character.  */
          if (c == '\n'
              || (c == '\r' && EQ (eol_type, Qmac)))
            {
              /* End of line.  Reset the state as the flags request and
                 arrange for designation at the next beginning of
                 line.  */
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
                ENCODE_RESET_PLANE_AND_REGISTER ();
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
                {
                  int i;

                  /* Record the initial designations as the current
                     ones without producing any bytes.  */
                  for (i = 0; i < 4; i++)
                    CODING_ISO_DESIGNATION (coding, i)
                      = CODING_ISO_INITIAL (coding, i);
                }
              bol_designation
                = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
            }
          else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
            ENCODE_RESET_PLANE_AND_REGISTER ();
          EMIT_ONE_ASCII_BYTE (c);
        }
      else if (ASCII_CHAR_P (c))
        {
          if (ascii_compatible)
            EMIT_ONE_ASCII_BYTE (c);
          else
            {
              /* Not ASCII compatible: encode C as a character of the
                 ASCII charset through ENCODE_ISO_CHARACTER.  */
              struct charset *charset = CHARSET_FROM_ID (charset_ascii);
              ENCODE_ISO_CHARACTER (charset, c);
            }
        }
      else if (CHAR_BYTE8_P (c))
        {
          /* A raw byte is produced as is.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          struct charset *charset;

          /* Use the preferred charset if it contains C, otherwise
             look C up in the charset list.  */
          if (preferred_charset_id >= 0)
            {
              charset = CHARSET_FROM_ID (preferred_charset_id);
              if (! CHAR_CHARSET_P (c, charset))
                charset = char_charset (c, charset_list, NULL);
            }
          else
            charset = char_charset (c, charset_list, NULL);
          if (!charset)
            {
              /* No charset in the list contains C.  In safe encoding
                 mode produce CODING_INHIBIT_CHARACTER_SUBSTITUTION as
                 an ASCII character, otherwise the default character
                 of CODING.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                {
                  c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
                  charset = CHARSET_FROM_ID (charset_ascii);
                }
              else
                {
                  c = coding->default_char;
                  charset = char_charset (c, charset_list, NULL);
                }
            }
          ENCODE_ISO_CHARACTER (charset, c);
        }
    }

  /* At the end of the last block, reset the state if the coding
     system resets at end of line.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
    {
      ASSURE_DESTINATION (safe_room);
      ENCODE_RESET_PLANE_AND_REGISTER ();
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  /* Save the beginning-of-line state for the next call.  */
  CODING_ISO_BOL (coding) = bol_designation;
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
 4651 
 4652 
/*** 8. SJIS and BIG5 handlers ***/
 4654 
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them in the bare
   C code.  But, in the future, they may be supported only by CCL.  */
 4658 
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.
 4665 
 4666    --- CODE RANGE of SJIS ---
 4667    (character set)      (range)
 4668    ASCII                0x00 .. 0x7F
 4669    KATAKANA-JISX0201    0xA0 .. 0xDF
 4670    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
 4671             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
 4672    -------------------------------
 4673 
 4674 */
 4675 
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.
 4679 
 4680    --- CODE RANGE of BIG5 ---
 4681    (character set)      (range)
 4682    ASCII                0x00 .. 0x7F
 4683    Big5 (1st byte)      0xA1 .. 0xFE
 4684         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
 4685    --------------------------
 4686 
 4687   */
 4688 
 4689 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 4690    Check if a text is encoded in SJIS.  If it is, return
 4691    CATEGORY_MASK_SJIS, else return 0.  */
 4692 
 4693 static int
 4694 detect_coding_sjis (coding, detect_info)
 4695      struct coding_system *coding;
 4696      struct coding_detection_info *detect_info;
 4697 {
 4698   const unsigned char *src = coding->source, *src_base;
 4699   const unsigned char *src_end = coding->source + coding->src_bytes;
 4700   int multibytep = coding->src_multibyte;
 4701   int consumed_chars = 0;
 4702   int found = 0;
 4703   int c;
 4704   Lisp_Object attrs, charset_list;
 4705   int max_first_byte_of_2_byte_code;
 4706 
 4707   CODING_GET_INFO (coding, attrs, charset_list);
 4708   max_first_byte_of_2_byte_code
 4709     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
 4710 
 4711   detect_info->checked |= CATEGORY_MASK_SJIS;
 4712   /* A coding system of this category is always ASCII compatible.  */
 4713   src += coding->head_ascii;
 4714 
 4715   while (1)
 4716     {
 4717       src_base = src;
 4718       ONE_MORE_BYTE (c);
 4719       if (c < 0x80)
 4720         continue;
 4721       if ((c >= 0x81 && c <= 0x9F)
 4722           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
 4723         {
 4724           ONE_MORE_BYTE (c);
 4725           if (c < 0x40 || c == 0x7F || c > 0xFC)
 4726             break;
 4727           found = CATEGORY_MASK_SJIS;
 4728         }
 4729       else if (c >= 0xA0 && c < 0xE0)
 4730         found = CATEGORY_MASK_SJIS;
 4731       else
 4732         break;
 4733     }
 4734   detect_info->rejected |= CATEGORY_MASK_SJIS;
 4735   return 0;
 4736 
 4737  no_more_source:
 4738   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
 4739     {
 4740       detect_info->rejected |= CATEGORY_MASK_SJIS;
 4741       return 0;
 4742     }
 4743   detect_info->found |= found;
 4744   return 1;
 4745 }
 4746 
 4747 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 4748    Check if a text is encoded in BIG5.  If it is, return
 4749    CATEGORY_MASK_BIG5, else return 0.  */
 4750 
 4751 static int
 4752 detect_coding_big5 (coding, detect_info)
 4753      struct coding_system *coding;
 4754      struct coding_detection_info *detect_info;
 4755 {
 4756   const unsigned char *src = coding->source, *src_base;
 4757   const unsigned char *src_end = coding->source + coding->src_bytes;
 4758   int multibytep = coding->src_multibyte;
 4759   int consumed_chars = 0;
 4760   int found = 0;
 4761   int c;
 4762 
 4763   detect_info->checked |= CATEGORY_MASK_BIG5;
 4764   /* A coding system of this category is always ASCII compatible.  */
 4765   src += coding->head_ascii;
 4766 
 4767   while (1)
 4768     {
 4769       src_base = src;
 4770       ONE_MORE_BYTE (c);
 4771       if (c < 0x80)
 4772         continue;
 4773       if (c >= 0xA1)
 4774         {
 4775           ONE_MORE_BYTE (c);
 4776           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
 4777             return 0;
 4778           found = CATEGORY_MASK_BIG5;
 4779         }
 4780       else
 4781         break;
 4782     }
 4783   detect_info->rejected |= CATEGORY_MASK_BIG5;
 4784   return 0;
 4785 
 4786  no_more_source:
 4787   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
 4788     {
 4789       detect_info->rejected |= CATEGORY_MASK_BIG5;
 4790       return 0;
 4791     }
 4792   detect_info->found |= found;
 4793   return 1;
 4794 }
 4795 
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

   Decode the SJIS text of CODING->source, starting at offset
   CODING->consumed, into CODING->charbuf.  The charset list of CODING
   supplies, in this order, the roman, kana and kanji charsets, and
   optionally a second kanji charset.  On return, CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used are updated.  */

static void
decode_coding_sjis (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Where the character being decoded starts.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_kanji, *charset_kana;
  struct charset *charset_kanji2;
  Lisp_Object attrs, charset_list, val;
  int char_offset = coding->produced_char;
  /* LAST_ID is the charset of the current run of non-ASCII
     characters (charset_ascii if there is none), and LAST_OFFSET is
     the character offset at which that run started.  */
  int last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte already read after a CR when EOL_CRLF is nonzero, or -1
     if there is no such pending byte.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);

  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  /* The fourth charset is optional.  */
  charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No more room in the character buffer.  The pending byte
             read after a CR must not be counted as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);
      /* A negative value is not a byte of SJIS text (presumably
         ONE_MORE_BYTE yields it for a non-byte character of a
         multibyte source -- TODO confirm).  */
      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* Read ahead the byte following a CR.  */
          if (eol_crlf && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else if (c == 0x80 || c == 0xA0)
        goto invalid_code;
      else if (c >= 0xA1 && c <= 0xDF)
        {
          /* SJIS -> JISX0201-Kana */
          c &= 0x7F;
          charset = charset_kana;
        }
      else if (c <= 0xEF)
        {
          /* SJIS -> JISX0208 */
          /* Here C is in 0x81..0x9F or 0xE0..0xEF.  */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS (c);
          charset = charset_kanji;
        }
      else if (c <= 0xFC && charset_kanji2)
        {
          /* SJIS -> JISX0213-2 */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS2 (c);
          charset = charset_kanji2;
        }
      else
        goto invalid_code;
      /* When a run of a different non-ASCII charset starts, annotate
         the run that just ended (unless it was ASCII).  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Go back to the start of the offending sequence and produce
         only its first byte, as a raw 8-bit character (or -C if the
         value read is negative), counting one error.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

  /* Also reached by falling out of the loop above; presumably
     ONE_MORE_BYTE jumps here when the source is exhausted (the macro
     is defined elsewhere).  */
 no_more_source:
  /* Annotate the run still pending.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  /* Only what precedes SRC_BASE is reported as consumed.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
 4917 
/* Decode the BIG5 text of CODING->source, starting at offset
   CODING->consumed, into CODING->charbuf.  The charset list of CODING
   supplies, in this order, the roman charset and the Big5 charset.
   On return, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated.  */

static void
decode_coding_big5 (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Where the character being decoded starts.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_big5;
  Lisp_Object attrs, charset_list, val;
  int char_offset = coding->produced_char;
  /* LAST_ID is the charset of the current run of non-ASCII
     characters (charset_ascii if there is none), and LAST_OFFSET is
     the character offset at which that run started.  */
  int last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte already read after a CR when EOL_CRLF is nonzero, or -1
     if there is no such pending byte.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No more room in the character buffer.  The pending byte
             read after a CR must not be counted as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);

      /* A negative value is not a byte of BIG5 text (presumably
         ONE_MORE_BYTE yields it for a non-byte character of a
         multibyte source -- TODO confirm).  */
      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* Read ahead the byte following a CR.  */
          if (eol_crlf && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else
        {
          /* BIG5 -> Big5 */
          /* The first byte must be in 0xA1..0xFE, the second one in
             0x40..0x7E or 0xA1..0xFE.  */
          if (c < 0xA1 || c > 0xFE)
            goto invalid_code;
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
            goto invalid_code;
          c = c << 8 | c1;
          charset = charset_big5;
        }
      /* When a run of a different non-ASCII charset starts, annotate
         the run that just ended (unless it was ASCII).  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Go back to the start of the offending sequence and produce
         only its first byte, as a raw 8-bit character (or -C if the
         value read is negative), counting one error.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

  /* Also reached by falling out of the loop above; presumably
     ONE_MORE_BYTE jumps here when the source is exhausted (the macro
     is defined elsewhere).  */
 no_more_source:
  /* Annotate the run still pending.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  /* Only what precedes SRC_BASE is reported as consumed.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
 5014 
 5015 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
 5016    This function can encode charsets `ascii', `katakana-jisx0201',
 5017    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
 5018    are sure that all these charsets are registered as official charset
 5019    (i.e. do not have extended leading-codes).  Characters of other
 5020    charsets are produced without any encoding.  If SJIS_P is 1, encode
 5021    SJIS text, else encode BIG5 text.  */
 5022 
 5023 static int
 5024 encode_coding_sjis (coding)
 5025      struct coding_system *coding;
 5026 {
 5027   int multibytep = coding->dst_multibyte;
 5028   int *charbuf = coding->charbuf;
 5029   int *charbuf_end = charbuf + coding->charbuf_used;
 5030   unsigned char *dst = coding->destination + coding->produced;
 5031   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 5032   int safe_room = 4;
 5033   int produced_chars = 0;
 5034   Lisp_Object attrs, charset_list, val;
 5035   int ascii_compatible;
 5036   struct charset *charset_roman, *charset_kanji, *charset_kana;
 5037   struct charset *charset_kanji2;
 5038   int c;
 5039 
 5040   CODING_GET_INFO (coding, attrs, charset_list);
 5041   val = charset_list;
 5042   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
 5043   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
 5044   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
 5045   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
 5046 
 5047   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
 5048 
 5049   while (charbuf < charbuf_end)
 5050     {
 5051       ASSURE_DESTINATION (safe_room);
 5052       c = *charbuf++;
 5053       /* Now encode the character C.  */
 5054       if (ASCII_CHAR_P (c) && ascii_compatible)
 5055         EMIT_ONE_ASCII_BYTE (c);
 5056       else if (CHAR_BYTE8_P (c))
 5057         {
 5058           c = CHAR_TO_BYTE8 (c);
 5059           EMIT_ONE_BYTE (c);
 5060         }
 5061       else
 5062         {
 5063           unsigned code;
 5064           struct charset *charset = char_charset (c, charset_list, &code);
 5065 
 5066           if (!charset)
 5067             {
 5068               if (coding->mode & CODING_MODE_SAFE_ENCODING)
 5069                 {
 5070                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
 5071                   charset = CHARSET_FROM_ID (charset_ascii);
 5072                 }
 5073               else
 5074                 {
 5075                   c = coding->default_char;
 5076                   charset = char_charset (c, charset_list, &code);
 5077                 }
 5078             }
 5079           if (code == CHARSET_INVALID_CODE (charset))
 5080             abort ();
 5081           if (charset == charset_kanji)
 5082             {
 5083               int c1, c2;
 5084               JIS_TO_SJIS (code);
 5085               c1 = code >> 8, c2 = code & 0xFF;
 5086               EMIT_TWO_BYTES (c1, c2);
 5087             }
 5088           else if (charset == charset_kana)
 5089             EMIT_ONE_BYTE (code | 0x80);
 5090           else if (charset_kanji2 && charset == charset_kanji2)
 5091             {
 5092               int c1, c2;
 5093 
 5094               c1 = code >> 8;
 5095               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
 5096                   || c1 == 0x28
 5097                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
 5098                 {
 5099                   JIS_TO_SJIS2 (code);
 5100                   c1 = code >> 8, c2 = code & 0xFF;
 5101                   EMIT_TWO_BYTES (c1, c2);
 5102                 }
 5103               else
 5104                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
 5105             }
 5106           else
 5107             EMIT_ONE_ASCII_BYTE (code & 0x7F);
 5108         }
 5109     }
 5110   record_conversion_result (coding, CODING_RESULT_SUCCESS);
 5111   coding->produced_char += produced_chars;
 5112   coding->produced = dst - coding->destination;
 5113   return 0;
 5114 }
 5115 
 5116 static int
 5117 encode_coding_big5 (coding)
 5118      struct coding_system *coding;
 5119 {
 5120   int multibytep = coding->dst_multibyte;
 5121   int *charbuf = coding->charbuf;
 5122   int *charbuf_end = charbuf + coding->charbuf_used;
 5123   unsigned char *dst = coding->destination + coding->produced;
 5124   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 5125   int safe_room = 4;
 5126   int produced_chars = 0;
 5127   Lisp_Object attrs, charset_list, val;
 5128   int ascii_compatible;
 5129   struct charset *charset_roman, *charset_big5;
 5130   int c;
 5131 
 5132   CODING_GET_INFO (coding, attrs, charset_list);
 5133   val = charset_list;
 5134   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
 5135   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
 5136   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
 5137 
 5138   while (charbuf < charbuf_end)
 5139     {
 5140       ASSURE_DESTINATION (safe_room);
 5141       c = *charbuf++;
 5142       /* Now encode the character C.  */
 5143       if (ASCII_CHAR_P (c) && ascii_compatible)
 5144         EMIT_ONE_ASCII_BYTE (c);
 5145       else if (CHAR_BYTE8_P (c))
 5146         {
 5147           c = CHAR_TO_BYTE8 (c);
 5148           EMIT_ONE_BYTE (c);
 5149         }
 5150       else
 5151         {
 5152           unsigned code;
 5153           struct charset *charset = char_charset (c, charset_list, &code);
 5154 
 5155           if (! charset)
 5156             {
 5157               if (coding->mode & CODING_MODE_SAFE_ENCODING)
 5158                 {
 5159                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
 5160                   charset = CHARSET_FROM_ID (charset_ascii);
 5161                 }
 5162               else
 5163                 {
 5164                   c = coding->default_char;
 5165                   charset = char_charset (c, charset_list, &code);
 5166                 }
 5167             }
 5168           if (code == CHARSET_INVALID_CODE (charset))
 5169             abort ();
 5170           if (charset == charset_big5)
 5171             {
 5172               int c1, c2;
 5173 
 5174               c1 = code >> 8, c2 = code & 0xFF;
 5175               EMIT_TWO_BYTES (c1, c2);
 5176             }
 5177           else
 5178             EMIT_ONE_ASCII_BYTE (code & 0x7F);
 5179         }
 5180     }
 5181   record_conversion_result (coding, CODING_RESULT_SUCCESS);
 5182   coding->produced_char += produced_chars;
 5183   coding->produced = dst - coding->destination;
 5184   return 0;
 5185 }
 5186 
 5187 
/*** 9. CCL handlers ***/
 5189 
 5190 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 5191    Check if a text is encoded in a coding system of which
 5192    encoder/decoder are written in CCL program.  If it is, return
 5193    CATEGORY_MASK_CCL, else return 0.  */
 5194 
 5195 static int
 5196 detect_coding_ccl (coding, detect_info)
 5197      struct coding_system *coding;
 5198      struct coding_detection_info *detect_info;
 5199 {
 5200   const unsigned char *src = coding->source, *src_base;
 5201   const unsigned char *src_end = coding->source + coding->src_bytes;
 5202   int multibytep = coding->src_multibyte;
 5203   int consumed_chars = 0;
 5204   int found = 0;
 5205   unsigned char *valids;
 5206   int head_ascii = coding->head_ascii;
 5207   Lisp_Object attrs;
 5208 
 5209   detect_info->checked |= CATEGORY_MASK_CCL;
 5210 
 5211   coding = &coding_categories[coding_category_ccl];
 5212   valids = CODING_CCL_VALIDS (coding);
 5213   attrs = CODING_ID_ATTRS (coding->id);
 5214   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
 5215     src += head_ascii;
 5216 
 5217   while (1)
 5218     {
 5219       int c;
 5220 
 5221       src_base = src;
 5222       ONE_MORE_BYTE (c);
 5223       if (c < 0 || ! valids[c])
 5224         break;
 5225       if ((valids[c] > 1))
 5226         found = CATEGORY_MASK_CCL;
 5227     }
 5228   detect_info->rejected |= CATEGORY_MASK_CCL;
 5229   return 0;
 5230 
 5231  no_more_source:
 5232   detect_info->found |= found;
 5233   return 1;
 5234 }
 5235 
 5236 static void
 5237 decode_coding_ccl (coding)
 5238      struct coding_system *coding;
 5239 {
 5240   const unsigned char *src = coding->source + coding->consumed;
 5241   const unsigned char *src_end = coding->source + coding->src_bytes;
 5242   int *charbuf = coding->charbuf + coding->charbuf_used;
 5243   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 5244   int consumed_chars = 0;
 5245   int multibytep = coding->src_multibyte;
 5246   struct ccl_program *ccl = &coding->spec.ccl->ccl;
 5247   int source_charbuf[1024];
 5248   int source_byteidx[1025];
 5249   Lisp_Object attrs, charset_list;
 5250 
 5251   CODING_GET_INFO (coding, attrs, charset_list);
 5252 
 5253   while (1)
 5254     {
 5255       const unsigned char *p = src;
 5256       int i = 0;
 5257 
 5258       if (multibytep)
 5259         {
 5260           while (i < 1024 && p < src_end)
 5261             {
 5262               source_byteidx[i] = p - src;
 5263               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
 5264             }
 5265           source_byteidx[i] = p - src;
 5266         }
 5267       else
 5268         while (i < 1024 && p < src_end)
 5269           source_charbuf[i++] = *p++;
 5270 
 5271       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
 5272         ccl->last_block = 1;
 5273       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
 5274                   charset_list);
 5275       charbuf += ccl->produced;
 5276       if (multibytep)
 5277         src += source_byteidx[ccl->consumed];
 5278       else
 5279         src += ccl->consumed;
 5280       consumed_chars += ccl->consumed;
 5281       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
 5282         break;
 5283     }
 5284 
 5285   switch (ccl->status)
 5286     {
 5287     case CCL_STAT_SUSPEND_BY_SRC:
 5288       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
 5289       break;
 5290     case CCL_STAT_SUSPEND_BY_DST:
 5291       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
 5292       break;
 5293     case CCL_STAT_QUIT:
 5294     case CCL_STAT_INVALID_CMD:
 5295       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
 5296       break;
 5297     default:
 5298       record_conversion_result (coding, CODING_RESULT_SUCCESS);
 5299       break;
 5300     }
 5301   coding->consumed_char += consumed_chars;
 5302   coding->consumed = src - coding->source;
 5303   coding->charbuf_used = charbuf - coding->charbuf;
 5304 }
 5305 
 5306 static int
 5307 encode_coding_ccl (coding)
 5308      struct coding_system *coding;
 5309 {
 5310   struct ccl_program *ccl = &coding->spec.ccl->ccl;
 5311   int multibytep = coding->dst_multibyte;
 5312   int *charbuf = coding->charbuf;
 5313   int *charbuf_end = charbuf + coding->charbuf_used;
 5314   unsigned char *dst = coding->destination + coding->produced;
 5315   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 5316   int destination_charbuf[1024];
 5317   int i, produced_chars = 0;
 5318   Lisp_Object attrs, charset_list;
 5319 
 5320   CODING_GET_INFO (coding, attrs, charset_list);
 5321   if (coding->consumed_char == coding->src_chars
 5322       && coding->mode & CODING_MODE_LAST_BLOCK)
 5323     ccl->last_block = 1;
 5324 
 5325   while (charbuf < charbuf_end)
 5326     {
 5327       ccl_driver (ccl, charbuf, destination_charbuf,
 5328                   charbuf_end - charbuf, 1024, charset_list);
 5329       if (multibytep)
 5330         {
 5331           ASSURE_DESTINATION (ccl->produced * 2);
 5332           for (i = 0; i < ccl->produced; i++)
 5333             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
 5334         }
 5335       else
 5336         {
 5337           ASSURE_DESTINATION (ccl->produced);
 5338           for (i = 0; i < ccl->produced; i++)
 5339             *dst++ = destination_charbuf[i] & 0xFF;
 5340           produced_chars += ccl->produced;
 5341         }
 5342       charbuf += ccl->consumed;
 5343       if (ccl->status == CCL_STAT_QUIT
 5344           || ccl->status == CCL_STAT_INVALID_CMD)
 5345         break;
 5346     }
 5347 
 5348   switch (ccl->status)
 5349     {
 5350     case CCL_STAT_SUSPEND_BY_SRC:
 5351       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
 5352       break;
 5353     case CCL_STAT_SUSPEND_BY_DST:
 5354       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
 5355       break;
 5356     case CCL_STAT_QUIT:
 5357     case CCL_STAT_INVALID_CMD:
 5358       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
 5359       break;
 5360