From e73391db57e770a8b1d90f7cf53cf248349b7e24 Mon Sep 17 00:00:00 2001 From: openKylinBot Date: Thu, 2 Jun 2022 17:41:00 +0800 Subject: [PATCH] Import Upstream version 0.4.3 --- .travis.yml | 34 + CHANGELOG.md | 268 + CMakeLists.txt | 56 + LICENSE.md | 22 + README.md | 286 + appveyor.yml | 29 + codecov.yml | 4 + md2html/CMakeLists.txt | 15 + md2html/cmdline.c | 296 + md2html/cmdline.h | 86 + md2html/entity.c | 2190 +++++ md2html/entity.h | 42 + md2html/md2html.1 | 113 + md2html/md2html.c | 371 + md2html/render_html.c | 561 ++ md2html/render_html.h | 66 + md4c/CMakeLists.txt | 32 + md4c/md4c.c | 6309 +++++++++++++ md4c/md4c.h | 388 + md4c/md4c.pc.in | 12 + scripts/build_folding_map.py | 118 + scripts/build_punct_map.py | 66 + scripts/build_whitespace_map.py | 66 + scripts/coverity.sh | 70 + scripts/run-tests.sh | 75 + scripts/unicode/CaseFolding.txt | 1581 ++++ scripts/unicode/DerivedGeneralCategory.txt | 4045 ++++++++ test/LICENSE | 64 + test/cmark.py | 40 + test/coverage.txt | 464 + test/fuzz-input/commonmark.md | 41 + test/fuzz-input/gfm.md | 8 + test/fuzz-input/latex-math.md | 1 + test/fuzz-input/wiki.md | 1 + test/latex-math.txt | 39 + test/normalize.py | 194 + test/pathological_tests.py | 122 + test/permissive-email-autolinks.txt | 50 + test/permissive-url-autolinks.txt | 92 + test/permissive-www-autolinks.txt | 107 + test/spec.txt | 9709 ++++++++++++++++++++ test/spec_tests.py | 144 + test/strikethrough.txt | 75 + test/tables.txt | 363 + test/tasklists.txt | 117 + test/underline.txt | 39 + test/wiki-links.txt | 232 + 47 files changed, 29103 insertions(+) create mode 100644 .travis.yml create mode 100644 CHANGELOG.md create mode 100644 CMakeLists.txt create mode 100644 LICENSE.md create mode 100644 README.md create mode 100644 appveyor.yml create mode 100644 codecov.yml create mode 100644 md2html/CMakeLists.txt create mode 100644 md2html/cmdline.c create mode 100644 md2html/cmdline.h create mode 100644 md2html/entity.c create mode 100644 md2html/entity.h 
create mode 100644 md2html/md2html.1 create mode 100644 md2html/md2html.c create mode 100644 md2html/render_html.c create mode 100644 md2html/render_html.h create mode 100644 md4c/CMakeLists.txt create mode 100644 md4c/md4c.c create mode 100644 md4c/md4c.h create mode 100644 md4c/md4c.pc.in create mode 100644 scripts/build_folding_map.py create mode 100644 scripts/build_punct_map.py create mode 100644 scripts/build_whitespace_map.py create mode 100755 scripts/coverity.sh create mode 100755 scripts/run-tests.sh create mode 100644 scripts/unicode/CaseFolding.txt create mode 100644 scripts/unicode/DerivedGeneralCategory.txt create mode 100644 test/LICENSE create mode 100755 test/cmark.py create mode 100644 test/coverage.txt create mode 100644 test/fuzz-input/commonmark.md create mode 100644 test/fuzz-input/gfm.md create mode 100644 test/fuzz-input/latex-math.md create mode 100644 test/fuzz-input/wiki.md create mode 100644 test/latex-math.txt create mode 100755 test/normalize.py create mode 100755 test/pathological_tests.py create mode 100644 test/permissive-email-autolinks.txt create mode 100644 test/permissive-url-autolinks.txt create mode 100644 test/permissive-www-autolinks.txt create mode 100644 test/spec.txt create mode 100755 test/spec_tests.py create mode 100644 test/strikethrough.txt create mode 100644 test/tables.txt create mode 100644 test/tasklists.txt create mode 100644 test/underline.txt create mode 100644 test/wiki-links.txt diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..7a28563 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,34 @@ +# YAML definition for travis-ci.com continuous integration. 
+# See https://docs.travis-ci.com/user/languages/c + +language: c +dist: bionic + +compiler: + - gcc + +addons: + apt: + packages: + - python3 # for running tests + - lcov # for generating code coverage report + +before_script: + - mkdir build + - cd build + # We enforce -Wdeclaration-after-statement because Qt project needs to + # build MD4C with Integrity compiler which chokes whenever a declaration + # is not at the beginning of a block. + - CFLAGS='--coverage -g -O0 -Wall -Wdeclaration-after-statement -Werror' cmake -DCMAKE_BUILD_TYPE=Debug -G 'Unix Makefiles' .. + +script: + - make VERBOSE=1 + +after_success: + - ../scripts/run-tests.sh + # Creating report + - lcov --directory . --capture --output-file coverage.info # capture coverage info + - lcov --remove coverage.info '/usr/*' --output-file coverage.info # filter out system + - lcov --list coverage.info # debug info + # Uploading report to CodeCov + - bash <(curl -s https://codecov.io/bash) || echo "Codecov did not collect coverage reports" diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..e828bdd --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,268 @@ + +# MD4C Change Log + + +## Version 0.4.3 + +New features: + + * With `MD_FLAG_UNDERLINE`, spans enclosed in underscore (`_foo_`) are seen + as underline (`MD_SPAN_UNDERLINE`) rather then an ordinary emphasis or + strong emphasis. + +Changes: + + * The implementation of wiki-links extension (with `MD_FLAG_WIKILINKS`) has + been simplified. + + - A noticeable increase of MD4C's memory footprint introduced by the + extension implementation in 0.4.0 has been removed. + - The priority handling towards other inline elements have been unified. + (This affects an obscure case where syntax of an image was in place of + wiki-link destination made the wiki-link invalid. Now *all* inline spans + in the wiki-link destination, including the images, is suppressed.) 
+ - The length limitation of 100 characters now always applies to wiki-link + destination. + + * Recognition of strike-through spans (with the flag `MD_FLAG_STRIKETHROUGH`) + has become much stricter and, arguably, reasonable. + + - Only single tildes (`~`) and double tildes (`~~`) are recognized as + strike-through marks. Longer ones are not anymore. + - The length of the opener and closer marks has to be the same. + - The tildes cannot open a strike-through span if a whitespace follows. + - The tildes cannot close a strike-through span if a whitespace precedes. + + This change follows the changes of behavior in cmark-gfm some time ago, so + it is also beneficial from a compatibility point of view. + + * When building MD4C by hand instead of using its CMake-based build, the UTF-8 + support was by default disabled, unless explicitly asked for by defining + a preprocessor macro `MD4C_USE_UTF8`. + + This has been changed and the UTF-8 mode now becomes the default, no matter + how `md4c.c` is compiled. If you need to disable it and use the ASCII-only + mode, you have to explicitly define macro `MD4C_USE_ASCII` when compiling it. + + (The CMake-based build as provided in our repository explicitly asked for + the UTF-8 support with `-DMD4C_USE_UTF8`. I.e. if you are using MD4C library + built with our vanilla `CMakeLists.txt` files, this change should not affect + you.) + +Fixes: + + * Fixed some string length handling in the special `MD4C_USE_UTF16` build. + + (This does not affect you unless you are on Windows and explicitly define + the macro when building MD4C.) + + * [#100](https://github.com/mity/md4c/issues/100): + Fixed an off-by-one error in the maximal length limit of some segments + of e-mail addresses used in autolinks. + + * [#107](https://github.com/mity/md4c/issues/107): + Fix mis-detection of asterisk-encoded emphasis in some corner cases when + length of the opener and closer differs, as in `***foo *bar baz***`. 
+ + +## Version 0.4.2 + +Fixes: + + * [#98](https://github.com/mity/md4c/issues/98): + Fix mis-detection of asterisk-encoded emphasis in some corner cases when + length of the opener and closer differs, as in `**a *b c** d*`. + + +## Version 0.4.1 + +Unfortunately, 0.4.0 has been released with badly updated ChangeLog. Fixing +this is the only change in 0.4.1. + + +## Version 0.4.0 + +New features: + + * With `MD_FLAG_LATEXMATHSPANS`, LaTeX math spans (`$...$`) and LaTeX display + math spans (`$$...$$`) are now recognized. (Note though that the HTML + renderer outputs them verbatim in a custom `<x-equation>` tag.) + + Contributed by [Tilman Roeder](https://github.com/dyedgreen). + + * With `MD_FLAG_WIKILINKS`, Wiki-style links (`[[...]]`) are now recognized. + (Note though that the HTML renderer renders them as a custom `<x-wikilink>` + tag.) + + Contributed by [Nils Blomqvist](https://github.com/niblo). + +Changes: + + * Parsing of tables (with `MD_FLAG_TABLES`) is now closer to the way + cmark-gfm parses tables as we do not require every row of the table to + contain a pipe `|` anymore. + + As a consequence, paragraphs now cannot interrupt tables. A paragraph which + follows the table has to be delimited with a blank line. + +Fixes: + + * [#94](https://github.com/mity/md4c/issues/94): + `md_build_ref_def_hashtable()`: Do not allocate more memory than strictly + needed. + + * [#95](https://github.com/mity/md4c/issues/95): + `md_is_container_mark()`: Ordered list mark requires at least one digit. + + * [#96](https://github.com/mity/md4c/issues/96): + Some fixes for link label comparison. + + +## Version 0.3.4 + +Changes: + + * Make Unicode-specific code compliant to Unicode 12.1. + + * Structure `MD_BLOCK_CODE_DETAIL` got new member `fenced_char`. Application + can use it to detect character used to form the block fences (`` ` `` or + `~`). In the case of indented code block, it is set to zero. 
+ +Fixes: + + * [#77](https://github.com/mity/md4c/issues/77): + Fix maximal count of digits for numerical character references, as requested + by CommonMark specification 0.29. + + * [#78](https://github.com/mity/md4c/issues/78): + Fix link reference definition label matching for Unicode characters where + the folding mapping leads to multiple codepoints, as e.g. in `ẞ` -> `SS`. + + * [#83](https://github.com/mity/md4c/issues/83): + Fix recognition of an empty blockquote which interrupts a paragraph. + + +## Version 0.3.3 + +Changes: + + * Make permissive URL autolink and permissive WWW autolink extensions stricter. + + This brings the behavior closer to GFM and mitigates risk of false positives. + In particular, the domain has to contain at least one dot and parenthesis + can be part of the link destination only if `(` and `)` are balanced. + +Fixes: + + * [#73](https://github.com/mity/md4c/issues/73): + Some raw HTML inputs could lead to quadratic parsing times. + + * [#74](https://github.com/mity/md4c/issues/74): + Fix input leading to a crash. Found by fuzzing. + + * [#76](https://github.com/mity/md4c/issues/76): + Fix handling of parenthesis in some corner cases of permissive URL autolink + and permissive WWW autolink extensions. + + +## Version 0.3.2 + +Changes: + + * Changes mandated by CommonMark specification 0.29. + + Most importantly, the white-space trimming rules for code spans have changed. + At most one space/newline is trimmed from beginning/end of the code span + (if the code span contains some non-space contents, and if it begins and + ends with space at the same time). In all other cases the spaces in the code + span are now left intact. + + Other changes in behavior are in corner cases only. Refer to [CommonMark + 0.29 notes](https://github.com/commonmark/commonmark-spec/releases/tag/0.29) + for more info. 
+ +Fixes: + + * [#68](https://github.com/mity/md4c/issues/68): + Some specific HTML blocks were not recognized when EOF follows without any + end-of-line character. + + * [#69](https://github.com/mity/md4c/issues/69): + Strike-through span not working correctly when its opener mark is directly + followed by other opener mark; or when other closer mark directly precedes + its closer mark. + + +## Version 0.3.1 + +Fixes: + + * [#58](https://github.com/mity/md4c/issues/58), + [#59](https://github.com/mity/md4c/issues/59), + [#60](https://github.com/mity/md4c/issues/60), + [#63](https://github.com/mity/md4c/issues/63), + [#66](https://github.com/mity/md4c/issues/66): + Some inputs could lead to quadratic parsing times. Thanks to Anders Kaseorg + for finding all those issues. + + * [#61](https://github.com/mity/md4c/issues/61): + Flag `MD_FLAG_NOHTMLSPANS` erroneously affected also recognition of + CommonMark autolinks. + + +## Version 0.3.0 + +New features: + + * Add extension for GitHub-style task lists: + + ``` + * [x] foo + * [x] bar + * [ ] baz + ``` + + (It has to be explicitly enabled with `MD_FLAG_TASKLISTS`.) + + * Added support for building as a shared library. On non-Windows platforms, + this is now default behavior; on Windows static library is still the default. + The CMake option `BUILD_SHARED_LIBS` can be used to request one or the other + explicitly. + + Contributed by Lisandro Damián Nicanor Pérez Meyer. + + * Renamed structure `MD_RENDERER` to `MD_PARSER` and refactored its contents + a little bit. Note this is source-level incompatible and initialization code + in apps may need to be updated. + + The aim of the change is to be more friendly for long-term ABI compatibility + we shall maintain, starting with this release. + + * Added `CHANGELOG.md` (this file). + + * Make sure `md_process_table_row()` reports the same count of table cells for + all table rows, no matter how broken the input is. The cell count is derived + from table underline line. 
Bogus cells in other rows are silently ignored. + Missing cells in other rows are reported as empty ones. + +Fixes: + + * CID 1475544: + Calling `md_free_attribute()` on uninitialized data. + + * [#47](https://github.com/mity/md4c/issues/47): + Using bad offsets in `md_is_entity_str()`, in some cases leading to buffer + overflow. + + * [#51](https://github.com/mity/md4c/issues/51): + Segfault in `md_process_table_cell()`. + + * [#53](https://github.com/mity/md4c/issues/53): + With `MD_FLAG_PERMISSIVEURLAUTOLINKS` or `MD_FLAG_PERMISSIVEWWWAUTOLINKS` + we could generate bad output for ordinary Markdown links, if a non-space + character immediately follows like e.g. in `[link](http://github.com)X`. + + +## Version 0.2.7 + +This was the last version before the changelog has been added. diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..e3de012 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,56 @@ + +cmake_minimum_required(VERSION 3.4) +project(MD4C C) + +set(MD_VERSION_MAJOR 0) +set(MD_VERSION_MINOR 4) +set(MD_VERSION_RELEASE 3) +set(MD_VERSION "${MD_VERSION_MAJOR}.${MD_VERSION_MINOR}.${MD_VERSION_RELEASE}") + +if(WIN32) + # On Windows, given there is no standard lib install dir etc., we rather + # by default build static lib. + option(BUILD_SHARED_LIBS "help string describing option" OFF) +else() + # On Linux, MD4C is slowly being adding into some distros which prefer + # shared lib. 
+ option(BUILD_SHARED_LIBS "help string describing option" ON) +endif() + +add_definitions( + -DMD_VERSION_MAJOR=${MD_VERSION_MAJOR} + -DMD_VERSION_MINOR=${MD_VERSION_MINOR} + -DMD_VERSION_RELEASE=${MD_VERSION_RELEASE} +) + +set(CMAKE_CONFIGURATION_TYPES Debug Release RelWithDebInfo MinSizeRel) +if("${CMAKE_BUILD_TYPE}" STREQUAL "") + set(CMAKE_BUILD_TYPE $ENV{CMAKE_BUILD_TYPE}) + + if("${CMAKE_BUILD_TYPE}" STREQUAL "") + set(CMAKE_BUILD_TYPE "Release") + endif() +endif() + + +if(${CMAKE_C_COMPILER_ID} MATCHES GNU|Clang) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") +elseif(MSVC) + # Disable warnings about the so-called unsecured functions: + add_definitions(/D_CRT_SECURE_NO_WARNINGS) + + # Specify proper C runtime library: + string(REGEX REPLACE "/M[DT]d?" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") + string(REGEX REPLACE "/M[DT]d?" "" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}") + string(REGEX REPLACE "/M[DT]d?" "" CMAKE_C_FLAGS_RELWITHDEBINFO "{$CMAKE_C_FLAGS_RELWITHDEBINFO}") + string(REGEX REPLACE "/M[DT]d?" 
"" CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_MINSIZEREL}") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /MT") + set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELEASE} /MT") + set(CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_RELEASE} /MT") +endif() + +include(GNUInstallDirs) + +add_subdirectory(md4c) +add_subdirectory(md2html) diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..2088ba4 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,22 @@ + +# The MIT License (MIT) + +Copyright © 2016-2020 Martin Mitáš + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the “Software”), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..4804281 --- /dev/null +++ b/README.md @@ -0,0 +1,286 @@ +[![Linux Build Status (travis-ci.com)](https://img.shields.io/travis/mity/md4c/master.svg?logo=linux&label=linux%20build)](https://travis-ci.org/mity/md4c) +[![Windows Build Status (appveyor.com)](https://img.shields.io/appveyor/ci/mity/md4c/master.svg?logo=windows&label=windows%20build)](https://ci.appveyor.com/project/mity/md4c/branch/master) +[![Code Coverage Status (codecov.io)](https://img.shields.io/codecov/c/github/mity/md4c/master.svg?logo=codecov&label=code%20coverage)](https://codecov.io/github/mity/md4c) +[![Coverity Scan Status](https://img.shields.io/coverity/scan/mity-md4c.svg?label=coverity%20scan)](https://scan.coverity.com/projects/mity-md4c) + + +# MD4C Readme + +* Home: http://github.com/mity/md4c +* Wiki: http://github.com/mity/md4c/wiki +* Issue tracker: http://github.com/mity/md4c/issues + +MD4C stands for "Markdown for C" and that's exactly what this project is about. + + +## What is Markdown + +In short, Markdown is the markup language this `README.md` file is written in. + +The following resources can explain more if you are unfamiliar with it: +* [Wikipedia article](http://en.wikipedia.org/wiki/Markdown) +* [CommonMark site](http://commonmark.org) + + +## What is MD4C + +MD4C is a C Markdown parser with the following features: + +* **Compliance:** Generally MD4C aims to be compliant to the latest version of + [CommonMark specification](http://spec.commonmark.org/). Currently, we are + fully compliant to CommonMark 0.29. + +* **Extensions:** MD4C supports some commonly requested and accepted extensions. + See below. + +* **Compactness:** MD4C is implemented in one source file and one header file. + There are no dependencies other than standard C library. + +* **Embedding:** MD4C is easy to reuse in other projects, its API is very + straightforward: There is actually just one function, `md_parse()`. 
+ +* **Push model:** MD4C parses the complete document and calls few callback + functions provided by the application to inform it about a start/end of + every block, a start/end of every span, and with any textual contents. + +* **Portability:** MD4C builds and works on Windows and POSIX-compliant OSes. + (It should be simple to make it run also on most other platforms, at least as + long as the platform provides C standard library, including a heap memory + management.) + +* **Encoding:** MD4C can be compiled to recognize ASCII-only control characters, + UTF-8 and, on Windows, also UTF-16 (i.e. what is on Windows commonly called + just "Unicode"). See more details below. + +* **Permissive license:** MD4C is available under the MIT license. + +* **Performance:** MD4C is [very fast](https://talk.commonmark.org/t/2520). + + +## Using MD4C + +Application has to include the header `md4c.h` and link against MD4C library; +or alternatively it may include `md4c.h` and `md4c.c` directly into its source +base as the parser is only implemented in the single C source file. + +The main provided function is `md_parse()`. It takes a text in the Markdown +syntax and a pointer to a structure which provides pointers to several callback +functions. + +As `md_parse()` processes the input, it calls the callbacks (when entering or +leaving any Markdown block or span; and when outputting any textual content of +the document), allowing application to convert it into another format or render +it onto the screen. + +An example implementation of simple renderer is available in the `md2html` +directory which implements a conversion utility from Markdown to HTML. + + +## Markdown Extensions + +The default behavior is to recognize only Markdown syntax defined by the +[CommonMark specification](http://spec.commonmark.org/). 
+ +However with appropriate flags, the behavior can be tuned to enable some +additional extensions: + +* With the flag `MD_FLAG_COLLAPSEWHITESPACE`, a non-trivial whitespace is + collapsed into a single space. + +* With the flag `MD_FLAG_TABLES`, GitHub-style tables are supported. + +* With the flag `MD_FLAG_TASKLISTS`, GitHub-style task lists are supported. + +* With the flag `MD_FLAG_STRIKETHROUGH`, strike-through spans are enabled + (text enclosed in tilde marks, e.g. `~foo bar~`). + +* With the flag `MD_FLAG_PERMISSIVEURLAUTOLINKS` permissive URL autolinks + (not enclosed in `<` and `>`) are supported. + +* With the flag `MD_FLAG_PERMISSIVEEMAILAUTOLINKS`, permissive e-mail + autolinks (not enclosed in `<` and `>`) are supported. + +* With the flag `MD_FLAG_PERMISSIVEWWWAUTOLINKS` permissive WWW autolinks + without any scheme specified (e.g. `www.example.com`) are supported. MD4C + then assumes `http:` scheme. + +* With the flag `MD_FLAG_LATEXMATHSPANS` LaTeX math spans (`$...$`) and + LaTeX display math spans (`$$...$$`) are supported. (Note though that the + HTML renderer outputs them verbatim in a custom tag `<x-equation>`.) + +* With the flag `MD_FLAG_WIKILINKS`, wiki-style links (`[[link label]]` and + `[[target article|link label]]`) are supported. (Note that the HTML renderer + outputs them in a custom tag `<x-wikilink>`.) + +* With the flag `MD_FLAG_UNDERLINE`, underscore (`_`) denotes an underline + instead of an ordinary emphasis or strong emphasis. + +Few features of CommonMark (those some people see as mis-features) may be +disabled: + +* With the flag `MD_FLAG_NOHTMLSPANS` or `MD_FLAG_NOHTMLBLOCKS`, raw inline + HTML or raw HTML blocks respectively are disabled. + +* With the flag `MD_FLAG_NOINDENTEDCODEBLOCKS`, indented code blocks are + disabled. + + +## Input/Output Encoding + +The CommonMark specification generally assumes UTF-8 input, but under closer +inspection, Unicode plays a role in only a few very specific situations when parsing +Markdown documents: + +1. 
For detection of word boundaries when processing emphasis and strong + emphasis, some classification of Unicode characters (whether it is + a whitespace or a punctuation) is needed. + +2. For (case-insensitive) matching of a link reference label with the + corresponding link reference definition, Unicode case folding is used. + +3. For translating HTML entities (e.g. `&amp;`) and numeric character + references (e.g. `&#35;` or `&#xcab;`) into their Unicode equivalents. + + However MD4C leaves this translation on the renderer/application; as the + renderer is supposed to really know output encoding and whether it really + needs to perform this kind of translation. (For example, when the renderer + outputs HTML, it may leave the entities untranslated and defer the work to + a web browser.) + +MD4C relies on this property of the CommonMark and the implementation is, to +a large degree, encoding-agnostic. Most of MD4C code only assumes that the +encoding of your choice is compatible with ASCII, i.e. that the codepoints +below 128 have the same numeric values as ASCII. + +Any input MD4C does not understand is simply seen as part of the document text +and sent to the renderer's callback functions unchanged. + +The two situations (word boundary detection and link reference matching) where +MD4C has to understand Unicode are handled as specified by the following rules: + +* If preprocessor macro `MD4C_USE_UTF8` is defined, MD4C assumes UTF-8 for the + word boundary detection and for the case-insensitive matching of link labels. + + When none of these macros is explicitly used, this is the default behavior. + +* On Windows, if preprocessor macro `MD4C_USE_UTF16` is defined, MD4C uses + `WCHAR` instead of `char` and assumes UTF-16 encoding in those situations. + (UTF-16 is what Windows developers usually call just "Unicode" and what + Win32API generally works with.) 
+ + Note that because this macro affects also the types in `md4c.h`, you have + to define the macro both when building MD4C as well as when including + `md4c.h`. + + Also note this is only supported in the parser (`md4c.[hc]`). The HTML + renderer does not support this and you will have to write your own custom + renderer to use this feature. + +* If preprocessor macro `MD4C_USE_ASCII` is defined, MD4C assumes nothing but + an ASCII input. + + That effectively means that non-ASCII whitespace or punctuation characters + won't be recognized as such and that link reference matching will work in + a case-insensitive way only for ASCII letters (`[a-zA-Z]`). + + +## Documentation + +The API is quite well documented in the comments in the `md4c.h` header. + +There is also [project wiki](http://github.com/mity/md4c/wiki) which provides +some more comprehensive documentation. However note it is incomplete and some +details may be little-bit outdated. + + +## FAQ + +**Q: In my code, I need to convert Markdown to HTML. How?** + +**A:** Indeed the API, as provided by `md4c.h`, is just a SAX-like Markdown +parser. Nothing more and nothing less. + +That said, there is a complete HTML generator built on top of the parser in the +directory `md2html` (the files `render_html.[hc]` and `entity.[hc]`). At this +time, you have to directly reuse that code in your project. + +There is [some discussion](https://github.com/mity/md4c/issues/82) whether this +should be changed (and how) in the future. + +**Q: How does MD4C compare to a parser XY?** + +**A:** Some other implementations combine Markdown parser and HTML generator +into a single entangled code hidden behind an interface which just allows the +conversion from Markdown to HTML, and they are unusable if you want to process +the input in any other way. 
+ +Even when the parsing is available as a standalone feature, most parsers (if +not all of them; at least within the scope of C/C++ language) are full DOM-like +parsers: They construct abstract syntax tree (AST) representation of the whole +Markdown document. That takes time and it leads to bigger memory footprint. + +It's completely fine as long as you really need it. If you don't need the full +AST, there is very high chance that using MD4C will be faster and much less +memory-hungry. + +Last but not least, some Markdown parsers are implemented in a naive way. When +fed with a [smartly crafted input pattern](test/pathological_tests.py), they +may exhibit quadratic (or even worse) parsing times. What MD4C can still parse +in a fraction of second may turn into long minutes or possibly hours with them. +Hence, when such a naive parser is used to process an input from an untrusted +source, the possibility of denial-of-service attacks becomes a real danger. + +A lot of our effort went into providing linear parsing times no matter what +kind of crazy input MD4C parser is fed with. (If you encounter an input pattern +which leads to a super-linear parsing times, please do not hesitate and report it +as a bug.) + +**Q: Does MD4C perform any input validation?** + +**A:** No. + +CommonMark specification declares that any sequence of (Unicode) characters is +a valid Markdown document; i.e. that it does not matter whether some Markdown +syntax is in some way broken or not. If it is broken, it will simply not be +recognized and the parser should see the broken syntax construction just as a +verbatim text. + +MD4C takes this a step further. It sees any sequence of bytes as a valid input, +following completely the GIGO philosophy (garbage in, garbage out). + +If you need to validate that the input is, say, a valid UTF-8 document, you +have to do it on your own. You can simply validate the whole Markdown document +before passing it to the MD4C parser. 
+ +Alternatively, you may perform the validation on the fly during the parsing, +in the `MD_PARSER::text()` callback. (Given how MD4C works internally, it will +never break a sequence of bytes into multiple calls of `MD_PARSER::text()`, +unless that sequence is already broken to multiple pieces in the input by some +whitespace, new line character(s) and/or any Markdown syntax construction.) + + +## License + +MD4C is covered with MIT license, see the file `LICENSE.md`. + + +## Links to Related Projects + +Ports and bindings to other languages: + +* [commonmark-d](https://github.com/AuburnSounds/commonmark-d): + Port of MD4C to D language. + +* [markdown-wasm](https://github.com/rsms/markdown-wasm): + Markdown parser and HTML generator for WebAssembly, based on MD4C. + +Software using MD4C: + +* [Qt](https://www.qt.io/): + Cross-platform C++ GUI framework. + +* [Textosaurus](https://github.com/martinrotter/textosaurus): + Cross-platform text editor based on Qt and Scintilla. + +* [8th](https://8th-dev.com/): + Cross-platform concatenative programming language. diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 0000000..d4bcaff --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,29 @@ +# YAML definition for Appveyor.com continuous integration. +# See http://www.appveyor.com/docs/appveyor-yml + +version: '{branch}-{build}' + +before_build: + - 'cmake --version' + - 'if "%PLATFORM%"=="x64" cmake -G "Visual Studio 12 Win64" .' + - 'if not "%PLATFORM%"=="x64" cmake -G "Visual Studio 12" .' + +build: + project: md4c.sln + verbosity: detailed + +skip_tags: true + +os: + - Windows Server 2012 R2 + +configuration: + - Debug + - Release + +platform: + - x64 # 64-bit build + - win32 # 32-bit build + +artifacts: + - path: $(configuration)/md2html/md2html.exe diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000..9cb19ea --- /dev/null +++ b/codecov.yml @@ -0,0 +1,4 @@ +# YAML definition for codecov.io code coverage reports. 
+ +ignore: + - "md2html" diff --git a/md2html/CMakeLists.txt b/md2html/CMakeLists.txt new file mode 100644 index 0000000..2988b9c --- /dev/null +++ b/md2html/CMakeLists.txt @@ -0,0 +1,15 @@ + +include_directories("${PROJECT_SOURCE_DIR}/md4c") + +add_executable(md2html cmdline.c cmdline.h entity.c entity.h md2html.c render_html.c render_html.h) +target_link_libraries(md2html md4c) + +install( + TARGETS md2html + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) + +install(FILES "md2html.1" DESTINATION "${CMAKE_INSTALL_MANDIR}/man1") diff --git a/md2html/cmdline.c b/md2html/cmdline.c new file mode 100644 index 0000000..af482ae --- /dev/null +++ b/md2html/cmdline.c @@ -0,0 +1,308 @@ +/* cmdline.c: a reentrant version of getopt(). Written 2006 by Brian + * Raiter. This code is in the public domain. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include "cmdline.h" + +#define docallback(opt, val) \ + do { if ((r = callback(opt, val, data)) != 0) return r; } while (0) + +/* Parse the given cmdline arguments, invoking callback once for each + * option and non-option argument found. Returns -1 if list or callback + * is NULL, otherwise the first nonzero value returned by callback, or + * zero once every argument has been processed. + */ +int readoptions(option const* list, int argc, char **argv, + int (*callback)(int, char const*, void*), void *data) +{ + char argstring[] = "--"; + option const *opt; + char const *val; + char const *p; + int stop = 0; + int argi, len, r; + + if (!list || !callback) + return -1; + + for (argi = 1 ; argi < argc ; ++argi) + { + /* First, check for "--", which forces all remaining arguments + * to be treated as non-options. + */ + if (!stop && argv[argi][0] == '-' && argv[argi][1] == '-' + && argv[argi][2] == '\0') { + stop = 1; + continue; + } + + /* Arguments that do not begin with '-' (or are only "-") are + * not options.
+ */ + if (stop || argv[argi][0] != '-' || argv[argi][1] == '\0') { + docallback(0, argv[argi]); + continue; + } + + if (argv[argi][1] == '-') + { + /* Arguments that begin with a double-dash are long + * options. + */ + p = argv[argi] + 2; + val = strchr(p, '='); + if (val) + len = val++ - p; + else + len = strlen(p); + + /* Is it on the list of valid options? If so, does it + * expect a parameter? + */ + for (opt = list ; opt->optval ; ++opt) + if (opt->name && !strncmp(p, opt->name, len) + && !opt->name[len]) + break; + if (!opt->optval) { + docallback('?', argv[argi]); + } else if (!val && opt->arg == 1) { + docallback(':', argv[argi]); + } else if (val && opt->arg == 0) { + docallback('=', argv[argi]); + } else { + docallback(opt->optval, val); + } + } + else + { + /* Arguments that begin with a single dash contain one or + * more short options. Each character in the argument is + * examined in turn, unless a parameter consumes the rest + * of the argument (or possibly even the following + * argument). + */ + for (p = argv[argi] + 1 ; *p ; ++p) { + for (opt = list ; opt->optval ; ++opt) + if (opt->chname == *p) + break; + if (!opt->optval) { + argstring[1] = *p; + docallback('?', argstring); + continue; + } else if (opt->arg == 0) { + docallback(opt->optval, NULL); + continue; + } else if (p[1]) { + docallback(opt->optval, p + 1); + break; + } else if (argi + 1 < argc && strcmp(argv[argi + 1], "--")) { + ++argi; + docallback(opt->optval, argv[argi]); + break; + } else if (opt->arg == 2) { + docallback(opt->optval, NULL); + continue; + } else { + argstring[1] = *p; + docallback(':', argstring); + break; + } + } + } + } + return 0; +} + +/* Verify that str points to an ASCII zero or one (optionally with + * whitespace) and return the value present, or -1 if str's contents + * are anything else.
+ */ +static int readboolvalue(char const *str) +{ + char d; + + while (isspace((unsigned char)*str)) + ++str; + if (!*str) + return -1; + d = *str++; + while (isspace((unsigned char)*str)) + ++str; + if (*str) + return -1; + if (d == '0') + return 0; + else if (d == '1') + return 1; + else + return -1; +} + +/* Parse a configuration file, invoking callback once for each option + * line read from fp. Returns the first nonzero value returned by + * callback; otherwise -1 if a read error occurred on fp, or zero. + */ +int readcfgfile(option const* list, FILE *fp, + int (*callback)(int, char const*, void*), void *data) +{ + char buf[1024]; + option const *opt; + char *name, *val, *p; + int len, f, r; + + while (fgets(buf, sizeof buf, fp) != NULL) + { + /* Strip off the trailing newline and any leading whitespace. + * If the line begins with a hash sign, skip it entirely. + */ + len = strlen(buf); + if (len && buf[len - 1] == '\n') + buf[--len] = '\0'; + for (p = buf ; isspace((unsigned char)*p) ; ++p) ; + if (!*p || *p == '#') + continue; + + /* Find the end of the option's name and the beginning of the + * parameter, if any. + */ + for (name = p ; *p && *p != '=' && !isspace((unsigned char)*p) ; ++p) ; + len = p - name; + for ( ; *p == '=' || isspace((unsigned char)*p) ; ++p) ; + val = p; + + /* Is it on the list of valid options? Does it take a + * full parameter, or just an optional boolean? + */ + for (opt = list ; opt->optval ; ++opt) + if (opt->name && !strncmp(name, opt->name, len) + && !opt->name[len]) + break; + if (!opt->optval) { + docallback('?', name); + } else if (!*val && opt->arg == 1) { + docallback(':', name); + } else if (*val && opt->arg == 0) { + f = readboolvalue(val); + if (f < 0) + docallback('=', name); + else if (f == 1) + docallback(opt->optval, NULL); + } else { + docallback(opt->optval, val); + } + } + return ferror(fp) ? -1 : 0; +} + +/* Turn a string containing a cmdline into an argc-argv pair. Returns + * zero if cmdline is NULL or memory cannot be allocated, and nonzero + * otherwise. When argvp is non-NULL, argv[0] is set to NULL; the count + * stored through argcp includes that slot. + */ +int makecmdline(char const *cmdline, int *argcp, char ***argvp) +{ + char **argv; + int argc; + char const *s; + int n, quoted; + + if (!cmdline) + return 0; + + /* Calculate argc by counting the number of "clumps" of non-spaces.
+ */ + for (s = cmdline ; isspace((unsigned char)*s) ; ++s) ; + if (!*s) { + *argcp = 1; + if (argvp) { + *argvp = malloc(2 * sizeof(char*)); + if (!*argvp) + return 0; + (*argvp)[0] = NULL; + (*argvp)[1] = NULL; + } + return 1; + } + for (argc = 2, quoted = 0 ; *s ; ++s) { + if (quoted == '"') { + if (*s == '"') + quoted = 0; + else if (*s == '\\' && s[1]) + ++s; + } else if (quoted == '\'') { + if (*s == '\'') + quoted = 0; + } else { + if (isspace((unsigned char)*s)) { + for ( ; isspace((unsigned char)s[1]) ; ++s) ; + if (!s[1]) + break; + ++argc; + } else if (*s == '"' || *s == '\'') { + quoted = *s; + } + } + } + + *argcp = argc; + if (!argvp) + return 1; + + /* Allocate space for all the arguments and their pointers. + */ + argv = malloc((argc + 1) * sizeof(char*) + strlen(cmdline) + 1); + *argvp = argv; + if (!argv) + return 0; + argv[0] = NULL; + argv[1] = (char*)(argv + argc + 1); + + /* Copy the string into the allocated memory immediately after the + * argv array. Where a space immediately follows a nonspace, + * replace it with a \0. Where a nonspace immediately follows + * spaces, store a pointer to it. (Except, of course, when the + * space-nonspace transitions occur within quotes.) + */ + for (s = cmdline ; isspace((unsigned char)*s) ; ++s) ; + for (argc = 1, n = 0, quoted = 0 ; *s ; ++s) { + if (quoted == '"') { + if (*s == '"') { + quoted = 0; + } else { + if (*s == '\\' && s[1]) + ++s; + argv[argc][n++] = *s; + } + } else if (quoted == '\'') { + if (*s == '\'') + quoted = 0; + else + argv[argc][n++] = *s; + } else { + if (isspace((unsigned char)*s)) { + argv[argc][n] = '\0'; + for ( ; isspace((unsigned char)s[1]) ; ++s) ; + if (!s[1]) + break; + argv[argc + 1] = argv[argc] + n + 1; + ++argc; + n = 0; + } else { + if (*s == '"' || *s == '\'') + quoted = *s; + else + argv[argc][n++] = *s; + } + } + } + /* The loop above only terminates an argument when whitespace + * follows it, so terminate the final argument explicitly. + */ + argv[argc][n] = '\0'; + argv[argc + 1] = NULL; + return 1; +} diff --git a/md2html/cmdline.h b/md2html/cmdline.h new file mode 100644 index 0000000..66c97a9 --- /dev/null +++ b/md2html/cmdline.h @@ -0,0 +1,86 @@ +/* cmdline.h: a reentrant version of getopt().
Written 2006 by Brian + * Raiter. This code is in the public domain. + */ + +#ifndef _cmdline_h_ +#define _cmdline_h_ + +/* The information specifying a single cmdline option. + */ +typedef struct option { + char const *name; /* the option's long name, or "" if none */ + char chname; /* a single-char name, or zero if none */ + int optval; /* a unique nonzero value for this option */ + int arg; /* 0 = no arg, 1 = arg req'd, 2 = optional */ +} option; + +/* Parse the given cmdline arguments. list is an array of option + * structs, each entry specifying a valid option. The last struct in + * the array must have optval set to zero. argc and argv give the + * cmdline to parse. callback is the function to call for each option + * and non-option found on the cmdline. data is a pointer that is + * passed to each invocation of callback. The return value of callback + * should be zero to continue processing the cmdline, or any other + * value to abort. The return value of readoptions() is the value + * returned from the last callback, or zero if no arguments were + * found, or -1 if an error occurred. + * + * When readoptions() encounters a regular cmdline argument (i.e. a + * non-option argument), callback() is invoked with opt equal to zero + * and val pointing to the argument. When an option is found, + * callback() is invoked with opt equal to the optval field in the + * option struct corresponding to that option, and val points to the + * option's parameter, or is NULL if the option does not take a + * parameter. If readoptions() finds an option that does not appear in + * the list of valid options, callback() is invoked with opt equal to + * '?'. If readoptions() encounters an option that is missing its + * required parameter, callback() is invoked with opt equal to ':'. If + * readoptions() finds a parameter on a long option that does not + * admit a parameter, callback() is invoked with opt equal to '='.
In + * each of these cases, val will point to the erroneous option + * argument. + */ +extern int readoptions(option const* list, int argc, char **argv, + int (*callback)(int opt, char const *val, void *data), + void *data); + +/* Parse the given file. list is an array of option structs, in the + * same form as taken by readoptions(). fp is a pointer to an open + * text file. callback is the function to call for each line found in + * the configuration file. data is a pointer that is passed to each + * invocation of callback. The return value of readcfgfile() is the + * value returned from the last callback, or zero if no arguments were + * found, or -1 if an error occurred while reading the file. + * + * The function will ignore lines that contain only whitespace, or + * lines that begin with a hash sign. All other lines should be of the + * form "OPTION=VALUE", where OPTION is one of the long options in + * list. Whitespace around the equal sign is permitted. An option that + * takes no arguments can either have a VALUE of 0 or 1, or omit the + * "=VALUE" entirely. (A VALUE of 0 will behave the same as if the + * line was not present.) + */ +extern int readcfgfile(option const* list, FILE *fp, + int (*callback)(int opt, char const *val, void *data), + void *data); + + +/* Create an argc-argv pair from a string containing a command line. + * cmdline is the string to be parsed. argcp points to the variable to + * receive the argc value, and argvp points to the variable to receive + * the argv value. argvp can be NULL if the caller just wants to get + * argc. Zero is returned on failure. This function allocates memory + * on behalf of the caller. The memory is allocated as a single block, + * so it is sufficient to simply free() the pointer returned through + * argvp. Note that argv[0] will always be initialized to NULL; the + * first argument will be stored in argv[1]. The string is parsed by + * separating arguments on whitespace boundaries. 
Space within + * substrings enclosed in single-quotes is ignored. A substring + * enclosed in double-quotes is treated the same, except that the + * backslash is recognized as an escape character within such a + * substring. Enclosing quotes and escaping backslashes are not copied + * into the argv values. + */ +extern int makecmdline(char const *cmdline, int *argcp, char ***argvp); + +#endif diff --git a/md2html/entity.c b/md2html/entity.c new file mode 100644 index 0000000..9991ca1 --- /dev/null +++ b/md2html/entity.c @@ -0,0 +1,2190 @@ +/* + * MD4C: Markdown parser for C + * (http://github.com/mity/md4c) + * + * Copyright (c) 2016-2017 Martin Mitas + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "entity.h" +#include + + +/* The table is generated from https://html.spec.whatwg.org/entities.json */ +static const struct entity entity_table[] = { + { "Æ", { 198, 0 } }, + { "&", { 38, 0 } }, + { "Á", { 193, 0 } }, + { "Ă", { 258, 0 } }, + { "Â", { 194, 0 } }, + { "А", { 1040, 0 } }, + { "𝔄", { 120068, 0 } }, + { "À", { 192, 0 } }, + { "Α", { 913, 0 } }, + { "Ā", { 256, 0 } }, + { "⩓", { 10835, 0 } }, + { "Ą", { 260, 0 } }, + { "𝔸", { 120120, 0 } }, + { "⁡", { 8289, 0 } }, + { "Å", { 197, 0 } }, + { "𝒜", { 119964, 0 } }, + { "≔", { 8788, 0 } }, + { "Ã", { 195, 0 } }, + { "Ä", { 196, 0 } }, + { "∖", { 8726, 0 } }, + { "⫧", { 10983, 0 } }, + { "⌆", { 8966, 0 } }, + { "Б", { 1041, 0 } }, + { "∵", { 8757, 0 } }, + { "ℬ", { 8492, 0 } }, + { "Β", { 914, 0 } }, + { "𝔅", { 120069, 0 } }, + { "𝔹", { 120121, 0 } }, + { "˘", { 728, 0 } }, + { "ℬ", { 8492, 0 } }, + { "≎", { 8782, 0 } }, + { "Ч", { 1063, 0 } }, + { "©", { 169, 0 } }, + { "Ć", { 262, 0 } }, + { "⋒", { 8914, 0 } }, + { "ⅅ", { 8517, 0 } }, + { "ℭ", { 8493, 0 } }, + { "Č", { 268, 0 } }, + { "Ç", { 199, 0 } }, + { "Ĉ", { 264, 0 } }, + { "∰", { 8752, 0 } }, + { "Ċ", { 266, 0 } }, + { "¸", { 184, 0 } }, + { "·", { 183, 0 } }, + { "ℭ", { 8493, 0 } }, + { "Χ", { 935, 0 } }, + { "⊙", { 8857, 0 } }, + { "⊖", { 8854, 0 } }, + { "⊕", { 8853, 0 } }, + { "⊗", { 8855, 0 } }, + { "∲", { 8754, 0 } }, + { "”", { 8221, 0 } }, + { "’", { 8217, 0 } }, + { "∷", { 8759, 0 } }, + { "⩴", { 10868, 0 } }, + { "≡", { 8801, 0 } }, + { "∯", { 8751, 0 } }, + { "∮", { 8750, 0 } }, + { "ℂ", { 8450, 0 } }, + { "∐", { 8720, 0 } }, + { "∳", { 8755, 0 } }, + { "⨯", { 10799, 0 } }, + { "𝒞", { 119966, 0 } }, + { "⋓", { 8915, 0 } }, + { "≍", { 8781, 0 } }, + { "ⅅ", { 8517, 0 } }, + { "⤑", { 10513, 0 } }, + { "Ђ", { 1026, 0 } }, + { "Ѕ", { 1029, 0 } }, + { "Џ", { 1039, 0 } }, + { "‡", { 8225, 0 } }, + { "↡", { 8609, 0 } }, + { "⫤", { 10980, 0 } }, + { "Ď", { 270, 0 } }, + { "Д", { 1044, 0 } }, + { "∇", { 8711, 0 } }, + { "Δ", { 916, 
0 } }, + { "𝔇", { 120071, 0 } }, + { "´", { 180, 0 } }, + { "˙", { 729, 0 } }, + { "˝", { 733, 0 } }, + { "`", { 96, 0 } }, + { "˜", { 732, 0 } }, + { "⋄", { 8900, 0 } }, + { "ⅆ", { 8518, 0 } }, + { "𝔻", { 120123, 0 } }, + { "¨", { 168, 0 } }, + { "⃜", { 8412, 0 } }, + { "≐", { 8784, 0 } }, + { "∯", { 8751, 0 } }, + { "¨", { 168, 0 } }, + { "⇓", { 8659, 0 } }, + { "⇐", { 8656, 0 } }, + { "⇔", { 8660, 0 } }, + { "⫤", { 10980, 0 } }, + { "⟸", { 10232, 0 } }, + { "⟺", { 10234, 0 } }, + { "⟹", { 10233, 0 } }, + { "⇒", { 8658, 0 } }, + { "⊨", { 8872, 0 } }, + { "⇑", { 8657, 0 } }, + { "⇕", { 8661, 0 } }, + { "∥", { 8741, 0 } }, + { "↓", { 8595, 0 } }, + { "⤓", { 10515, 0 } }, + { "⇵", { 8693, 0 } }, + { "̑", { 785, 0 } }, + { "⥐", { 10576, 0 } }, + { "⥞", { 10590, 0 } }, + { "↽", { 8637, 0 } }, + { "⥖", { 10582, 0 } }, + { "⥟", { 10591, 0 } }, + { "⇁", { 8641, 0 } }, + { "⥗", { 10583, 0 } }, + { "⊤", { 8868, 0 } }, + { "↧", { 8615, 0 } }, + { "⇓", { 8659, 0 } }, + { "𝒟", { 119967, 0 } }, + { "Đ", { 272, 0 } }, + { "Ŋ", { 330, 0 } }, + { "Ð", { 208, 0 } }, + { "É", { 201, 0 } }, + { "Ě", { 282, 0 } }, + { "Ê", { 202, 0 } }, + { "Э", { 1069, 0 } }, + { "Ė", { 278, 0 } }, + { "𝔈", { 120072, 0 } }, + { "È", { 200, 0 } }, + { "∈", { 8712, 0 } }, + { "Ē", { 274, 0 } }, + { "◻", { 9723, 0 } }, + { "▫", { 9643, 0 } }, + { "Ę", { 280, 0 } }, + { "𝔼", { 120124, 0 } }, + { "Ε", { 917, 0 } }, + { "⩵", { 10869, 0 } }, + { "≂", { 8770, 0 } }, + { "⇌", { 8652, 0 } }, + { "ℰ", { 8496, 0 } }, + { "⩳", { 10867, 0 } }, + { "Η", { 919, 0 } }, + { "Ë", { 203, 0 } }, + { "∃", { 8707, 0 } }, + { "ⅇ", { 8519, 0 } }, + { "Ф", { 1060, 0 } }, + { "𝔉", { 120073, 0 } }, + { "◼", { 9724, 0 } }, + { "▪", { 9642, 0 } }, + { "𝔽", { 120125, 0 } }, + { "∀", { 8704, 0 } }, + { "ℱ", { 8497, 0 } }, + { "ℱ", { 8497, 0 } }, + { "Ѓ", { 1027, 0 } }, + { ">", { 62, 0 } }, + { "Γ", { 915, 0 } }, + { "Ϝ", { 988, 0 } }, + { "Ğ", { 286, 0 } }, + { "Ģ", { 290, 0 } }, + { "Ĝ", { 284, 0 } }, + { "Г", { 1043, 0 } }, + { 
"Ġ", { 288, 0 } }, + { "𝔊", { 120074, 0 } }, + { "⋙", { 8921, 0 } }, + { "𝔾", { 120126, 0 } }, + { "≥", { 8805, 0 } }, + { "⋛", { 8923, 0 } }, + { "≧", { 8807, 0 } }, + { "⪢", { 10914, 0 } }, + { "≷", { 8823, 0 } }, + { "⩾", { 10878, 0 } }, + { "≳", { 8819, 0 } }, + { "𝒢", { 119970, 0 } }, + { "≫", { 8811, 0 } }, + { "Ъ", { 1066, 0 } }, + { "ˇ", { 711, 0 } }, + { "^", { 94, 0 } }, + { "Ĥ", { 292, 0 } }, + { "ℌ", { 8460, 0 } }, + { "ℋ", { 8459, 0 } }, + { "ℍ", { 8461, 0 } }, + { "─", { 9472, 0 } }, + { "ℋ", { 8459, 0 } }, + { "Ħ", { 294, 0 } }, + { "≎", { 8782, 0 } }, + { "≏", { 8783, 0 } }, + { "Е", { 1045, 0 } }, + { "IJ", { 306, 0 } }, + { "Ё", { 1025, 0 } }, + { "Í", { 205, 0 } }, + { "Î", { 206, 0 } }, + { "И", { 1048, 0 } }, + { "İ", { 304, 0 } }, + { "ℑ", { 8465, 0 } }, + { "Ì", { 204, 0 } }, + { "ℑ", { 8465, 0 } }, + { "Ī", { 298, 0 } }, + { "ⅈ", { 8520, 0 } }, + { "⇒", { 8658, 0 } }, + { "∬", { 8748, 0 } }, + { "∫", { 8747, 0 } }, + { "⋂", { 8898, 0 } }, + { "⁣", { 8291, 0 } }, + { "⁢", { 8290, 0 } }, + { "Į", { 302, 0 } }, + { "𝕀", { 120128, 0 } }, + { "Ι", { 921, 0 } }, + { "ℐ", { 8464, 0 } }, + { "Ĩ", { 296, 0 } }, + { "І", { 1030, 0 } }, + { "Ï", { 207, 0 } }, + { "Ĵ", { 308, 0 } }, + { "Й", { 1049, 0 } }, + { "𝔍", { 120077, 0 } }, + { "𝕁", { 120129, 0 } }, + { "𝒥", { 119973, 0 } }, + { "Ј", { 1032, 0 } }, + { "Є", { 1028, 0 } }, + { "Х", { 1061, 0 } }, + { "Ќ", { 1036, 0 } }, + { "Κ", { 922, 0 } }, + { "Ķ", { 310, 0 } }, + { "К", { 1050, 0 } }, + { "𝔎", { 120078, 0 } }, + { "𝕂", { 120130, 0 } }, + { "𝒦", { 119974, 0 } }, + { "Љ", { 1033, 0 } }, + { "<", { 60, 0 } }, + { "Ĺ", { 313, 0 } }, + { "Λ", { 923, 0 } }, + { "⟪", { 10218, 0 } }, + { "ℒ", { 8466, 0 } }, + { "↞", { 8606, 0 } }, + { "Ľ", { 317, 0 } }, + { "Ļ", { 315, 0 } }, + { "Л", { 1051, 0 } }, + { "⟨", { 10216, 0 } }, + { "←", { 8592, 0 } }, + { "⇤", { 8676, 0 } }, + { "⇆", { 8646, 0 } }, + { "⌈", { 8968, 0 } }, + { "⟦", { 10214, 0 } }, + { "⥡", { 10593, 0 } }, + { "⇃", { 8643, 0 } }, + { "⥙", 
{ 10585, 0 } }, + { "⌊", { 8970, 0 } }, + { "↔", { 8596, 0 } }, + { "⥎", { 10574, 0 } }, + { "⊣", { 8867, 0 } }, + { "↤", { 8612, 0 } }, + { "⥚", { 10586, 0 } }, + { "⊲", { 8882, 0 } }, + { "⧏", { 10703, 0 } }, + { "⊴", { 8884, 0 } }, + { "⥑", { 10577, 0 } }, + { "⥠", { 10592, 0 } }, + { "↿", { 8639, 0 } }, + { "⥘", { 10584, 0 } }, + { "↼", { 8636, 0 } }, + { "⥒", { 10578, 0 } }, + { "⇐", { 8656, 0 } }, + { "⇔", { 8660, 0 } }, + { "⋚", { 8922, 0 } }, + { "≦", { 8806, 0 } }, + { "≶", { 8822, 0 } }, + { "⪡", { 10913, 0 } }, + { "⩽", { 10877, 0 } }, + { "≲", { 8818, 0 } }, + { "𝔏", { 120079, 0 } }, + { "⋘", { 8920, 0 } }, + { "⇚", { 8666, 0 } }, + { "Ŀ", { 319, 0 } }, + { "⟵", { 10229, 0 } }, + { "⟷", { 10231, 0 } }, + { "⟶", { 10230, 0 } }, + { "⟸", { 10232, 0 } }, + { "⟺", { 10234, 0 } }, + { "⟹", { 10233, 0 } }, + { "𝕃", { 120131, 0 } }, + { "↙", { 8601, 0 } }, + { "↘", { 8600, 0 } }, + { "ℒ", { 8466, 0 } }, + { "↰", { 8624, 0 } }, + { "Ł", { 321, 0 } }, + { "≪", { 8810, 0 } }, + { "⤅", { 10501, 0 } }, + { "М", { 1052, 0 } }, + { " ", { 8287, 0 } }, + { "ℳ", { 8499, 0 } }, + { "𝔐", { 120080, 0 } }, + { "∓", { 8723, 0 } }, + { "𝕄", { 120132, 0 } }, + { "ℳ", { 8499, 0 } }, + { "Μ", { 924, 0 } }, + { "Њ", { 1034, 0 } }, + { "Ń", { 323, 0 } }, + { "Ň", { 327, 0 } }, + { "Ņ", { 325, 0 } }, + { "Н", { 1053, 0 } }, + { "​", { 8203, 0 } }, + { "​", { 8203, 0 } }, + { "​", { 8203, 0 } }, + { "​", { 8203, 0 } }, + { "≫", { 8811, 0 } }, + { "≪", { 8810, 0 } }, + { " ", { 10, 0 } }, + { "𝔑", { 120081, 0 } }, + { "⁠", { 8288, 0 } }, + { " ", { 160, 0 } }, + { "ℕ", { 8469, 0 } }, + { "⫬", { 10988, 0 } }, + { "≢", { 8802, 0 } }, + { "≭", { 8813, 0 } }, + { "∦", { 8742, 0 } }, + { "∉", { 8713, 0 } }, + { "≠", { 8800, 0 } }, + { "≂̸", { 8770, 824 } }, + { "∄", { 8708, 0 } }, + { "≯", { 8815, 0 } }, + { "≱", { 8817, 0 } }, + { "≧̸", { 8807, 824 } }, + { "≫̸", { 8811, 824 } }, + { "≹", { 8825, 0 } }, + { "⩾̸", { 10878, 824 } }, + { "≵", { 8821, 0 } }, + { "≎̸", { 8782, 824 } }, + { 
"≏̸", { 8783, 824 } }, + { "⋪", { 8938, 0 } }, + { "⧏̸", { 10703, 824 } }, + { "⋬", { 8940, 0 } }, + { "≮", { 8814, 0 } }, + { "≰", { 8816, 0 } }, + { "≸", { 8824, 0 } }, + { "≪̸", { 8810, 824 } }, + { "⩽̸", { 10877, 824 } }, + { "≴", { 8820, 0 } }, + { "⪢̸", { 10914, 824 } }, + { "⪡̸", { 10913, 824 } }, + { "⊀", { 8832, 0 } }, + { "⪯̸", { 10927, 824 } }, + { "⋠", { 8928, 0 } }, + { "∌", { 8716, 0 } }, + { "⋫", { 8939, 0 } }, + { "⧐̸", { 10704, 824 } }, + { "⋭", { 8941, 0 } }, + { "⊏̸", { 8847, 824 } }, + { "⋢", { 8930, 0 } }, + { "⊐̸", { 8848, 824 } }, + { "⋣", { 8931, 0 } }, + { "⊂⃒", { 8834, 8402 } }, + { "⊈", { 8840, 0 } }, + { "⊁", { 8833, 0 } }, + { "⪰̸", { 10928, 824 } }, + { "⋡", { 8929, 0 } }, + { "≿̸", { 8831, 824 } }, + { "⊃⃒", { 8835, 8402 } }, + { "⊉", { 8841, 0 } }, + { "≁", { 8769, 0 } }, + { "≄", { 8772, 0 } }, + { "≇", { 8775, 0 } }, + { "≉", { 8777, 0 } }, + { "∤", { 8740, 0 } }, + { "𝒩", { 119977, 0 } }, + { "Ñ", { 209, 0 } }, + { "Ν", { 925, 0 } }, + { "Œ", { 338, 0 } }, + { "Ó", { 211, 0 } }, + { "Ô", { 212, 0 } }, + { "О", { 1054, 0 } }, + { "Ő", { 336, 0 } }, + { "𝔒", { 120082, 0 } }, + { "Ò", { 210, 0 } }, + { "Ō", { 332, 0 } }, + { "Ω", { 937, 0 } }, + { "Ο", { 927, 0 } }, + { "𝕆", { 120134, 0 } }, + { "“", { 8220, 0 } }, + { "‘", { 8216, 0 } }, + { "⩔", { 10836, 0 } }, + { "𝒪", { 119978, 0 } }, + { "Ø", { 216, 0 } }, + { "Õ", { 213, 0 } }, + { "⨷", { 10807, 0 } }, + { "Ö", { 214, 0 } }, + { "‾", { 8254, 0 } }, + { "⏞", { 9182, 0 } }, + { "⎴", { 9140, 0 } }, + { "⏜", { 9180, 0 } }, + { "∂", { 8706, 0 } }, + { "П", { 1055, 0 } }, + { "𝔓", { 120083, 0 } }, + { "Φ", { 934, 0 } }, + { "Π", { 928, 0 } }, + { "±", { 177, 0 } }, + { "ℌ", { 8460, 0 } }, + { "ℙ", { 8473, 0 } }, + { "⪻", { 10939, 0 } }, + { "≺", { 8826, 0 } }, + { "⪯", { 10927, 0 } }, + { "≼", { 8828, 0 } }, + { "≾", { 8830, 0 } }, + { "″", { 8243, 0 } }, + { "∏", { 8719, 0 } }, + { "∷", { 8759, 0 } }, + { "∝", { 8733, 0 } }, + { "𝒫", { 119979, 0 } }, + { "Ψ", { 936, 0 } }, + { """, 
{ 34, 0 } }, + { "𝔔", { 120084, 0 } }, + { "ℚ", { 8474, 0 } }, + { "𝒬", { 119980, 0 } }, + { "⤐", { 10512, 0 } }, + { "®", { 174, 0 } }, + { "Ŕ", { 340, 0 } }, + { "⟫", { 10219, 0 } }, + { "↠", { 8608, 0 } }, + { "⤖", { 10518, 0 } }, + { "Ř", { 344, 0 } }, + { "Ŗ", { 342, 0 } }, + { "Р", { 1056, 0 } }, + { "ℜ", { 8476, 0 } }, + { "∋", { 8715, 0 } }, + { "⇋", { 8651, 0 } }, + { "⥯", { 10607, 0 } }, + { "ℜ", { 8476, 0 } }, + { "Ρ", { 929, 0 } }, + { "⟩", { 10217, 0 } }, + { "→", { 8594, 0 } }, + { "⇥", { 8677, 0 } }, + { "⇄", { 8644, 0 } }, + { "⌉", { 8969, 0 } }, + { "⟧", { 10215, 0 } }, + { "⥝", { 10589, 0 } }, + { "⇂", { 8642, 0 } }, + { "⥕", { 10581, 0 } }, + { "⌋", { 8971, 0 } }, + { "⊢", { 8866, 0 } }, + { "↦", { 8614, 0 } }, + { "⥛", { 10587, 0 } }, + { "⊳", { 8883, 0 } }, + { "⧐", { 10704, 0 } }, + { "⊵", { 8885, 0 } }, + { "⥏", { 10575, 0 } }, + { "⥜", { 10588, 0 } }, + { "↾", { 8638, 0 } }, + { "⥔", { 10580, 0 } }, + { "⇀", { 8640, 0 } }, + { "⥓", { 10579, 0 } }, + { "⇒", { 8658, 0 } }, + { "ℝ", { 8477, 0 } }, + { "⥰", { 10608, 0 } }, + { "⇛", { 8667, 0 } }, + { "ℛ", { 8475, 0 } }, + { "↱", { 8625, 0 } }, + { "⧴", { 10740, 0 } }, + { "Щ", { 1065, 0 } }, + { "Ш", { 1064, 0 } }, + { "Ь", { 1068, 0 } }, + { "Ś", { 346, 0 } }, + { "⪼", { 10940, 0 } }, + { "Š", { 352, 0 } }, + { "Ş", { 350, 0 } }, + { "Ŝ", { 348, 0 } }, + { "С", { 1057, 0 } }, + { "𝔖", { 120086, 0 } }, + { "↓", { 8595, 0 } }, + { "←", { 8592, 0 } }, + { "→", { 8594, 0 } }, + { "↑", { 8593, 0 } }, + { "Σ", { 931, 0 } }, + { "∘", { 8728, 0 } }, + { "𝕊", { 120138, 0 } }, + { "√", { 8730, 0 } }, + { "□", { 9633, 0 } }, + { "⊓", { 8851, 0 } }, + { "⊏", { 8847, 0 } }, + { "⊑", { 8849, 0 } }, + { "⊐", { 8848, 0 } }, + { "⊒", { 8850, 0 } }, + { "⊔", { 8852, 0 } }, + { "𝒮", { 119982, 0 } }, + { "⋆", { 8902, 0 } }, + { "⋐", { 8912, 0 } }, + { "⋐", { 8912, 0 } }, + { "⊆", { 8838, 0 } }, + { "≻", { 8827, 0 } }, + { "⪰", { 10928, 0 } }, + { "≽", { 8829, 0 } }, + { "≿", { 8831, 0 } }, + { "∋", { 8715, 0 } }, 
+ { "∑", { 8721, 0 } }, + { "⋑", { 8913, 0 } }, + { "⊃", { 8835, 0 } }, + { "⊇", { 8839, 0 } }, + { "⋑", { 8913, 0 } }, + { "Þ", { 222, 0 } }, + { "™", { 8482, 0 } }, + { "Ћ", { 1035, 0 } }, + { "Ц", { 1062, 0 } }, + { " ", { 9, 0 } }, + { "Τ", { 932, 0 } }, + { "Ť", { 356, 0 } }, + { "Ţ", { 354, 0 } }, + { "Т", { 1058, 0 } }, + { "𝔗", { 120087, 0 } }, + { "∴", { 8756, 0 } }, + { "Θ", { 920, 0 } }, + { "  ", { 8287, 8202 } }, + { " ", { 8201, 0 } }, + { "∼", { 8764, 0 } }, + { "≃", { 8771, 0 } }, + { "≅", { 8773, 0 } }, + { "≈", { 8776, 0 } }, + { "𝕋", { 120139, 0 } }, + { "⃛", { 8411, 0 } }, + { "𝒯", { 119983, 0 } }, + { "Ŧ", { 358, 0 } }, + { "Ú", { 218, 0 } }, + { "↟", { 8607, 0 } }, + { "⥉", { 10569, 0 } }, + { "Ў", { 1038, 0 } }, + { "Ŭ", { 364, 0 } }, + { "Û", { 219, 0 } }, + { "У", { 1059, 0 } }, + { "Ű", { 368, 0 } }, + { "𝔘", { 120088, 0 } }, + { "Ù", { 217, 0 } }, + { "Ū", { 362, 0 } }, + { "_", { 95, 0 } }, + { "⏟", { 9183, 0 } }, + { "⎵", { 9141, 0 } }, + { "⏝", { 9181, 0 } }, + { "⋃", { 8899, 0 } }, + { "⊎", { 8846, 0 } }, + { "Ų", { 370, 0 } }, + { "𝕌", { 120140, 0 } }, + { "↑", { 8593, 0 } }, + { "⤒", { 10514, 0 } }, + { "⇅", { 8645, 0 } }, + { "↕", { 8597, 0 } }, + { "⥮", { 10606, 0 } }, + { "⊥", { 8869, 0 } }, + { "↥", { 8613, 0 } }, + { "⇑", { 8657, 0 } }, + { "⇕", { 8661, 0 } }, + { "↖", { 8598, 0 } }, + { "↗", { 8599, 0 } }, + { "ϒ", { 978, 0 } }, + { "Υ", { 933, 0 } }, + { "Ů", { 366, 0 } }, + { "𝒰", { 119984, 0 } }, + { "Ũ", { 360, 0 } }, + { "Ü", { 220, 0 } }, + { "⊫", { 8875, 0 } }, + { "⫫", { 10987, 0 } }, + { "В", { 1042, 0 } }, + { "⊩", { 8873, 0 } }, + { "⫦", { 10982, 0 } }, + { "⋁", { 8897, 0 } }, + { "‖", { 8214, 0 } }, + { "‖", { 8214, 0 } }, + { "∣", { 8739, 0 } }, + { "|", { 124, 0 } }, + { "❘", { 10072, 0 } }, + { "≀", { 8768, 0 } }, + { " ", { 8202, 0 } }, + { "𝔙", { 120089, 0 } }, + { "𝕍", { 120141, 0 } }, + { "𝒱", { 119985, 0 } }, + { "⊪", { 8874, 0 } }, + { "Ŵ", { 372, 0 } }, + { "⋀", { 8896, 0 } }, + { "𝔚", { 120090, 0 } }, + 
{ "𝕎", { 120142, 0 } }, + { "𝒲", { 119986, 0 } }, + { "𝔛", { 120091, 0 } }, + { "Ξ", { 926, 0 } }, + { "𝕏", { 120143, 0 } }, + { "𝒳", { 119987, 0 } }, + { "Я", { 1071, 0 } }, + { "Ї", { 1031, 0 } }, + { "Ю", { 1070, 0 } }, + { "Ý", { 221, 0 } }, + { "Ŷ", { 374, 0 } }, + { "Ы", { 1067, 0 } }, + { "𝔜", { 120092, 0 } }, + { "𝕐", { 120144, 0 } }, + { "𝒴", { 119988, 0 } }, + { "Ÿ", { 376, 0 } }, + { "Ж", { 1046, 0 } }, + { "Ź", { 377, 0 } }, + { "Ž", { 381, 0 } }, + { "З", { 1047, 0 } }, + { "Ż", { 379, 0 } }, + { "​", { 8203, 0 } }, + { "Ζ", { 918, 0 } }, + { "ℨ", { 8488, 0 } }, + { "ℤ", { 8484, 0 } }, + { "𝒵", { 119989, 0 } }, + { "á", { 225, 0 } }, + { "ă", { 259, 0 } }, + { "∾", { 8766, 0 } }, + { "∾̳", { 8766, 819 } }, + { "∿", { 8767, 0 } }, + { "â", { 226, 0 } }, + { "´", { 180, 0 } }, + { "а", { 1072, 0 } }, + { "æ", { 230, 0 } }, + { "⁡", { 8289, 0 } }, + { "𝔞", { 120094, 0 } }, + { "à", { 224, 0 } }, + { "ℵ", { 8501, 0 } }, + { "ℵ", { 8501, 0 } }, + { "α", { 945, 0 } }, + { "ā", { 257, 0 } }, + { "⨿", { 10815, 0 } }, + { "&", { 38, 0 } }, + { "∧", { 8743, 0 } }, + { "⩕", { 10837, 0 } }, + { "⩜", { 10844, 0 } }, + { "⩘", { 10840, 0 } }, + { "⩚", { 10842, 0 } }, + { "∠", { 8736, 0 } }, + { "⦤", { 10660, 0 } }, + { "∠", { 8736, 0 } }, + { "∡", { 8737, 0 } }, + { "⦨", { 10664, 0 } }, + { "⦩", { 10665, 0 } }, + { "⦪", { 10666, 0 } }, + { "⦫", { 10667, 0 } }, + { "⦬", { 10668, 0 } }, + { "⦭", { 10669, 0 } }, + { "⦮", { 10670, 0 } }, + { "⦯", { 10671, 0 } }, + { "∟", { 8735, 0 } }, + { "⊾", { 8894, 0 } }, + { "⦝", { 10653, 0 } }, + { "∢", { 8738, 0 } }, + { "Å", { 197, 0 } }, + { "⍼", { 9084, 0 } }, + { "ą", { 261, 0 } }, + { "𝕒", { 120146, 0 } }, + { "≈", { 8776, 0 } }, + { "⩰", { 10864, 0 } }, + { "⩯", { 10863, 0 } }, + { "≊", { 8778, 0 } }, + { "≋", { 8779, 0 } }, + { "'", { 39, 0 } }, + { "≈", { 8776, 0 } }, + { "≊", { 8778, 0 } }, + { "å", { 229, 0 } }, + { "𝒶", { 119990, 0 } }, + { "*", { 42, 0 } }, + { "≈", { 8776, 0 } }, + { "≍", { 8781, 0 } }, + { "ã", { 
227, 0 } }, + { "ä", { 228, 0 } }, + { "∳", { 8755, 0 } }, + { "⨑", { 10769, 0 } }, + { "⫭", { 10989, 0 } }, + { "≌", { 8780, 0 } }, + { "϶", { 1014, 0 } }, + { "‵", { 8245, 0 } }, + { "∽", { 8765, 0 } }, + { "⋍", { 8909, 0 } }, + { "⊽", { 8893, 0 } }, + { "⌅", { 8965, 0 } }, + { "⌅", { 8965, 0 } }, + { "⎵", { 9141, 0 } }, + { "⎶", { 9142, 0 } }, + { "≌", { 8780, 0 } }, + { "б", { 1073, 0 } }, + { "„", { 8222, 0 } }, + { "∵", { 8757, 0 } }, + { "∵", { 8757, 0 } }, + { "⦰", { 10672, 0 } }, + { "϶", { 1014, 0 } }, + { "ℬ", { 8492, 0 } }, + { "β", { 946, 0 } }, + { "ℶ", { 8502, 0 } }, + { "≬", { 8812, 0 } }, + { "𝔟", { 120095, 0 } }, + { "⋂", { 8898, 0 } }, + { "◯", { 9711, 0 } }, + { "⋃", { 8899, 0 } }, + { "⨀", { 10752, 0 } }, + { "⨁", { 10753, 0 } }, + { "⨂", { 10754, 0 } }, + { "⨆", { 10758, 0 } }, + { "★", { 9733, 0 } }, + { "▽", { 9661, 0 } }, + { "△", { 9651, 0 } }, + { "⨄", { 10756, 0 } }, + { "⋁", { 8897, 0 } }, + { "⋀", { 8896, 0 } }, + { "⤍", { 10509, 0 } }, + { "⧫", { 10731, 0 } }, + { "▪", { 9642, 0 } }, + { "▴", { 9652, 0 } }, + { "▾", { 9662, 0 } }, + { "◂", { 9666, 0 } }, + { "▸", { 9656, 0 } }, + { "␣", { 9251, 0 } }, + { "▒", { 9618, 0 } }, + { "░", { 9617, 0 } }, + { "▓", { 9619, 0 } }, + { "█", { 9608, 0 } }, + { "=⃥", { 61, 8421 } }, + { "≡⃥", { 8801, 8421 } }, + { "⌐", { 8976, 0 } }, + { "𝕓", { 120147, 0 } }, + { "⊥", { 8869, 0 } }, + { "⊥", { 8869, 0 } }, + { "⋈", { 8904, 0 } }, + { "╗", { 9559, 0 } }, + { "╔", { 9556, 0 } }, + { "╖", { 9558, 0 } }, + { "╓", { 9555, 0 } }, + { "═", { 9552, 0 } }, + { "╦", { 9574, 0 } }, + { "╩", { 9577, 0 } }, + { "╤", { 9572, 0 } }, + { "╧", { 9575, 0 } }, + { "╝", { 9565, 0 } }, + { "╚", { 9562, 0 } }, + { "╜", { 9564, 0 } }, + { "╙", { 9561, 0 } }, + { "║", { 9553, 0 } }, + { "╬", { 9580, 0 } }, + { "╣", { 9571, 0 } }, + { "╠", { 9568, 0 } }, + { "╫", { 9579, 0 } }, + { "╢", { 9570, 0 } }, + { "╟", { 9567, 0 } }, + { "⧉", { 10697, 0 } }, + { "╕", { 9557, 0 } }, + { "╒", { 9554, 0 } }, + { "┐", { 9488, 0 } }, 
+ { "┌", { 9484, 0 } }, + { "─", { 9472, 0 } }, + { "╥", { 9573, 0 } }, + { "╨", { 9576, 0 } }, + { "┬", { 9516, 0 } }, + { "┴", { 9524, 0 } }, + { "⊟", { 8863, 0 } }, + { "⊞", { 8862, 0 } }, + { "⊠", { 8864, 0 } }, + { "╛", { 9563, 0 } }, + { "╘", { 9560, 0 } }, + { "┘", { 9496, 0 } }, + { "└", { 9492, 0 } }, + { "│", { 9474, 0 } }, + { "╪", { 9578, 0 } }, + { "╡", { 9569, 0 } }, + { "╞", { 9566, 0 } }, + { "┼", { 9532, 0 } }, + { "┤", { 9508, 0 } }, + { "├", { 9500, 0 } }, + { "‵", { 8245, 0 } }, + { "˘", { 728, 0 } }, + { "¦", { 166, 0 } }, + { "𝒷", { 119991, 0 } }, + { "⁏", { 8271, 0 } }, + { "∽", { 8765, 0 } }, + { "⋍", { 8909, 0 } }, + { "\", { 92, 0 } }, + { "⧅", { 10693, 0 } }, + { "⟈", { 10184, 0 } }, + { "•", { 8226, 0 } }, + { "•", { 8226, 0 } }, + { "≎", { 8782, 0 } }, + { "⪮", { 10926, 0 } }, + { "≏", { 8783, 0 } }, + { "≏", { 8783, 0 } }, + { "ć", { 263, 0 } }, + { "∩", { 8745, 0 } }, + { "⩄", { 10820, 0 } }, + { "⩉", { 10825, 0 } }, + { "⩋", { 10827, 0 } }, + { "⩇", { 10823, 0 } }, + { "⩀", { 10816, 0 } }, + { "∩︀", { 8745, 65024 } }, + { "⁁", { 8257, 0 } }, + { "ˇ", { 711, 0 } }, + { "⩍", { 10829, 0 } }, + { "č", { 269, 0 } }, + { "ç", { 231, 0 } }, + { "ĉ", { 265, 0 } }, + { "⩌", { 10828, 0 } }, + { "⩐", { 10832, 0 } }, + { "ċ", { 267, 0 } }, + { "¸", { 184, 0 } }, + { "⦲", { 10674, 0 } }, + { "¢", { 162, 0 } }, + { "·", { 183, 0 } }, + { "𝔠", { 120096, 0 } }, + { "ч", { 1095, 0 } }, + { "✓", { 10003, 0 } }, + { "✓", { 10003, 0 } }, + { "χ", { 967, 0 } }, + { "○", { 9675, 0 } }, + { "⧃", { 10691, 0 } }, + { "ˆ", { 710, 0 } }, + { "≗", { 8791, 0 } }, + { "↺", { 8634, 0 } }, + { "↻", { 8635, 0 } }, + { "®", { 174, 0 } }, + { "Ⓢ", { 9416, 0 } }, + { "⊛", { 8859, 0 } }, + { "⊚", { 8858, 0 } }, + { "⊝", { 8861, 0 } }, + { "≗", { 8791, 0 } }, + { "⨐", { 10768, 0 } }, + { "⫯", { 10991, 0 } }, + { "⧂", { 10690, 0 } }, + { "♣", { 9827, 0 } }, + { "♣", { 9827, 0 } }, + { ":", { 58, 0 } }, + { "≔", { 8788, 0 } }, + { "≔", { 8788, 0 } }, + { ",", { 44, 0 } }, 
+ { "@", { 64, 0 } }, + { "∁", { 8705, 0 } }, + { "∘", { 8728, 0 } }, + { "∁", { 8705, 0 } }, + { "ℂ", { 8450, 0 } }, + { "≅", { 8773, 0 } }, + { "⩭", { 10861, 0 } }, + { "∮", { 8750, 0 } }, + { "𝕔", { 120148, 0 } }, + { "∐", { 8720, 0 } }, + { "©", { 169, 0 } }, + { "℗", { 8471, 0 } }, + { "↵", { 8629, 0 } }, + { "✗", { 10007, 0 } }, + { "𝒸", { 119992, 0 } }, + { "⫏", { 10959, 0 } }, + { "⫑", { 10961, 0 } }, + { "⫐", { 10960, 0 } }, + { "⫒", { 10962, 0 } }, + { "⋯", { 8943, 0 } }, + { "⤸", { 10552, 0 } }, + { "⤵", { 10549, 0 } }, + { "⋞", { 8926, 0 } }, + { "⋟", { 8927, 0 } }, + { "↶", { 8630, 0 } }, + { "⤽", { 10557, 0 } }, + { "∪", { 8746, 0 } }, + { "⩈", { 10824, 0 } }, + { "⩆", { 10822, 0 } }, + { "⩊", { 10826, 0 } }, + { "⊍", { 8845, 0 } }, + { "⩅", { 10821, 0 } }, + { "∪︀", { 8746, 65024 } }, + { "↷", { 8631, 0 } }, + { "⤼", { 10556, 0 } }, + { "⋞", { 8926, 0 } }, + { "⋟", { 8927, 0 } }, + { "⋎", { 8910, 0 } }, + { "⋏", { 8911, 0 } }, + { "¤", { 164, 0 } }, + { "↶", { 8630, 0 } }, + { "↷", { 8631, 0 } }, + { "⋎", { 8910, 0 } }, + { "⋏", { 8911, 0 } }, + { "∲", { 8754, 0 } }, + { "∱", { 8753, 0 } }, + { "⌭", { 9005, 0 } }, + { "⇓", { 8659, 0 } }, + { "⥥", { 10597, 0 } }, + { "†", { 8224, 0 } }, + { "ℸ", { 8504, 0 } }, + { "↓", { 8595, 0 } }, + { "‐", { 8208, 0 } }, + { "⊣", { 8867, 0 } }, + { "⤏", { 10511, 0 } }, + { "˝", { 733, 0 } }, + { "ď", { 271, 0 } }, + { "д", { 1076, 0 } }, + { "ⅆ", { 8518, 0 } }, + { "‡", { 8225, 0 } }, + { "⇊", { 8650, 0 } }, + { "⩷", { 10871, 0 } }, + { "°", { 176, 0 } }, + { "δ", { 948, 0 } }, + { "⦱", { 10673, 0 } }, + { "⥿", { 10623, 0 } }, + { "𝔡", { 120097, 0 } }, + { "⇃", { 8643, 0 } }, + { "⇂", { 8642, 0 } }, + { "⋄", { 8900, 0 } }, + { "⋄", { 8900, 0 } }, + { "♦", { 9830, 0 } }, + { "♦", { 9830, 0 } }, + { "¨", { 168, 0 } }, + { "ϝ", { 989, 0 } }, + { "⋲", { 8946, 0 } }, + { "÷", { 247, 0 } }, + { "÷", { 247, 0 } }, + { "⋇", { 8903, 0 } }, + { "⋇", { 8903, 0 } }, + { "ђ", { 1106, 0 } }, + { "⌞", { 8990, 0 } }, + { "⌍", { 
8973, 0 } }, + { "$", { 36, 0 } }, + { "𝕕", { 120149, 0 } }, + { "˙", { 729, 0 } }, + { "≐", { 8784, 0 } }, + { "≑", { 8785, 0 } }, + { "∸", { 8760, 0 } }, + { "∔", { 8724, 0 } }, + { "⊡", { 8865, 0 } }, + { "⌆", { 8966, 0 } }, + { "↓", { 8595, 0 } }, + { "⇊", { 8650, 0 } }, + { "⇃", { 8643, 0 } }, + { "⇂", { 8642, 0 } }, + { "⤐", { 10512, 0 } }, + { "⌟", { 8991, 0 } }, + { "⌌", { 8972, 0 } }, + { "𝒹", { 119993, 0 } }, + { "ѕ", { 1109, 0 } }, + { "⧶", { 10742, 0 } }, + { "đ", { 273, 0 } }, + { "⋱", { 8945, 0 } }, + { "▿", { 9663, 0 } }, + { "▾", { 9662, 0 } }, + { "⇵", { 8693, 0 } }, + { "⥯", { 10607, 0 } }, + { "⦦", { 10662, 0 } }, + { "џ", { 1119, 0 } }, + { "⟿", { 10239, 0 } }, + { "⩷", { 10871, 0 } }, + { "≑", { 8785, 0 } }, + { "é", { 233, 0 } }, + { "⩮", { 10862, 0 } }, + { "ě", { 283, 0 } }, + { "≖", { 8790, 0 } }, + { "ê", { 234, 0 } }, + { "≕", { 8789, 0 } }, + { "э", { 1101, 0 } }, + { "ė", { 279, 0 } }, + { "ⅇ", { 8519, 0 } }, + { "≒", { 8786, 0 } }, + { "𝔢", { 120098, 0 } }, + { "⪚", { 10906, 0 } }, + { "è", { 232, 0 } }, + { "⪖", { 10902, 0 } }, + { "⪘", { 10904, 0 } }, + { "⪙", { 10905, 0 } }, + { "⏧", { 9191, 0 } }, + { "ℓ", { 8467, 0 } }, + { "⪕", { 10901, 0 } }, + { "⪗", { 10903, 0 } }, + { "ē", { 275, 0 } }, + { "∅", { 8709, 0 } }, + { "∅", { 8709, 0 } }, + { "∅", { 8709, 0 } }, + { " ", { 8196, 0 } }, + { " ", { 8197, 0 } }, + { " ", { 8195, 0 } }, + { "ŋ", { 331, 0 } }, + { " ", { 8194, 0 } }, + { "ę", { 281, 0 } }, + { "𝕖", { 120150, 0 } }, + { "⋕", { 8917, 0 } }, + { "⧣", { 10723, 0 } }, + { "⩱", { 10865, 0 } }, + { "ε", { 949, 0 } }, + { "ε", { 949, 0 } }, + { "ϵ", { 1013, 0 } }, + { "≖", { 8790, 0 } }, + { "≕", { 8789, 0 } }, + { "≂", { 8770, 0 } }, + { "⪖", { 10902, 0 } }, + { "⪕", { 10901, 0 } }, + { "=", { 61, 0 } }, + { "≟", { 8799, 0 } }, + { "≡", { 8801, 0 } }, + { "⩸", { 10872, 0 } }, + { "⧥", { 10725, 0 } }, + { "≓", { 8787, 0 } }, + { "⥱", { 10609, 0 } }, + { "ℯ", { 8495, 0 } }, + { "≐", { 8784, 0 } }, + { "≂", { 8770, 0 } }, + { 
"η", { 951, 0 } }, + { "ð", { 240, 0 } }, + { "ë", { 235, 0 } }, + { "€", { 8364, 0 } }, + { "!", { 33, 0 } }, + { "∃", { 8707, 0 } }, + { "ℰ", { 8496, 0 } }, + { "ⅇ", { 8519, 0 } }, + { "≒", { 8786, 0 } }, + { "ф", { 1092, 0 } }, + { "♀", { 9792, 0 } }, + { "ffi", { 64259, 0 } }, + { "ff", { 64256, 0 } }, + { "ffl", { 64260, 0 } }, + { "𝔣", { 120099, 0 } }, + { "fi", { 64257, 0 } }, + { "fj", { 102, 106 } }, + { "♭", { 9837, 0 } }, + { "fl", { 64258, 0 } }, + { "▱", { 9649, 0 } }, + { "ƒ", { 402, 0 } }, + { "𝕗", { 120151, 0 } }, + { "∀", { 8704, 0 } }, + { "⋔", { 8916, 0 } }, + { "⫙", { 10969, 0 } }, + { "⨍", { 10765, 0 } }, + { "½", { 189, 0 } }, + { "½", { 189, 0 } }, + { "⅓", { 8531, 0 } }, + { "¼", { 188, 0 } }, + { "¼", { 188, 0 } }, + { "⅕", { 8533, 0 } }, + { "⅙", { 8537, 0 } }, + { "⅛", { 8539, 0 } }, + { "⅔", { 8532, 0 } }, + { "⅖", { 8534, 0 } }, + { "¾", { 190, 0 } }, + { "¾", { 190, 0 } }, + { "⅗", { 8535, 0 } }, + { "⅜", { 8540, 0 } }, + { "⅘", { 8536, 0 } }, + { "⅚", { 8538, 0 } }, + { "⅝", { 8541, 0 } }, + { "⅞", { 8542, 0 } }, + { "⁄", { 8260, 0 } }, + { "⌢", { 8994, 0 } }, + { "𝒻", { 119995, 0 } }, + { "≧", { 8807, 0 } }, + { "⪌", { 10892, 0 } }, + { "ǵ", { 501, 0 } }, + { "γ", { 947, 0 } }, + { "ϝ", { 989, 0 } }, + { "⪆", { 10886, 0 } }, + { "ğ", { 287, 0 } }, + { "ĝ", { 285, 0 } }, + { "г", { 1075, 0 } }, + { "ġ", { 289, 0 } }, + { "≥", { 8805, 0 } }, + { "⋛", { 8923, 0 } }, + { "≥", { 8805, 0 } }, + { "≧", { 8807, 0 } }, + { "⩾", { 10878, 0 } }, + { "⩾", { 10878, 0 } }, + { "⪩", { 10921, 0 } }, + { "⪀", { 10880, 0 } }, + { "⪂", { 10882, 0 } }, + { "⪄", { 10884, 0 } }, + { "⋛︀", { 8923, 65024 } }, + { "⪔", { 10900, 0 } }, + { "𝔤", { 120100, 0 } }, + { "≫", { 8811, 0 } }, + { "⋙", { 8921, 0 } }, + { "ℷ", { 8503, 0 } }, + { "ѓ", { 1107, 0 } }, + { "≷", { 8823, 0 } }, + { "⪒", { 10898, 0 } }, + { "⪥", { 10917, 0 } }, + { "⪤", { 10916, 0 } }, + { "≩", { 8809, 0 } }, + { "⪊", { 10890, 0 } }, + { "⪊", { 10890, 0 } }, + { "⪈", { 10888, 0 } }, + { "⪈", 
{ 10888, 0 } }, + { "≩", { 8809, 0 } }, + { "⋧", { 8935, 0 } }, + { "𝕘", { 120152, 0 } }, + { "`", { 96, 0 } }, + { "ℊ", { 8458, 0 } }, + { "≳", { 8819, 0 } }, + { "⪎", { 10894, 0 } }, + { "⪐", { 10896, 0 } }, + { ">", { 62, 0 } }, + { "⪧", { 10919, 0 } }, + { "⩺", { 10874, 0 } }, + { "⋗", { 8919, 0 } }, + { "⦕", { 10645, 0 } }, + { "⩼", { 10876, 0 } }, + { "⪆", { 10886, 0 } }, + { "⥸", { 10616, 0 } }, + { "⋗", { 8919, 0 } }, + { "⋛", { 8923, 0 } }, + { "⪌", { 10892, 0 } }, + { "≷", { 8823, 0 } }, + { "≳", { 8819, 0 } }, + { "≩︀", { 8809, 65024 } }, + { "≩︀", { 8809, 65024 } }, + { "⇔", { 8660, 0 } }, + { " ", { 8202, 0 } }, + { "½", { 189, 0 } }, + { "ℋ", { 8459, 0 } }, + { "ъ", { 1098, 0 } }, + { "↔", { 8596, 0 } }, + { "⥈", { 10568, 0 } }, + { "↭", { 8621, 0 } }, + { "ℏ", { 8463, 0 } }, + { "ĥ", { 293, 0 } }, + { "♥", { 9829, 0 } }, + { "♥", { 9829, 0 } }, + { "…", { 8230, 0 } }, + { "⊹", { 8889, 0 } }, + { "𝔥", { 120101, 0 } }, + { "⤥", { 10533, 0 } }, + { "⤦", { 10534, 0 } }, + { "⇿", { 8703, 0 } }, + { "∻", { 8763, 0 } }, + { "↩", { 8617, 0 } }, + { "↪", { 8618, 0 } }, + { "𝕙", { 120153, 0 } }, + { "―", { 8213, 0 } }, + { "𝒽", { 119997, 0 } }, + { "ℏ", { 8463, 0 } }, + { "ħ", { 295, 0 } }, + { "⁃", { 8259, 0 } }, + { "‐", { 8208, 0 } }, + { "í", { 237, 0 } }, + { "⁣", { 8291, 0 } }, + { "î", { 238, 0 } }, + { "и", { 1080, 0 } }, + { "е", { 1077, 0 } }, + { "¡", { 161, 0 } }, + { "⇔", { 8660, 0 } }, + { "𝔦", { 120102, 0 } }, + { "ì", { 236, 0 } }, + { "ⅈ", { 8520, 0 } }, + { "⨌", { 10764, 0 } }, + { "∭", { 8749, 0 } }, + { "⧜", { 10716, 0 } }, + { "℩", { 8489, 0 } }, + { "ij", { 307, 0 } }, + { "ī", { 299, 0 } }, + { "ℑ", { 8465, 0 } }, + { "ℐ", { 8464, 0 } }, + { "ℑ", { 8465, 0 } }, + { "ı", { 305, 0 } }, + { "⊷", { 8887, 0 } }, + { "Ƶ", { 437, 0 } }, + { "∈", { 8712, 0 } }, + { "℅", { 8453, 0 } }, + { "∞", { 8734, 0 } }, + { "⧝", { 10717, 0 } }, + { "ı", { 305, 0 } }, + { "∫", { 8747, 0 } }, + { "⊺", { 8890, 0 } }, + { "ℤ", { 8484, 0 } }, + { "⊺", { 8890, 0 
} }, + { "⨗", { 10775, 0 } }, + { "⨼", { 10812, 0 } }, + { "ё", { 1105, 0 } }, + { "į", { 303, 0 } }, + { "𝕚", { 120154, 0 } }, + { "ι", { 953, 0 } }, + { "⨼", { 10812, 0 } }, + { "¿", { 191, 0 } }, + { "𝒾", { 119998, 0 } }, + { "∈", { 8712, 0 } }, + { "⋹", { 8953, 0 } }, + { "⋵", { 8949, 0 } }, + { "⋴", { 8948, 0 } }, + { "⋳", { 8947, 0 } }, + { "∈", { 8712, 0 } }, + { "⁢", { 8290, 0 } }, + { "ĩ", { 297, 0 } }, + { "і", { 1110, 0 } }, + { "ï", { 239, 0 } }, + { "ĵ", { 309, 0 } }, + { "й", { 1081, 0 } }, + { "𝔧", { 120103, 0 } }, + { "ȷ", { 567, 0 } }, + { "𝕛", { 120155, 0 } }, + { "𝒿", { 119999, 0 } }, + { "ј", { 1112, 0 } }, + { "є", { 1108, 0 } }, + { "κ", { 954, 0 } }, + { "ϰ", { 1008, 0 } }, + { "ķ", { 311, 0 } }, + { "к", { 1082, 0 } }, + { "𝔨", { 120104, 0 } }, + { "ĸ", { 312, 0 } }, + { "х", { 1093, 0 } }, + { "ќ", { 1116, 0 } }, + { "𝕜", { 120156, 0 } }, + { "𝓀", { 120000, 0 } }, + { "⇚", { 8666, 0 } }, + { "⇐", { 8656, 0 } }, + { "⤛", { 10523, 0 } }, + { "⤎", { 10510, 0 } }, + { "≦", { 8806, 0 } }, + { "⪋", { 10891, 0 } }, + { "⥢", { 10594, 0 } }, + { "ĺ", { 314, 0 } }, + { "⦴", { 10676, 0 } }, + { "ℒ", { 8466, 0 } }, + { "λ", { 955, 0 } }, + { "⟨", { 10216, 0 } }, + { "⦑", { 10641, 0 } }, + { "⟨", { 10216, 0 } }, + { "⪅", { 10885, 0 } }, + { "«", { 171, 0 } }, + { "←", { 8592, 0 } }, + { "⇤", { 8676, 0 } }, + { "⤟", { 10527, 0 } }, + { "⤝", { 10525, 0 } }, + { "↩", { 8617, 0 } }, + { "↫", { 8619, 0 } }, + { "⤹", { 10553, 0 } }, + { "⥳", { 10611, 0 } }, + { "↢", { 8610, 0 } }, + { "⪫", { 10923, 0 } }, + { "⤙", { 10521, 0 } }, + { "⪭", { 10925, 0 } }, + { "⪭︀", { 10925, 65024 } }, + { "⤌", { 10508, 0 } }, + { "❲", { 10098, 0 } }, + { "{", { 123, 0 } }, + { "[", { 91, 0 } }, + { "⦋", { 10635, 0 } }, + { "⦏", { 10639, 0 } }, + { "⦍", { 10637, 0 } }, + { "ľ", { 318, 0 } }, + { "ļ", { 316, 0 } }, + { "⌈", { 8968, 0 } }, + { "{", { 123, 0 } }, + { "л", { 1083, 0 } }, + { "⤶", { 10550, 0 } }, + { "“", { 8220, 0 } }, + { "„", { 8222, 0 } }, + { "⥧", { 10599, 0 } 
}, + { "⥋", { 10571, 0 } }, + { "↲", { 8626, 0 } }, + { "≤", { 8804, 0 } }, + { "←", { 8592, 0 } }, + { "↢", { 8610, 0 } }, + { "↽", { 8637, 0 } }, + { "↼", { 8636, 0 } }, + { "⇇", { 8647, 0 } }, + { "↔", { 8596, 0 } }, + { "⇆", { 8646, 0 } }, + { "⇋", { 8651, 0 } }, + { "↭", { 8621, 0 } }, + { "⋋", { 8907, 0 } }, + { "⋚", { 8922, 0 } }, + { "≤", { 8804, 0 } }, + { "≦", { 8806, 0 } }, + { "⩽", { 10877, 0 } }, + { "⩽", { 10877, 0 } }, + { "⪨", { 10920, 0 } }, + { "⩿", { 10879, 0 } }, + { "⪁", { 10881, 0 } }, + { "⪃", { 10883, 0 } }, + { "⋚︀", { 8922, 65024 } }, + { "⪓", { 10899, 0 } }, + { "⪅", { 10885, 0 } }, + { "⋖", { 8918, 0 } }, + { "⋚", { 8922, 0 } }, + { "⪋", { 10891, 0 } }, + { "≶", { 8822, 0 } }, + { "≲", { 8818, 0 } }, + { "⥼", { 10620, 0 } }, + { "⌊", { 8970, 0 } }, + { "𝔩", { 120105, 0 } }, + { "≶", { 8822, 0 } }, + { "⪑", { 10897, 0 } }, + { "↽", { 8637, 0 } }, + { "↼", { 8636, 0 } }, + { "⥪", { 10602, 0 } }, + { "▄", { 9604, 0 } }, + { "љ", { 1113, 0 } }, + { "≪", { 8810, 0 } }, + { "⇇", { 8647, 0 } }, + { "⌞", { 8990, 0 } }, + { "⥫", { 10603, 0 } }, + { "◺", { 9722, 0 } }, + { "ŀ", { 320, 0 } }, + { "⎰", { 9136, 0 } }, + { "⎰", { 9136, 0 } }, + { "≨", { 8808, 0 } }, + { "⪉", { 10889, 0 } }, + { "⪉", { 10889, 0 } }, + { "⪇", { 10887, 0 } }, + { "⪇", { 10887, 0 } }, + { "≨", { 8808, 0 } }, + { "⋦", { 8934, 0 } }, + { "⟬", { 10220, 0 } }, + { "⇽", { 8701, 0 } }, + { "⟦", { 10214, 0 } }, + { "⟵", { 10229, 0 } }, + { "⟷", { 10231, 0 } }, + { "⟼", { 10236, 0 } }, + { "⟶", { 10230, 0 } }, + { "↫", { 8619, 0 } }, + { "↬", { 8620, 0 } }, + { "⦅", { 10629, 0 } }, + { "𝕝", { 120157, 0 } }, + { "⨭", { 10797, 0 } }, + { "⨴", { 10804, 0 } }, + { "∗", { 8727, 0 } }, + { "_", { 95, 0 } }, + { "◊", { 9674, 0 } }, + { "◊", { 9674, 0 } }, + { "⧫", { 10731, 0 } }, + { "(", { 40, 0 } }, + { "⦓", { 10643, 0 } }, + { "⇆", { 8646, 0 } }, + { "⌟", { 8991, 0 } }, + { "⇋", { 8651, 0 } }, + { "⥭", { 10605, 0 } }, + { "‎", { 8206, 0 } }, + { "⊿", { 8895, 0 } }, + { "‹", { 8249, 0 
} }, + { "𝓁", { 120001, 0 } }, + { "↰", { 8624, 0 } }, + { "≲", { 8818, 0 } }, + { "⪍", { 10893, 0 } }, + { "⪏", { 10895, 0 } }, + { "[", { 91, 0 } }, + { "‘", { 8216, 0 } }, + { "‚", { 8218, 0 } }, + { "ł", { 322, 0 } }, + { "<", { 60, 0 } }, + { "⪦", { 10918, 0 } }, + { "⩹", { 10873, 0 } }, + { "⋖", { 8918, 0 } }, + { "⋋", { 8907, 0 } }, + { "⋉", { 8905, 0 } }, + { "⥶", { 10614, 0 } }, + { "⩻", { 10875, 0 } }, + { "⦖", { 10646, 0 } }, + { "◃", { 9667, 0 } }, + { "⊴", { 8884, 0 } }, + { "◂", { 9666, 0 } }, + { "⥊", { 10570, 0 } }, + { "⥦", { 10598, 0 } }, + { "≨︀", { 8808, 65024 } }, + { "≨︀", { 8808, 65024 } }, + { "∺", { 8762, 0 } }, + { "¯", { 175, 0 } }, + { "♂", { 9794, 0 } }, + { "✠", { 10016, 0 } }, + { "✠", { 10016, 0 } }, + { "↦", { 8614, 0 } }, + { "↦", { 8614, 0 } }, + { "↧", { 8615, 0 } }, + { "↤", { 8612, 0 } }, + { "↥", { 8613, 0 } }, + { "▮", { 9646, 0 } }, + { "⨩", { 10793, 0 } }, + { "м", { 1084, 0 } }, + { "—", { 8212, 0 } }, + { "∡", { 8737, 0 } }, + { "𝔪", { 120106, 0 } }, + { "℧", { 8487, 0 } }, + { "µ", { 181, 0 } }, + { "∣", { 8739, 0 } }, + { "*", { 42, 0 } }, + { "⫰", { 10992, 0 } }, + { "·", { 183, 0 } }, + { "−", { 8722, 0 } }, + { "⊟", { 8863, 0 } }, + { "∸", { 8760, 0 } }, + { "⨪", { 10794, 0 } }, + { "⫛", { 10971, 0 } }, + { "…", { 8230, 0 } }, + { "∓", { 8723, 0 } }, + { "⊧", { 8871, 0 } }, + { "𝕞", { 120158, 0 } }, + { "∓", { 8723, 0 } }, + { "𝓂", { 120002, 0 } }, + { "∾", { 8766, 0 } }, + { "μ", { 956, 0 } }, + { "⊸", { 8888, 0 } }, + { "⊸", { 8888, 0 } }, + { "⋙̸", { 8921, 824 } }, + { "≫⃒", { 8811, 8402 } }, + { "≫̸", { 8811, 824 } }, + { "⇍", { 8653, 0 } }, + { "⇎", { 8654, 0 } }, + { "⋘̸", { 8920, 824 } }, + { "≪⃒", { 8810, 8402 } }, + { "≪̸", { 8810, 824 } }, + { "⇏", { 8655, 0 } }, + { "⊯", { 8879, 0 } }, + { "⊮", { 8878, 0 } }, + { "∇", { 8711, 0 } }, + { "ń", { 324, 0 } }, + { "∠⃒", { 8736, 8402 } }, + { "≉", { 8777, 0 } }, + { "⩰̸", { 10864, 824 } }, + { "≋̸", { 8779, 824 } }, + { "ʼn", { 329, 0 } }, + { "≉", { 8777, 0 } 
}, + { "♮", { 9838, 0 } }, + { "♮", { 9838, 0 } }, + { "ℕ", { 8469, 0 } }, + { " ", { 160, 0 } }, + { "≎̸", { 8782, 824 } }, + { "≏̸", { 8783, 824 } }, + { "⩃", { 10819, 0 } }, + { "ň", { 328, 0 } }, + { "ņ", { 326, 0 } }, + { "≇", { 8775, 0 } }, + { "⩭̸", { 10861, 824 } }, + { "⩂", { 10818, 0 } }, + { "н", { 1085, 0 } }, + { "–", { 8211, 0 } }, + { "≠", { 8800, 0 } }, + { "⇗", { 8663, 0 } }, + { "⤤", { 10532, 0 } }, + { "↗", { 8599, 0 } }, + { "↗", { 8599, 0 } }, + { "≐̸", { 8784, 824 } }, + { "≢", { 8802, 0 } }, + { "⤨", { 10536, 0 } }, + { "≂̸", { 8770, 824 } }, + { "∄", { 8708, 0 } }, + { "∄", { 8708, 0 } }, + { "𝔫", { 120107, 0 } }, + { "≧̸", { 8807, 824 } }, + { "≱", { 8817, 0 } }, + { "≱", { 8817, 0 } }, + { "≧̸", { 8807, 824 } }, + { "⩾̸", { 10878, 824 } }, + { "⩾̸", { 10878, 824 } }, + { "≵", { 8821, 0 } }, + { "≯", { 8815, 0 } }, + { "≯", { 8815, 0 } }, + { "⇎", { 8654, 0 } }, + { "↮", { 8622, 0 } }, + { "⫲", { 10994, 0 } }, + { "∋", { 8715, 0 } }, + { "⋼", { 8956, 0 } }, + { "⋺", { 8954, 0 } }, + { "∋", { 8715, 0 } }, + { "њ", { 1114, 0 } }, + { "⇍", { 8653, 0 } }, + { "≦̸", { 8806, 824 } }, + { "↚", { 8602, 0 } }, + { "‥", { 8229, 0 } }, + { "≰", { 8816, 0 } }, + { "↚", { 8602, 0 } }, + { "↮", { 8622, 0 } }, + { "≰", { 8816, 0 } }, + { "≦̸", { 8806, 824 } }, + { "⩽̸", { 10877, 824 } }, + { "⩽̸", { 10877, 824 } }, + { "≮", { 8814, 0 } }, + { "≴", { 8820, 0 } }, + { "≮", { 8814, 0 } }, + { "⋪", { 8938, 0 } }, + { "⋬", { 8940, 0 } }, + { "∤", { 8740, 0 } }, + { "𝕟", { 120159, 0 } }, + { "¬", { 172, 0 } }, + { "∉", { 8713, 0 } }, + { "⋹̸", { 8953, 824 } }, + { "⋵̸", { 8949, 824 } }, + { "∉", { 8713, 0 } }, + { "⋷", { 8951, 0 } }, + { "⋶", { 8950, 0 } }, + { "∌", { 8716, 0 } }, + { "∌", { 8716, 0 } }, + { "⋾", { 8958, 0 } }, + { "⋽", { 8957, 0 } }, + { "∦", { 8742, 0 } }, + { "∦", { 8742, 0 } }, + { "⫽⃥", { 11005, 8421 } }, + { "∂̸", { 8706, 824 } }, + { "⨔", { 10772, 0 } }, + { "⊀", { 8832, 0 } }, + { "⋠", { 8928, 0 } }, + { "⪯̸", { 10927, 824 } }, + { "⊀", 
{ 8832, 0 } }, + { "⪯̸", { 10927, 824 } }, + { "⇏", { 8655, 0 } }, + { "↛", { 8603, 0 } }, + { "⤳̸", { 10547, 824 } }, + { "↝̸", { 8605, 824 } }, + { "↛", { 8603, 0 } }, + { "⋫", { 8939, 0 } }, + { "⋭", { 8941, 0 } }, + { "⊁", { 8833, 0 } }, + { "⋡", { 8929, 0 } }, + { "⪰̸", { 10928, 824 } }, + { "𝓃", { 120003, 0 } }, + { "∤", { 8740, 0 } }, + { "∦", { 8742, 0 } }, + { "≁", { 8769, 0 } }, + { "≄", { 8772, 0 } }, + { "≄", { 8772, 0 } }, + { "∤", { 8740, 0 } }, + { "∦", { 8742, 0 } }, + { "⋢", { 8930, 0 } }, + { "⋣", { 8931, 0 } }, + { "⊄", { 8836, 0 } }, + { "⫅̸", { 10949, 824 } }, + { "⊈", { 8840, 0 } }, + { "⊂⃒", { 8834, 8402 } }, + { "⊈", { 8840, 0 } }, + { "⫅̸", { 10949, 824 } }, + { "⊁", { 8833, 0 } }, + { "⪰̸", { 10928, 824 } }, + { "⊅", { 8837, 0 } }, + { "⫆̸", { 10950, 824 } }, + { "⊉", { 8841, 0 } }, + { "⊃⃒", { 8835, 8402 } }, + { "⊉", { 8841, 0 } }, + { "⫆̸", { 10950, 824 } }, + { "≹", { 8825, 0 } }, + { "ñ", { 241, 0 } }, + { "≸", { 8824, 0 } }, + { "⋪", { 8938, 0 } }, + { "⋬", { 8940, 0 } }, + { "⋫", { 8939, 0 } }, + { "⋭", { 8941, 0 } }, + { "ν", { 957, 0 } }, + { "#", { 35, 0 } }, + { "№", { 8470, 0 } }, + { " ", { 8199, 0 } }, + { "⊭", { 8877, 0 } }, + { "⤄", { 10500, 0 } }, + { "≍⃒", { 8781, 8402 } }, + { "⊬", { 8876, 0 } }, + { "≥⃒", { 8805, 8402 } }, + { ">⃒", { 62, 8402 } }, + { "⧞", { 10718, 0 } }, + { "⤂", { 10498, 0 } }, + { "≤⃒", { 8804, 8402 } }, + { "<⃒", { 60, 8402 } }, + { "⊴⃒", { 8884, 8402 } }, + { "⤃", { 10499, 0 } }, + { "⊵⃒", { 8885, 8402 } }, + { "∼⃒", { 8764, 8402 } }, + { "⇖", { 8662, 0 } }, + { "⤣", { 10531, 0 } }, + { "↖", { 8598, 0 } }, + { "↖", { 8598, 0 } }, + { "⤧", { 10535, 0 } }, + { "Ⓢ", { 9416, 0 } }, + { "ó", { 243, 0 } }, + { "⊛", { 8859, 0 } }, + { "⊚", { 8858, 0 } }, + { "ô", { 244, 0 } }, + { "о", { 1086, 0 } }, + { "⊝", { 8861, 0 } }, + { "ő", { 337, 0 } }, + { "⨸", { 10808, 0 } }, + { "⊙", { 8857, 0 } }, + { "⦼", { 10684, 0 } }, + { "œ", { 339, 0 } }, + { "⦿", { 10687, 0 } }, + { "𝔬", { 120108, 0 } }, + { "˛", { 
731, 0 } }, + { "ò", { 242, 0 } }, + { "⧁", { 10689, 0 } }, + { "⦵", { 10677, 0 } }, + { "Ω", { 937, 0 } }, + { "∮", { 8750, 0 } }, + { "↺", { 8634, 0 } }, + { "⦾", { 10686, 0 } }, + { "⦻", { 10683, 0 } }, + { "‾", { 8254, 0 } }, + { "⧀", { 10688, 0 } }, + { "ō", { 333, 0 } }, + { "ω", { 969, 0 } }, + { "ο", { 959, 0 } }, + { "⦶", { 10678, 0 } }, + { "⊖", { 8854, 0 } }, + { "𝕠", { 120160, 0 } }, + { "⦷", { 10679, 0 } }, + { "⦹", { 10681, 0 } }, + { "⊕", { 8853, 0 } }, + { "∨", { 8744, 0 } }, + { "↻", { 8635, 0 } }, + { "⩝", { 10845, 0 } }, + { "ℴ", { 8500, 0 } }, + { "ℴ", { 8500, 0 } }, + { "ª", { 170, 0 } }, + { "º", { 186, 0 } }, + { "⊶", { 8886, 0 } }, + { "⩖", { 10838, 0 } }, + { "⩗", { 10839, 0 } }, + { "⩛", { 10843, 0 } }, + { "ℴ", { 8500, 0 } }, + { "ø", { 248, 0 } }, + { "⊘", { 8856, 0 } }, + { "õ", { 245, 0 } }, + { "⊗", { 8855, 0 } }, + { "⨶", { 10806, 0 } }, + { "ö", { 246, 0 } }, + { "⌽", { 9021, 0 } }, + { "∥", { 8741, 0 } }, + { "¶", { 182, 0 } }, + { "∥", { 8741, 0 } }, + { "⫳", { 10995, 0 } }, + { "⫽", { 11005, 0 } }, + { "∂", { 8706, 0 } }, + { "п", { 1087, 0 } }, + { "%", { 37, 0 } }, + { ".", { 46, 0 } }, + { "‰", { 8240, 0 } }, + { "⊥", { 8869, 0 } }, + { "‱", { 8241, 0 } }, + { "𝔭", { 120109, 0 } }, + { "φ", { 966, 0 } }, + { "ϕ", { 981, 0 } }, + { "ℳ", { 8499, 0 } }, + { "☎", { 9742, 0 } }, + { "π", { 960, 0 } }, + { "⋔", { 8916, 0 } }, + { "ϖ", { 982, 0 } }, + { "ℏ", { 8463, 0 } }, + { "ℎ", { 8462, 0 } }, + { "ℏ", { 8463, 0 } }, + { "+", { 43, 0 } }, + { "⨣", { 10787, 0 } }, + { "⊞", { 8862, 0 } }, + { "⨢", { 10786, 0 } }, + { "∔", { 8724, 0 } }, + { "⨥", { 10789, 0 } }, + { "⩲", { 10866, 0 } }, + { "±", { 177, 0 } }, + { "⨦", { 10790, 0 } }, + { "⨧", { 10791, 0 } }, + { "±", { 177, 0 } }, + { "⨕", { 10773, 0 } }, + { "𝕡", { 120161, 0 } }, + { "£", { 163, 0 } }, + { "≺", { 8826, 0 } }, + { "⪳", { 10931, 0 } }, + { "⪷", { 10935, 0 } }, + { "≼", { 8828, 0 } }, + { "⪯", { 10927, 0 } }, + { "≺", { 8826, 0 } }, + { "⪷", { 10935, 0 } }, + { "≼", { 
8828, 0 } }, + { "⪯", { 10927, 0 } }, + { "⪹", { 10937, 0 } }, + { "⪵", { 10933, 0 } }, + { "⋨", { 8936, 0 } }, + { "≾", { 8830, 0 } }, + { "′", { 8242, 0 } }, + { "ℙ", { 8473, 0 } }, + { "⪵", { 10933, 0 } }, + { "⪹", { 10937, 0 } }, + { "⋨", { 8936, 0 } }, + { "∏", { 8719, 0 } }, + { "⌮", { 9006, 0 } }, + { "⌒", { 8978, 0 } }, + { "⌓", { 8979, 0 } }, + { "∝", { 8733, 0 } }, + { "∝", { 8733, 0 } }, + { "≾", { 8830, 0 } }, + { "⊰", { 8880, 0 } }, + { "𝓅", { 120005, 0 } }, + { "ψ", { 968, 0 } }, + { " ", { 8200, 0 } }, + { "𝔮", { 120110, 0 } }, + { "⨌", { 10764, 0 } }, + { "𝕢", { 120162, 0 } }, + { "⁗", { 8279, 0 } }, + { "𝓆", { 120006, 0 } }, + { "ℍ", { 8461, 0 } }, + { "⨖", { 10774, 0 } }, + { "?", { 63, 0 } }, + { "≟", { 8799, 0 } }, + { """, { 34, 0 } }, + { "⇛", { 8667, 0 } }, + { "⇒", { 8658, 0 } }, + { "⤜", { 10524, 0 } }, + { "⤏", { 10511, 0 } }, + { "⥤", { 10596, 0 } }, + { "∽̱", { 8765, 817 } }, + { "ŕ", { 341, 0 } }, + { "√", { 8730, 0 } }, + { "⦳", { 10675, 0 } }, + { "⟩", { 10217, 0 } }, + { "⦒", { 10642, 0 } }, + { "⦥", { 10661, 0 } }, + { "⟩", { 10217, 0 } }, + { "»", { 187, 0 } }, + { "→", { 8594, 0 } }, + { "⥵", { 10613, 0 } }, + { "⇥", { 8677, 0 } }, + { "⤠", { 10528, 0 } }, + { "⤳", { 10547, 0 } }, + { "⤞", { 10526, 0 } }, + { "↪", { 8618, 0 } }, + { "↬", { 8620, 0 } }, + { "⥅", { 10565, 0 } }, + { "⥴", { 10612, 0 } }, + { "↣", { 8611, 0 } }, + { "↝", { 8605, 0 } }, + { "⤚", { 10522, 0 } }, + { "∶", { 8758, 0 } }, + { "ℚ", { 8474, 0 } }, + { "⤍", { 10509, 0 } }, + { "❳", { 10099, 0 } }, + { "}", { 125, 0 } }, + { "]", { 93, 0 } }, + { "⦌", { 10636, 0 } }, + { "⦎", { 10638, 0 } }, + { "⦐", { 10640, 0 } }, + { "ř", { 345, 0 } }, + { "ŗ", { 343, 0 } }, + { "⌉", { 8969, 0 } }, + { "}", { 125, 0 } }, + { "р", { 1088, 0 } }, + { "⤷", { 10551, 0 } }, + { "⥩", { 10601, 0 } }, + { "”", { 8221, 0 } }, + { "”", { 8221, 0 } }, + { "↳", { 8627, 0 } }, + { "ℜ", { 8476, 0 } }, + { "ℛ", { 8475, 0 } }, + { "ℜ", { 8476, 0 } }, + { "ℝ", { 8477, 0 } }, + { "▭", { 
9645, 0 } }, + { "®", { 174, 0 } }, + { "⥽", { 10621, 0 } }, + { "⌋", { 8971, 0 } }, + { "𝔯", { 120111, 0 } }, + { "⇁", { 8641, 0 } }, + { "⇀", { 8640, 0 } }, + { "⥬", { 10604, 0 } }, + { "ρ", { 961, 0 } }, + { "ϱ", { 1009, 0 } }, + { "→", { 8594, 0 } }, + { "↣", { 8611, 0 } }, + { "⇁", { 8641, 0 } }, + { "⇀", { 8640, 0 } }, + { "⇄", { 8644, 0 } }, + { "⇌", { 8652, 0 } }, + { "⇉", { 8649, 0 } }, + { "↝", { 8605, 0 } }, + { "⋌", { 8908, 0 } }, + { "˚", { 730, 0 } }, + { "≓", { 8787, 0 } }, + { "⇄", { 8644, 0 } }, + { "⇌", { 8652, 0 } }, + { "‏", { 8207, 0 } }, + { "⎱", { 9137, 0 } }, + { "⎱", { 9137, 0 } }, + { "⫮", { 10990, 0 } }, + { "⟭", { 10221, 0 } }, + { "⇾", { 8702, 0 } }, + { "⟧", { 10215, 0 } }, + { "⦆", { 10630, 0 } }, + { "𝕣", { 120163, 0 } }, + { "⨮", { 10798, 0 } }, + { "⨵", { 10805, 0 } }, + { ")", { 41, 0 } }, + { "⦔", { 10644, 0 } }, + { "⨒", { 10770, 0 } }, + { "⇉", { 8649, 0 } }, + { "›", { 8250, 0 } }, + { "𝓇", { 120007, 0 } }, + { "↱", { 8625, 0 } }, + { "]", { 93, 0 } }, + { "’", { 8217, 0 } }, + { "’", { 8217, 0 } }, + { "⋌", { 8908, 0 } }, + { "⋊", { 8906, 0 } }, + { "▹", { 9657, 0 } }, + { "⊵", { 8885, 0 } }, + { "▸", { 9656, 0 } }, + { "⧎", { 10702, 0 } }, + { "⥨", { 10600, 0 } }, + { "℞", { 8478, 0 } }, + { "ś", { 347, 0 } }, + { "‚", { 8218, 0 } }, + { "≻", { 8827, 0 } }, + { "⪴", { 10932, 0 } }, + { "⪸", { 10936, 0 } }, + { "š", { 353, 0 } }, + { "≽", { 8829, 0 } }, + { "⪰", { 10928, 0 } }, + { "ş", { 351, 0 } }, + { "ŝ", { 349, 0 } }, + { "⪶", { 10934, 0 } }, + { "⪺", { 10938, 0 } }, + { "⋩", { 8937, 0 } }, + { "⨓", { 10771, 0 } }, + { "≿", { 8831, 0 } }, + { "с", { 1089, 0 } }, + { "⋅", { 8901, 0 } }, + { "⊡", { 8865, 0 } }, + { "⩦", { 10854, 0 } }, + { "⇘", { 8664, 0 } }, + { "⤥", { 10533, 0 } }, + { "↘", { 8600, 0 } }, + { "↘", { 8600, 0 } }, + { "§", { 167, 0 } }, + { ";", { 59, 0 } }, + { "⤩", { 10537, 0 } }, + { "∖", { 8726, 0 } }, + { "∖", { 8726, 0 } }, + { "✶", { 10038, 0 } }, + { "𝔰", { 120112, 0 } }, + { "⌢", { 8994, 0 } }, + 
{ "♯", { 9839, 0 } }, + { "щ", { 1097, 0 } }, + { "ш", { 1096, 0 } }, + { "∣", { 8739, 0 } }, + { "∥", { 8741, 0 } }, + { "­", { 173, 0 } }, + { "σ", { 963, 0 } }, + { "ς", { 962, 0 } }, + { "ς", { 962, 0 } }, + { "∼", { 8764, 0 } }, + { "⩪", { 10858, 0 } }, + { "≃", { 8771, 0 } }, + { "≃", { 8771, 0 } }, + { "⪞", { 10910, 0 } }, + { "⪠", { 10912, 0 } }, + { "⪝", { 10909, 0 } }, + { "⪟", { 10911, 0 } }, + { "≆", { 8774, 0 } }, + { "⨤", { 10788, 0 } }, + { "⥲", { 10610, 0 } }, + { "←", { 8592, 0 } }, + { "∖", { 8726, 0 } }, + { "⨳", { 10803, 0 } }, + { "⧤", { 10724, 0 } }, + { "∣", { 8739, 0 } }, + { "⌣", { 8995, 0 } }, + { "⪪", { 10922, 0 } }, + { "⪬", { 10924, 0 } }, + { "⪬︀", { 10924, 65024 } }, + { "ь", { 1100, 0 } }, + { "/", { 47, 0 } }, + { "⧄", { 10692, 0 } }, + { "⌿", { 9023, 0 } }, + { "𝕤", { 120164, 0 } }, + { "♠", { 9824, 0 } }, + { "♠", { 9824, 0 } }, + { "∥", { 8741, 0 } }, + { "⊓", { 8851, 0 } }, + { "⊓︀", { 8851, 65024 } }, + { "⊔", { 8852, 0 } }, + { "⊔︀", { 8852, 65024 } }, + { "⊏", { 8847, 0 } }, + { "⊑", { 8849, 0 } }, + { "⊏", { 8847, 0 } }, + { "⊑", { 8849, 0 } }, + { "⊐", { 8848, 0 } }, + { "⊒", { 8850, 0 } }, + { "⊐", { 8848, 0 } }, + { "⊒", { 8850, 0 } }, + { "□", { 9633, 0 } }, + { "□", { 9633, 0 } }, + { "▪", { 9642, 0 } }, + { "▪", { 9642, 0 } }, + { "→", { 8594, 0 } }, + { "𝓈", { 120008, 0 } }, + { "∖", { 8726, 0 } }, + { "⌣", { 8995, 0 } }, + { "⋆", { 8902, 0 } }, + { "☆", { 9734, 0 } }, + { "★", { 9733, 0 } }, + { "ϵ", { 1013, 0 } }, + { "ϕ", { 981, 0 } }, + { "¯", { 175, 0 } }, + { "⊂", { 8834, 0 } }, + { "⫅", { 10949, 0 } }, + { "⪽", { 10941, 0 } }, + { "⊆", { 8838, 0 } }, + { "⫃", { 10947, 0 } }, + { "⫁", { 10945, 0 } }, + { "⫋", { 10955, 0 } }, + { "⊊", { 8842, 0 } }, + { "⪿", { 10943, 0 } }, + { "⥹", { 10617, 0 } }, + { "⊂", { 8834, 0 } }, + { "⊆", { 8838, 0 } }, + { "⫅", { 10949, 0 } }, + { "⊊", { 8842, 0 } }, + { "⫋", { 10955, 0 } }, + { "⫇", { 10951, 0 } }, + { "⫕", { 10965, 0 } }, + { "⫓", { 10963, 0 } }, + { "≻", { 8827, 0 } 
}, + { "⪸", { 10936, 0 } }, + { "≽", { 8829, 0 } }, + { "⪰", { 10928, 0 } }, + { "⪺", { 10938, 0 } }, + { "⪶", { 10934, 0 } }, + { "⋩", { 8937, 0 } }, + { "≿", { 8831, 0 } }, + { "∑", { 8721, 0 } }, + { "♪", { 9834, 0 } }, + { "¹", { 185, 0 } }, + { "¹", { 185, 0 } }, + { "²", { 178, 0 } }, + { "²", { 178, 0 } }, + { "³", { 179, 0 } }, + { "³", { 179, 0 } }, + { "⊃", { 8835, 0 } }, + { "⫆", { 10950, 0 } }, + { "⪾", { 10942, 0 } }, + { "⫘", { 10968, 0 } }, + { "⊇", { 8839, 0 } }, + { "⫄", { 10948, 0 } }, + { "⟉", { 10185, 0 } }, + { "⫗", { 10967, 0 } }, + { "⥻", { 10619, 0 } }, + { "⫂", { 10946, 0 } }, + { "⫌", { 10956, 0 } }, + { "⊋", { 8843, 0 } }, + { "⫀", { 10944, 0 } }, + { "⊃", { 8835, 0 } }, + { "⊇", { 8839, 0 } }, + { "⫆", { 10950, 0 } }, + { "⊋", { 8843, 0 } }, + { "⫌", { 10956, 0 } }, + { "⫈", { 10952, 0 } }, + { "⫔", { 10964, 0 } }, + { "⫖", { 10966, 0 } }, + { "⇙", { 8665, 0 } }, + { "⤦", { 10534, 0 } }, + { "↙", { 8601, 0 } }, + { "↙", { 8601, 0 } }, + { "⤪", { 10538, 0 } }, + { "ß", { 223, 0 } }, + { "⌖", { 8982, 0 } }, + { "τ", { 964, 0 } }, + { "⎴", { 9140, 0 } }, + { "ť", { 357, 0 } }, + { "ţ", { 355, 0 } }, + { "т", { 1090, 0 } }, + { "⃛", { 8411, 0 } }, + { "⌕", { 8981, 0 } }, + { "𝔱", { 120113, 0 } }, + { "∴", { 8756, 0 } }, + { "∴", { 8756, 0 } }, + { "θ", { 952, 0 } }, + { "ϑ", { 977, 0 } }, + { "ϑ", { 977, 0 } }, + { "≈", { 8776, 0 } }, + { "∼", { 8764, 0 } }, + { " ", { 8201, 0 } }, + { "≈", { 8776, 0 } }, + { "∼", { 8764, 0 } }, + { "þ", { 254, 0 } }, + { "˜", { 732, 0 } }, + { "×", { 215, 0 } }, + { "⊠", { 8864, 0 } }, + { "⨱", { 10801, 0 } }, + { "⨰", { 10800, 0 } }, + { "∭", { 8749, 0 } }, + { "⤨", { 10536, 0 } }, + { "⊤", { 8868, 0 } }, + { "⌶", { 9014, 0 } }, + { "⫱", { 10993, 0 } }, + { "𝕥", { 120165, 0 } }, + { "⫚", { 10970, 0 } }, + { "⤩", { 10537, 0 } }, + { "‴", { 8244, 0 } }, + { "™", { 8482, 0 } }, + { "▵", { 9653, 0 } }, + { "▿", { 9663, 0 } }, + { "◃", { 9667, 0 } }, + { "⊴", { 8884, 0 } }, + { "≜", { 8796, 0 } }, + { "▹", { 
9657, 0 } }, + { "⊵", { 8885, 0 } }, + { "◬", { 9708, 0 } }, + { "≜", { 8796, 0 } }, + { "⨺", { 10810, 0 } }, + { "⨹", { 10809, 0 } }, + { "⧍", { 10701, 0 } }, + { "⨻", { 10811, 0 } }, + { "⏢", { 9186, 0 } }, + { "𝓉", { 120009, 0 } }, + { "ц", { 1094, 0 } }, + { "ћ", { 1115, 0 } }, + { "ŧ", { 359, 0 } }, + { "≬", { 8812, 0 } }, + { "↞", { 8606, 0 } }, + { "↠", { 8608, 0 } }, + { "⇑", { 8657, 0 } }, + { "⥣", { 10595, 0 } }, + { "ú", { 250, 0 } }, + { "↑", { 8593, 0 } }, + { "ў", { 1118, 0 } }, + { "ŭ", { 365, 0 } }, + { "û", { 251, 0 } }, + { "у", { 1091, 0 } }, + { "⇅", { 8645, 0 } }, + { "ű", { 369, 0 } }, + { "⥮", { 10606, 0 } }, + { "⥾", { 10622, 0 } }, + { "𝔲", { 120114, 0 } }, + { "ù", { 249, 0 } }, + { "↿", { 8639, 0 } }, + { "↾", { 8638, 0 } }, + { "▀", { 9600, 0 } }, + { "⌜", { 8988, 0 } }, + { "⌜", { 8988, 0 } }, + { "⌏", { 8975, 0 } }, + { "◸", { 9720, 0 } }, + { "ū", { 363, 0 } }, + { "¨", { 168, 0 } }, + { "ų", { 371, 0 } }, + { "𝕦", { 120166, 0 } }, + { "↑", { 8593, 0 } }, + { "↕", { 8597, 0 } }, + { "↿", { 8639, 0 } }, + { "↾", { 8638, 0 } }, + { "⊎", { 8846, 0 } }, + { "υ", { 965, 0 } }, + { "ϒ", { 978, 0 } }, + { "υ", { 965, 0 } }, + { "⇈", { 8648, 0 } }, + { "⌝", { 8989, 0 } }, + { "⌝", { 8989, 0 } }, + { "⌎", { 8974, 0 } }, + { "ů", { 367, 0 } }, + { "◹", { 9721, 0 } }, + { "𝓊", { 120010, 0 } }, + { "⋰", { 8944, 0 } }, + { "ũ", { 361, 0 } }, + { "▵", { 9653, 0 } }, + { "▴", { 9652, 0 } }, + { "⇈", { 8648, 0 } }, + { "ü", { 252, 0 } }, + { "⦧", { 10663, 0 } }, + { "⇕", { 8661, 0 } }, + { "⫨", { 10984, 0 } }, + { "⫩", { 10985, 0 } }, + { "⊨", { 8872, 0 } }, + { "⦜", { 10652, 0 } }, + { "ϵ", { 1013, 0 } }, + { "ϰ", { 1008, 0 } }, + { "∅", { 8709, 0 } }, + { "ϕ", { 981, 0 } }, + { "ϖ", { 982, 0 } }, + { "∝", { 8733, 0 } }, + { "↕", { 8597, 0 } }, + { "ϱ", { 1009, 0 } }, + { "ς", { 962, 0 } }, + { "⊊︀", { 8842, 65024 } }, + { "⫋︀", { 10955, 65024 } }, + { "⊋︀", { 8843, 65024 } }, + { "⫌︀", { 10956, 65024 } }, + { "ϑ", { 977, 0 } }, + { "⊲", { 8882, 0 } 
}, + { "⊳", { 8883, 0 } }, + { "в", { 1074, 0 } }, + { "⊢", { 8866, 0 } }, + { "∨", { 8744, 0 } }, + { "⊻", { 8891, 0 } }, + { "≚", { 8794, 0 } }, + { "⋮", { 8942, 0 } }, + { "|", { 124, 0 } }, + { "|", { 124, 0 } }, + { "𝔳", { 120115, 0 } }, + { "⊲", { 8882, 0 } }, + { "⊂⃒", { 8834, 8402 } }, + { "⊃⃒", { 8835, 8402 } }, + { "𝕧", { 120167, 0 } }, + { "∝", { 8733, 0 } }, + { "⊳", { 8883, 0 } }, + { "𝓋", { 120011, 0 } }, + { "⫋︀", { 10955, 65024 } }, + { "⊊︀", { 8842, 65024 } }, + { "⫌︀", { 10956, 65024 } }, + { "⊋︀", { 8843, 65024 } }, + { "⦚", { 10650, 0 } }, + { "ŵ", { 373, 0 } }, + { "⩟", { 10847, 0 } }, + { "∧", { 8743, 0 } }, + { "≙", { 8793, 0 } }, + { "℘", { 8472, 0 } }, + { "𝔴", { 120116, 0 } }, + { "𝕨", { 120168, 0 } }, + { "℘", { 8472, 0 } }, + { "≀", { 8768, 0 } }, + { "≀", { 8768, 0 } }, + { "𝓌", { 120012, 0 } }, + { "⋂", { 8898, 0 } }, + { "◯", { 9711, 0 } }, + { "⋃", { 8899, 0 } }, + { "▽", { 9661, 0 } }, + { "𝔵", { 120117, 0 } }, + { "⟺", { 10234, 0 } }, + { "⟷", { 10231, 0 } }, + { "ξ", { 958, 0 } }, + { "⟸", { 10232, 0 } }, + { "⟵", { 10229, 0 } }, + { "⟼", { 10236, 0 } }, + { "⋻", { 8955, 0 } }, + { "⨀", { 10752, 0 } }, + { "𝕩", { 120169, 0 } }, + { "⨁", { 10753, 0 } }, + { "⨂", { 10754, 0 } }, + { "⟹", { 10233, 0 } }, + { "⟶", { 10230, 0 } }, + { "𝓍", { 120013, 0 } }, + { "⨆", { 10758, 0 } }, + { "⨄", { 10756, 0 } }, + { "△", { 9651, 0 } }, + { "⋁", { 8897, 0 } }, + { "⋀", { 8896, 0 } }, + { "ý", { 253, 0 } }, + { "я", { 1103, 0 } }, + { "ŷ", { 375, 0 } }, + { "ы", { 1099, 0 } }, + { "¥", { 165, 0 } }, + { "𝔶", { 120118, 0 } }, + { "ї", { 1111, 0 } }, + { "𝕪", { 120170, 0 } }, + { "𝓎", { 120014, 0 } }, + { "ю", { 1102, 0 } }, + { "ÿ", { 255, 0 } }, + { "ź", { 378, 0 } }, + { "ž", { 382, 0 } }, + { "з", { 1079, 0 } }, + { "ż", { 380, 0 } }, + { "ℨ", { 8488, 0 } }, + { "ζ", { 950, 0 } }, + { "𝔷", { 120119, 0 } }, + { "ж", { 1078, 0 } }, + { "⇝", { 8669, 0 } }, + { "𝕫", { 120171, 0 } }, + { "𝓏", { 120015, 0 } }, + { "‍", { 8205, 0 } }, + { "‌", { 
8204, 0 } }
+};
+
+
+/* Search key handed to bsearch(): an entity name given as a pointer plus an
+ * explicit length, so only the first name_size bytes of name are examined. */
+struct entity_key {
+    const char* name;
+    size_t name_size;
+};
+
+/* bsearch() comparator ordering a struct entity_key (p_key) against one
+ * element of entity_table (p_entity).
+ *
+ * A bare strncmp() limited to key->name_size reports equality whenever the
+ * key is merely a prefix of the entity name, and for every entity when the
+ * key is empty. Such a key is reported as sorting before the entity instead,
+ * so only an entity name of exactly name_size bytes can compare equal. */
+static int
+entity_cmp(const void* p_key, const void* p_entity)
+{
+    const struct entity_key* key = (const struct entity_key*) p_key;
+    const struct entity* ent = (const struct entity*) p_entity;
+    int cmp = strncmp(key->name, ent->name, key->name_size);
+
+    /* Equal over name_size bytes is not enough: the lengths must agree too. */
+    if(cmp == 0 && strlen(ent->name) != key->name_size)
+        return -1;
+    return cmp;
+}
+
+/* Look up the entity whose name is exactly the name_size bytes at name.
+ *
+ * Returns a pointer to the matching element of entity_table, or NULL when
+ * there is none. bsearch() requires entity_table to be sorted consistently
+ * with entity_cmp(). */
+const struct entity*
+entity_lookup(const char* name, size_t name_size)
+{
+    struct entity_key key = { name, name_size };
+
+    return bsearch(&key,
+                   entity_table,
+                   sizeof(entity_table) / sizeof(entity_table[0]),
+                   sizeof(struct entity),
+                   entity_cmp);
+}
diff --git a/md2html/entity.h b/md2html/entity.h new file mode 100644 index 0000000..9e8e54a --- /dev/null +++ b/md2html/entity.h @@ -0,0 +1,42 @@ +/* + * MD4C: Markdown parser for C + * (http://github.com/mity/md4c) + * + * Copyright (c) 2016-2017 Martin Mitas + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
+ */
+
+#ifndef MD2HTML_ENTITY_H
+#define MD2HTML_ENTITY_H
+
+#include <stdlib.h>     /* size_t */
+
+
+/* Most entities are formed by single Unicode codepoint, few by two codepoints.
+ * Single-codepoint entities have codepoints[1] set to zero. */
+struct entity {
+    const char* name;
+    unsigned codepoints[2];
+};
+
+/* Look up an entity by the name_size bytes starting at name. Returns NULL
+ * when no entity matches. */
+const struct entity* entity_lookup(const char* name, size_t name_size);
+
+
+#endif /* MD2HTML_ENTITY_H */
diff --git a/md2html/md2html.1 b/md2html/md2html.1 new file mode 100644 index 0000000..cffaee8 --- /dev/null +++ b/md2html/md2html.1 @@ -0,0 +1,113 @@ +.TH MD2HTML 1 "June 2019" "" "General Commands Manual" +.nh +.ad l +. +.SH NAME +. +md2html \- convert Markdown to HTML +. +.SH SYNOPSIS +. +.B md2html +.RI [ OPTION ]...\& +.RI [ FILE ] +. +.SH OPTIONS +. +.SS General options: +. +.TP +.BR -o ", " --output= \fIOUTFILE\fR +Write output to \fIOUTFILE\fR instead of \fBstdout\fR(3) +. +.TP +.BR -f ", " --full-html +Generate full HTML document, including header +. +.TP +.BR -s ", " --stat +Measure time of input parsing +. +.TP +.BR -h ", " --help +Display help and exit +. +.TP +.BR -v ", " --version +Display version and exit +. +.SS Markdown dialect options: +. +.TP +.B --commonmark +CommonMark (the default) +. +.TP +.B --github +Github Flavored Markdown +. +.PP +Note: dialect options are equivalent to some combination of flags below. +. +.SS Markdown extension options: +. +.TP +.B --fcollapse-whitespace +Collapse non-trivial whitespace +. +.TP +.B --fverbatim-entities +Do not translate entities +. +.TP +.B --fpermissive-atx-headers +Allow ATX headers without delimiting space +. +.TP +.B --fpermissive-url-autolinks +Allow URL autolinks without "<" and ">" delimiters +. +.TP +.B --fpermissive-www-autolinks +Allow WWW autolinks without any scheme (e.g. "www.example.com") +. +.TP +.B --fpermissive-email-autolinks +Allow e-mail autolinks without "<", ">" and "mailto:" +. +.TP +.B --fpermissive-autolinks +Enable all 3 of the above permissive autolinks options +.
+.TP +.B --fno-indented-code +Disable indented code blocks +. +.TP +.B --fno-html-blocks +Disable raw HTML blocks +. +.TP +.B --fno-html-spans +Disable raw HTML spans +. +.TP +.B --fno-html +Same as \fB--fno-html-blocks --fno-html-spans\fR +. +.TP +.B --ftables +Enable tables +. +.TP +.B --fstrikethrough +Enable strikethrough spans +. +.TP +.B --ftasklists +Enable task lists +. +.SH SEE ALSO +. +https://github.com/mity/md4c +. diff --git a/md2html/md2html.c b/md2html/md2html.c new file mode 100644 index 0000000..a3015e6 --- /dev/null +++ b/md2html/md2html.c @@ -0,0 +1,371 @@ +/* + * MD4C: Markdown parser for C + * (http://github.com/mity/md4c) + * + * Copyright (c) 2016-2017 Martin Mitas + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include + +#include "render_html.h" +#include "cmdline.h" + + + +/* Global options. 
*/ +static unsigned parser_flags = 0; +static unsigned renderer_flags = MD_RENDER_FLAG_DEBUG; +static int want_fullhtml = 0; +static int want_stat = 0; + + +/********************************* + *** Simple grow-able buffer *** + *********************************/ + +/* We render to a memory buffer instead of directly outputting the rendered + * documents, as this allows using this utility for evaluating performance + * of MD4C (--stat option). This allows us to measure just time of the parser, + * without the I/O. + */ + +struct membuffer { + char* data; + size_t asize; + size_t size; +}; + +static void +membuf_init(struct membuffer* buf, MD_SIZE new_asize) +{ + buf->size = 0; + buf->asize = new_asize; + buf->data = malloc(buf->asize); + if(buf->data == NULL) { + fprintf(stderr, "membuf_init: malloc() failed.\n"); + exit(1); + } +} + +static void +membuf_fini(struct membuffer* buf) +{ + if(buf->data) + free(buf->data); +} + +static void +membuf_grow(struct membuffer* buf, size_t new_asize) +{ + buf->data = realloc(buf->data, new_asize); + if(buf->data == NULL) { + fprintf(stderr, "membuf_grow: realloc() failed.\n"); + exit(1); + } + buf->asize = new_asize; +} + +static void +membuf_append(struct membuffer* buf, const char* data, MD_SIZE size) +{ + if(buf->asize < buf->size + size) + membuf_grow(buf, buf->size + buf->size / 2 + size); + memcpy(buf->data + buf->size, data, size); + buf->size += size; +} + + +/********************** + *** Main program *** + **********************/ + +static void +process_output(const MD_CHAR* text, MD_SIZE size, void* userdata) +{ + membuf_append((struct membuffer*) userdata, text, size); +} + +static int +process_file(FILE* in, FILE* out) +{ + MD_SIZE n; + struct membuffer buf_in = {0}; + struct membuffer buf_out = {0}; + int ret = -1; + clock_t t0, t1; + + membuf_init(&buf_in, 32 * 1024); + + /* Read the input file into a buffer. 
*/ + while(1) { + if(buf_in.size >= buf_in.asize) + membuf_grow(&buf_in, buf_in.asize + buf_in.asize / 2); + + n = fread(buf_in.data + buf_in.size, 1, buf_in.asize - buf_in.size, in); + if(n == 0) + break; + buf_in.size += n; + } + + /* Input size is good estimation of output size. Add some more reserve to + * deal with the HTML header/footer and tags. */ + membuf_init(&buf_out, buf_in.size + buf_in.size/8 + 64); + + /* Parse the document. This shall call our callbacks provided via the + * md_renderer_t structure. */ + t0 = clock(); + + ret = md_render_html(buf_in.data, buf_in.size, process_output, + (void*) &buf_out, parser_flags, renderer_flags); + + t1 = clock(); + if(ret != 0) { + fprintf(stderr, "Parsing failed.\n"); + goto out; + } + + /* Write down the document in the HTML format. */ + if(want_fullhtml) { + fprintf(out, "\n"); + fprintf(out, "\n"); + fprintf(out, "\n"); + fprintf(out, "\n"); + fprintf(out, "\n"); + fprintf(out, "\n"); + } + + fwrite(buf_out.data, 1, buf_out.size, out); + + if(want_fullhtml) { + fprintf(out, "\n"); + fprintf(out, "\n"); + } + + if(want_stat) { + if(t0 != (clock_t)-1 && t1 != (clock_t)-1) { + double elapsed = (double)(t1 - t0) / CLOCKS_PER_SEC; + if (elapsed < 1) + fprintf(stderr, "Time spent on parsing: %7.2f ms.\n", elapsed*1e3); + else + fprintf(stderr, "Time spent on parsing: %6.3f s.\n", elapsed); + } + } + + /* Success if we have reached here. 
*/ + ret = 0; + +out: + membuf_fini(&buf_in); + membuf_fini(&buf_out); + + return ret; +} + + +#define OPTION_ARG_NONE 0 +#define OPTION_ARG_REQUIRED 1 +#define OPTION_ARG_OPTIONAL 2 + +static const option cmdline_options[] = { + { "output", 'o', 'o', OPTION_ARG_REQUIRED }, + { "full-html", 'f', 'f', OPTION_ARG_NONE }, + { "stat", 's', 's', OPTION_ARG_NONE }, + { "help", 'h', 'h', OPTION_ARG_NONE }, + { "version", 'v', 'v', OPTION_ARG_NONE }, + + { "commonmark", 0, 'c', OPTION_ARG_NONE }, + { "github", 0, 'g', OPTION_ARG_NONE }, + + { "fcollapse-whitespace", 0, 'W', OPTION_ARG_NONE }, + { "flatex-math", 0, 'L', OPTION_ARG_NONE }, + { "fpermissive-atx-headers", 0, 'A', OPTION_ARG_NONE }, + { "fpermissive-autolinks", 0, 'V', OPTION_ARG_NONE }, + { "fpermissive-email-autolinks", 0, '@', OPTION_ARG_NONE }, + { "fpermissive-url-autolinks", 0, 'U', OPTION_ARG_NONE }, + { "fpermissive-www-autolinks", 0, '.', OPTION_ARG_NONE }, + { "fstrikethrough", 0, 'S', OPTION_ARG_NONE }, + { "ftables", 0, 'T', OPTION_ARG_NONE }, + { "ftasklists", 0, 'X', OPTION_ARG_NONE }, + { "funderline", 0, '_', OPTION_ARG_NONE }, + { "fverbatim-entities", 0, 'E', OPTION_ARG_NONE }, + { "fwiki-links", 0, 'K', OPTION_ARG_NONE }, + + { "fno-html-blocks", 0, 'F', OPTION_ARG_NONE }, + { "fno-html-spans", 0, 'G', OPTION_ARG_NONE }, + { "fno-html", 0, 'H', OPTION_ARG_NONE }, + { "fno-indented-code", 0, 'I', OPTION_ARG_NONE }, + + { 0 } +}; + +static void +usage(void) +{ + printf( + "Usage: md2html [OPTION]... 
[FILE]\n" + "Convert input FILE (or standard input) in Markdown format to HTML.\n" + "\n" + "General options:\n" + " -o --output=FILE Output file (default is standard output)\n" + " -f, --full-html Generate full HTML document, including header\n" + " -s, --stat Measure time of input parsing\n" + " -h, --help Display this help and exit\n" + " -v, --version Display version and exit\n" + "\n" + "Markdown dialect options:\n" + "(note these are equivalent to some combinations of the flags below)\n" + " --commonmark CommonMark (this is default)\n" + " --github Github Flavored Markdown\n" + "\n" + "Markdown extension options:\n" + " --fcollapse-whitespace\n" + " Collapse non-trivial whitespace\n" + " --flatex-math Enable LaTeX style mathematics spans\n" + " --fpermissive-atx-headers\n" + " Allow ATX headers without delimiting space\n" + " --fpermissive-url-autolinks\n" + " Allow URL autolinks without '<', '>'\n" + " --fpermissive-www-autolinks\n" + " Allow WWW autolinks without any scheme (e.g. 
'www.example.com')\n" + " --fpermissive-email-autolinks \n" + " Allow e-mail autolinks without '<', '>' and 'mailto:'\n" + " --fpermissive-autolinks\n" + " Same as --fpermissive-url-autolinks --fpermissive-www-autolinks\n" + " --fpermissive-email-autolinks\n" + " --fstrikethrough Enable strike-through spans\n" + " --ftables Enable tables\n" + " --ftasklists Enable task lists\n" + " --funderline Enable underline spans\n" + " --fwiki-links Enable wiki links\n" + "\n" + "Markdown suppression options:\n" + " --fno-html-blocks\n" + " Disable raw HTML blocks\n" + " --fno-html-spans\n" + " Disable raw HTML spans\n" + " --fno-html Same as --fno-html-blocks --fno-html-spans\n" + " --fno-indented-code\n" + " Disable indented code blocks\n" + "\n" + "HTML generator options:\n" + " --fverbatim-entities\n" + " Do not translate entities\n" + "\n" + ); +} + +static void +version(void) +{ + printf("%d.%d.%d\n", MD_VERSION_MAJOR, MD_VERSION_MINOR, MD_VERSION_RELEASE); +} + +static const char* input_path = NULL; +static const char* output_path = NULL; + +static int +cmdline_callback(int opt, char const* value, void* data) +{ + switch(opt) { + case 0: + if(input_path) { + fprintf(stderr, "Too many arguments. 
Only one input file can be specified.\n"); + fprintf(stderr, "Use --help for more info.\n"); + exit(1); + } + input_path = value; + break; + + case 'o': output_path = value; break; + case 'f': want_fullhtml = 1; break; + case 's': want_stat = 1; break; + case 'h': usage(); exit(0); break; + case 'v': version(); exit(0); break; + + case 'c': parser_flags = MD_DIALECT_COMMONMARK; break; + case 'g': parser_flags = MD_DIALECT_GITHUB; break; + + case 'E': renderer_flags |= MD_RENDER_FLAG_VERBATIM_ENTITIES; break; + case 'A': parser_flags |= MD_FLAG_PERMISSIVEATXHEADERS; break; + case 'I': parser_flags |= MD_FLAG_NOINDENTEDCODEBLOCKS; break; + case 'F': parser_flags |= MD_FLAG_NOHTMLBLOCKS; break; + case 'G': parser_flags |= MD_FLAG_NOHTMLSPANS; break; + case 'H': parser_flags |= MD_FLAG_NOHTML; break; + case 'W': parser_flags |= MD_FLAG_COLLAPSEWHITESPACE; break; + case 'U': parser_flags |= MD_FLAG_PERMISSIVEURLAUTOLINKS; break; + case '.': parser_flags |= MD_FLAG_PERMISSIVEWWWAUTOLINKS; break; + case '@': parser_flags |= MD_FLAG_PERMISSIVEEMAILAUTOLINKS; break; + case 'V': parser_flags |= MD_FLAG_PERMISSIVEAUTOLINKS; break; + case 'T': parser_flags |= MD_FLAG_TABLES; break; + case 'S': parser_flags |= MD_FLAG_STRIKETHROUGH; break; + case 'L': parser_flags |= MD_FLAG_LATEXMATHSPANS; break; + case 'K': parser_flags |= MD_FLAG_WIKILINKS; break; + case 'X': parser_flags |= MD_FLAG_TASKLISTS; break; + case '_': parser_flags |= MD_FLAG_UNDERLINE; break; + + default: + fprintf(stderr, "Illegal option: %s\n", value); + fprintf(stderr, "Use --help for more info.\n"); + exit(1); + break; + } + + return 0; +} + +int +main(int argc, char** argv) +{ + FILE* in = stdin; + FILE* out = stdout; + int ret = 0; + + if(readoptions(cmdline_options, argc, argv, cmdline_callback, NULL) < 0) { + usage(); + exit(1); + } + + if(input_path != NULL && strcmp(input_path, "-") != 0) { + in = fopen(input_path, "rb"); + if(in == NULL) { + fprintf(stderr, "Cannot open %s.\n", input_path); + exit(1); + 
} + } + if(output_path != NULL && strcmp(output_path, "-") != 0) { + out = fopen(output_path, "wt"); + if(out == NULL) { + fprintf(stderr, "Cannot open %s.\n", output_path); + exit(1); + } + } + + ret = process_file(in, out); + if(in != stdin) + fclose(in); + if(out != stdout) + fclose(out); + + return ret; +} diff --git a/md2html/render_html.c b/md2html/render_html.c new file mode 100644 index 0000000..896b37f --- /dev/null +++ b/md2html/render_html.c @@ -0,0 +1,561 @@ +/* + * MD4C: Markdown parser for C + * (http://github.com/mity/md4c) + * + * Copyright (c) 2016-2019 Martin Mitas + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include + +#include "render_html.h" +#include "entity.h" + + +#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199409L + /* C89/90 or old compilers in general may not understand "inline". 
*/ + #if defined __GNUC__ + #define inline __inline__ + #elif defined _MSC_VER + #define inline __inline + #else + #define inline + #endif +#endif + +#ifdef _WIN32 + #define snprintf _snprintf +#endif + + + +typedef struct MD_RENDER_HTML_tag MD_RENDER_HTML; +struct MD_RENDER_HTML_tag { + void (*process_output)(const MD_CHAR*, MD_SIZE, void*); + void* userdata; + unsigned flags; + int image_nesting_level; + char escape_map[256]; +}; + +#define NEED_HTML_ESC_FLAG 0x1 +#define NEED_URL_ESC_FLAG 0x2 + + +/***************************************** + *** HTML rendering helper functions *** + *****************************************/ + +#define ISDIGIT(ch) ('0' <= (ch) && (ch) <= '9') +#define ISLOWER(ch) ('a' <= (ch) && (ch) <= 'z') +#define ISUPPER(ch) ('A' <= (ch) && (ch) <= 'Z') +#define ISALNUM(ch) (ISLOWER(ch) || ISUPPER(ch) || ISDIGIT(ch)) + + +static inline void +render_verbatim(MD_RENDER_HTML* r, const MD_CHAR* text, MD_SIZE size) +{ + r->process_output(text, size, r->userdata); +} + +/* Keep this as a macro. Most compiler should then be smart enough to replace + * the strlen() call with a compile-time constant if the string is a C literal. */ +#define RENDER_VERBATIM(r, verbatim) \ + render_verbatim((r), (verbatim), (MD_SIZE) (strlen(verbatim))) + + +static void +render_html_escaped(MD_RENDER_HTML* r, const MD_CHAR* data, MD_SIZE size) +{ + MD_OFFSET beg = 0; + MD_OFFSET off = 0; + + /* Some characters need to be escaped in normal HTML text. */ + #define NEED_HTML_ESC(ch) (r->escape_map[(unsigned char)(ch)] & NEED_HTML_ESC_FLAG) + + while(1) { + /* Optimization: Use some loop unrolling. 
*/ + while(off + 3 < size && !NEED_HTML_ESC(data[off+0]) && !NEED_HTML_ESC(data[off+1]) + && !NEED_HTML_ESC(data[off+2]) && !NEED_HTML_ESC(data[off+3])) + off += 4; + while(off < size && !NEED_HTML_ESC(data[off])) + off++; + + if(off > beg) + render_verbatim(r, data + beg, off - beg); + + if(off < size) { + switch(data[off]) { + case '&': RENDER_VERBATIM(r, "&"); break; + case '<': RENDER_VERBATIM(r, "<"); break; + case '>': RENDER_VERBATIM(r, ">"); break; + case '"': RENDER_VERBATIM(r, """); break; + } + off++; + } else { + break; + } + beg = off; + } +} + +static void +render_url_escaped(MD_RENDER_HTML* r, const MD_CHAR* data, MD_SIZE size) +{ + static const MD_CHAR hex_chars[] = "0123456789ABCDEF"; + MD_OFFSET beg = 0; + MD_OFFSET off = 0; + + /* Some characters need to be escaped in URL attributes. */ + #define NEED_URL_ESC(ch) (r->escape_map[(unsigned char)(ch)] & NEED_URL_ESC_FLAG) + + while(1) { + while(off < size && !NEED_URL_ESC(data[off])) + off++; + if(off > beg) + render_verbatim(r, data + beg, off - beg); + + if(off < size) { + char hex[3]; + + switch(data[off]) { + case '&': RENDER_VERBATIM(r, "&"); break; + default: + hex[0] = '%'; + hex[1] = hex_chars[((unsigned)data[off] >> 4) & 0xf]; + hex[2] = hex_chars[((unsigned)data[off] >> 0) & 0xf]; + render_verbatim(r, hex, 3); + break; + } + off++; + } else { + break; + } + + beg = off; + } +} + +static unsigned +hex_val(char ch) +{ + if('0' <= ch && ch <= '9') + return ch - '0'; + if('A' <= ch && ch <= 'Z') + return ch - 'A' + 10; + else + return ch - 'a' + 10; +} + +static void +render_utf8_codepoint(MD_RENDER_HTML* r, unsigned codepoint, + void (*fn_append)(MD_RENDER_HTML*, const MD_CHAR*, MD_SIZE)) +{ + static const MD_CHAR utf8_replacement_char[] = { 0xef, 0xbf, 0xbd }; + + unsigned char utf8[4]; + size_t n; + + if(codepoint <= 0x7f) { + n = 1; + utf8[0] = codepoint; + } else if(codepoint <= 0x7ff) { + n = 2; + utf8[0] = 0xc0 | ((codepoint >> 6) & 0x1f); + utf8[1] = 0x80 + ((codepoint >> 0) & 0x3f); + 
} else if(codepoint <= 0xffff) { + n = 3; + utf8[0] = 0xe0 | ((codepoint >> 12) & 0xf); + utf8[1] = 0x80 + ((codepoint >> 6) & 0x3f); + utf8[2] = 0x80 + ((codepoint >> 0) & 0x3f); + } else { + n = 4; + utf8[0] = 0xf0 | ((codepoint >> 18) & 0x7); + utf8[1] = 0x80 + ((codepoint >> 12) & 0x3f); + utf8[2] = 0x80 + ((codepoint >> 6) & 0x3f); + utf8[3] = 0x80 + ((codepoint >> 0) & 0x3f); + } + + if(0 < codepoint && codepoint <= 0x10ffff) + fn_append(r, (char*)utf8, n); + else + fn_append(r, utf8_replacement_char, 3); +} + +/* Translate entity to its UTF-8 equivalent, or output the verbatim one + * if such entity is unknown (or if the translation is disabled). */ +static void +render_entity(MD_RENDER_HTML* r, const MD_CHAR* text, MD_SIZE size, + void (*fn_append)(MD_RENDER_HTML*, const MD_CHAR*, MD_SIZE)) +{ + if(r->flags & MD_RENDER_FLAG_VERBATIM_ENTITIES) { + fn_append(r, text, size); + return; + } + + /* We assume UTF-8 output is what is desired. */ + if(size > 3 && text[1] == '#') { + unsigned codepoint = 0; + + if(text[2] == 'x' || text[2] == 'X') { + /* Hexadecimal entity (e.g. "�")). */ + MD_SIZE i; + for(i = 3; i < size-1; i++) + codepoint = 16 * codepoint + hex_val(text[i]); + } else { + /* Decimal entity (e.g. "&1234;") */ + MD_SIZE i; + for(i = 2; i < size-1; i++) + codepoint = 10 * codepoint + (text[i] - '0'); + } + + render_utf8_codepoint(r, codepoint, fn_append); + return; + } else { + /* Named entity (e.g. " "). 
*/ + const struct entity* ent; + + ent = entity_lookup(text, size); + if(ent != NULL) { + render_utf8_codepoint(r, ent->codepoints[0], fn_append); + if(ent->codepoints[1]) + render_utf8_codepoint(r, ent->codepoints[1], fn_append); + return; + } + } + + fn_append(r, text, size); +} + +static void +render_attribute(MD_RENDER_HTML* r, const MD_ATTRIBUTE* attr, + void (*fn_append)(MD_RENDER_HTML*, const MD_CHAR*, MD_SIZE)) +{ + int i; + + for(i = 0; attr->substr_offsets[i] < attr->size; i++) { + MD_TEXTTYPE type = attr->substr_types[i]; + MD_OFFSET off = attr->substr_offsets[i]; + MD_SIZE size = attr->substr_offsets[i+1] - off; + const MD_CHAR* text = attr->text + off; + + switch(type) { + case MD_TEXT_NULLCHAR: render_utf8_codepoint(r, 0x0000, render_verbatim); break; + case MD_TEXT_ENTITY: render_entity(r, text, size, fn_append); break; + default: fn_append(r, text, size); break; + } + } +} + + +static void +render_open_ol_block(MD_RENDER_HTML* r, const MD_BLOCK_OL_DETAIL* det) +{ + char buf[64]; + + if(det->start == 1) { + RENDER_VERBATIM(r, "
    \n"); + return; + } + + snprintf(buf, sizeof(buf), "
      \n", det->start); + RENDER_VERBATIM(r, buf); +} + +static void +render_open_li_block(MD_RENDER_HTML* r, const MD_BLOCK_LI_DETAIL* det) +{ + if(det->is_task) { + RENDER_VERBATIM(r, "
    1. " + "task_mark == 'x' || det->task_mark == 'X') + RENDER_VERBATIM(r, " checked"); + RENDER_VERBATIM(r, ">"); + } else { + RENDER_VERBATIM(r, "
    2. "); + } +} + +static void +render_open_code_block(MD_RENDER_HTML* r, const MD_BLOCK_CODE_DETAIL* det) +{ + RENDER_VERBATIM(r, "
      lang.text != NULL) {
      +        RENDER_VERBATIM(r, " class=\"language-");
      +        render_attribute(r, &det->lang, render_html_escaped);
      +        RENDER_VERBATIM(r, "\"");
      +    }
      +
      +    RENDER_VERBATIM(r, ">");
      +}
      +/* Emit the opening tag of a table cell.
      + *
      + * cell_type is written verbatim as the tag name right after "<"; the calls
      + * in enter_block_callback() pass "th" or "td". It has to be a NUL-terminated
      + * string because RENDER_VERBATIM() measures its argument with strlen().
      + * det->align selects an optional align="..." attribute; any alignment value
      + * not listed in the switch below yields a bare ">".
      + */
      +static void
      +render_open_td_block(MD_RENDER_HTML* r, const MD_CHAR* cell_type, const MD_BLOCK_TD_DETAIL* det)
      +{
      +    RENDER_VERBATIM(r, "<");
      +    RENDER_VERBATIM(r, cell_type);
      +    /* Finish the tag: the align attribute (if any) and the closing ">" go out in one piece. */
      +    switch(det->align) {
      +        case MD_ALIGN_LEFT:     RENDER_VERBATIM(r, " align=\"left\">"); break;
      +        case MD_ALIGN_CENTER:   RENDER_VERBATIM(r, " align=\"center\">"); break;
      +        case MD_ALIGN_RIGHT:    RENDER_VERBATIM(r, " align=\"right\">"); break;
      +        default:                RENDER_VERBATIM(r, ">"); break;
      +    }
      +}
      +
      +/* Emit the opening tag of a link span: the href attribute (URL-escaped)
      + * followed by an optional title attribute (HTML-escaped).
      + *
      + * det->title.text == NULL means the link has no title and the attribute is
      + * omitted altogether.
      + */
      +static void
      +render_open_a_span(MD_RENDER_HTML* r, const MD_SPAN_A_DETAIL* det)
      +{
      +    RENDER_VERBATIM(r, "<a href=\"");
      +    render_attribute(r, &det->href, render_url_escaped);
      +
      +    if(det->title.text != NULL) {
      +        /* The leading quote closes the href value opened above. */
      +        RENDER_VERBATIM(r, "\" title=\"");
      +        render_attribute(r, &det->title, render_html_escaped);
      +    }
      +
      +    /* Close whichever attribute value is still open, then the tag itself. */
      +    RENDER_VERBATIM(r, "\">");
      +}
      +
      +/* Start an image tag: emit the src attribute (URL-escaped) and open the alt
      + * attribute value, which is deliberately left unterminated here.
      + *
      + * The text of the image label is rendered into that open alt value by the
      + * text callback; render_close_img_span() later closes it and the tag.
      + */
      +static void
      +render_open_img_span(MD_RENDER_HTML* r, const MD_SPAN_IMG_DETAIL* det)
      +{
      +    RENDER_VERBATIM(r, "<img src=\"");
      +    render_attribute(r, &det->src, render_url_escaped);
      +
      +    /* The leading quote closes the src value; alt stays open for the label. */
      +    RENDER_VERBATIM(r, "\" alt=\"");
      +
      +    /* While this is non-zero the span callbacks emit no nested tags. */
      +    r->image_nesting_level++;
      +}
      +/* Finish an image tag once the text of its label has been rendered.
      + *
      + * An attribute value is still open when this runs (render_open_img_span()
      + * stops right after opening alt), which is why both literals below start
      + * with a closing quote. det->title.text == NULL means there is no title
      + * attribute to emit.
      + */
      +static void
      +render_close_img_span(MD_RENDER_HTML* r, const MD_SPAN_IMG_DETAIL* det)
      +{
      +    if(det->title.text != NULL) {
      +        RENDER_VERBATIM(r, "\" title=\"");
      +        render_attribute(r, &det->title, render_html_escaped);
      +    }
      +    /* Close the attribute value that is still open, then the tag itself. */
      +    RENDER_VERBATIM(r, "\">");
      +    /* Undo the increment from render_open_img_span(); span callbacks skip tag output while this is > 0. */
      +    r->image_nesting_level--;
      +}
      +
      +/* Emit the opening tag of a wiki link span, carrying the link target
      + * (HTML-escaped) as an attribute value.
      + *
      + * NOTE(review): the first literal was truncated in this copy of the patch;
      + * the tag/attribute name below is restored from memory of upstream md4c.
      + * Confirm it against the expected output in test/wiki-links.txt.
      + */
      +static void
      +render_open_wikilink_span(MD_RENDER_HTML* r, const MD_SPAN_WIKILINK_DETAIL* det)
      +{
      +    RENDER_VERBATIM(r, "<x-wikilink data-target=\"");
      +    render_attribute(r, &det->target, render_html_escaped);
      +
      +    /* Close the attribute value and the tag. */
      +    RENDER_VERBATIM(r, "\">");
      +}
      +
      +
      +/**************************************
      + ***  HTML renderer implementation  ***
      + **************************************/
      +
      +static int
      +enter_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata)
      +{
      +    static const MD_CHAR* head[6] = { "

      ", "

      ", "

      ", "

      ", "

      ", "
      " }; + MD_RENDER_HTML* r = (MD_RENDER_HTML*) userdata; + + switch(type) { + case MD_BLOCK_DOC: /* noop */ break; + case MD_BLOCK_QUOTE: RENDER_VERBATIM(r, "
      \n"); break; + case MD_BLOCK_UL: RENDER_VERBATIM(r, "
        \n"); break; + case MD_BLOCK_OL: render_open_ol_block(r, (const MD_BLOCK_OL_DETAIL*)detail); break; + case MD_BLOCK_LI: render_open_li_block(r, (const MD_BLOCK_LI_DETAIL*)detail); break; + case MD_BLOCK_HR: RENDER_VERBATIM(r, "
        \n"); break; + case MD_BLOCK_H: RENDER_VERBATIM(r, head[((MD_BLOCK_H_DETAIL*)detail)->level - 1]); break; + case MD_BLOCK_CODE: render_open_code_block(r, (const MD_BLOCK_CODE_DETAIL*) detail); break; + case MD_BLOCK_HTML: /* noop */ break; + case MD_BLOCK_P: RENDER_VERBATIM(r, "

        "); break; + case MD_BLOCK_TABLE: RENDER_VERBATIM(r, "\n"); break; + case MD_BLOCK_THEAD: RENDER_VERBATIM(r, "\n"); break; + case MD_BLOCK_TBODY: RENDER_VERBATIM(r, "\n"); break; + case MD_BLOCK_TR: RENDER_VERBATIM(r, "\n"); break; + case MD_BLOCK_TH: render_open_td_block(r, "th", (MD_BLOCK_TD_DETAIL*)detail); break; + case MD_BLOCK_TD: render_open_td_block(r, "td", (MD_BLOCK_TD_DETAIL*)detail); break; + } + + return 0; +} + +static int +leave_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata) +{ + static const MD_CHAR* head[6] = { "\n", "\n", "\n", "\n", "\n", "\n" }; + MD_RENDER_HTML* r = (MD_RENDER_HTML*) userdata; + + switch(type) { + case MD_BLOCK_DOC: /*noop*/ break; + case MD_BLOCK_QUOTE: RENDER_VERBATIM(r, "\n"); break; + case MD_BLOCK_UL: RENDER_VERBATIM(r, "\n"); break; + case MD_BLOCK_OL: RENDER_VERBATIM(r, "\n"); break; + case MD_BLOCK_LI: RENDER_VERBATIM(r, "\n"); break; + case MD_BLOCK_HR: /*noop*/ break; + case MD_BLOCK_H: RENDER_VERBATIM(r, head[((MD_BLOCK_H_DETAIL*)detail)->level - 1]); break; + case MD_BLOCK_CODE: RENDER_VERBATIM(r, "\n"); break; + case MD_BLOCK_HTML: /* noop */ break; + case MD_BLOCK_P: RENDER_VERBATIM(r, "

        \n"); break; + case MD_BLOCK_TABLE: RENDER_VERBATIM(r, "
        \n"); break; + case MD_BLOCK_THEAD: RENDER_VERBATIM(r, "\n"); break; + case MD_BLOCK_TBODY: RENDER_VERBATIM(r, "\n"); break; + case MD_BLOCK_TR: RENDER_VERBATIM(r, "\n"); break; + case MD_BLOCK_TH: RENDER_VERBATIM(r, "\n"); break; + case MD_BLOCK_TD: RENDER_VERBATIM(r, "\n"); break; + } + + return 0; +} + +static int +enter_span_callback(MD_SPANTYPE type, void* detail, void* userdata) +{ + MD_RENDER_HTML* r = (MD_RENDER_HTML*) userdata; + + if(r->image_nesting_level > 0) { + /* We are inside a Markdown image label. Markdown allows to use any + * emphasis and other rich contents in that context similarly as in + * any link label. + * + * However, unlike in the case of links (where that contents becomes + * contents of the
        ... tag), in the case of images the contents + * is supposed to fall into the attribute alt: .... + * + * In that context we naturally cannot output nested HTML tags. So lets + * suppress them and only output the plain text (i.e. what falls into + * text() callback). + * + * This make-it-a-plain-text approach is the recommended practice by + * CommonMark specification (for HTML output). + */ + return 0; + } + + switch(type) { + case MD_SPAN_EM: RENDER_VERBATIM(r, ""); break; + case MD_SPAN_STRONG: RENDER_VERBATIM(r, ""); break; + case MD_SPAN_U: RENDER_VERBATIM(r, ""); break; + case MD_SPAN_A: render_open_a_span(r, (MD_SPAN_A_DETAIL*) detail); break; + case MD_SPAN_IMG: render_open_img_span(r, (MD_SPAN_IMG_DETAIL*) detail); break; + case MD_SPAN_CODE: RENDER_VERBATIM(r, ""); break; + case MD_SPAN_DEL: RENDER_VERBATIM(r, ""); break; + case MD_SPAN_LATEXMATH: RENDER_VERBATIM(r, ""); break; + case MD_SPAN_LATEXMATH_DISPLAY: RENDER_VERBATIM(r, ""); break; + case MD_SPAN_WIKILINK: render_open_wikilink_span(r, (MD_SPAN_WIKILINK_DETAIL*) detail); break; + } + + return 0; +} + +static int +leave_span_callback(MD_SPANTYPE type, void* detail, void* userdata) +{ + MD_RENDER_HTML* r = (MD_RENDER_HTML*) userdata; + + if(r->image_nesting_level > 0) { + /* Ditto as in enter_span_callback(), except we have to allow the + * end of the tag. 
*/ + if(r->image_nesting_level == 1 && type == MD_SPAN_IMG) + render_close_img_span(r, (MD_SPAN_IMG_DETAIL*) detail); + return 0; + } + + switch(type) { + case MD_SPAN_EM: RENDER_VERBATIM(r, ""); break; + case MD_SPAN_STRONG: RENDER_VERBATIM(r, ""); break; + case MD_SPAN_U: RENDER_VERBATIM(r, ""); break; + case MD_SPAN_A: RENDER_VERBATIM(r, ""); break; + case MD_SPAN_IMG: /*noop, handled above*/ break; + case MD_SPAN_CODE: RENDER_VERBATIM(r, ""); break; + case MD_SPAN_DEL: RENDER_VERBATIM(r, ""); break; + case MD_SPAN_LATEXMATH: /*fall through*/ + case MD_SPAN_LATEXMATH_DISPLAY: RENDER_VERBATIM(r, ""); break; + case MD_SPAN_WIKILINK: RENDER_VERBATIM(r, ""); break; + } + + return 0; +} + +static int +text_callback(MD_TEXTTYPE type, const MD_CHAR* text, MD_SIZE size, void* userdata) +{ + MD_RENDER_HTML* r = (MD_RENDER_HTML*) userdata; + + switch(type) { + case MD_TEXT_NULLCHAR: render_utf8_codepoint(r, 0x0000, render_verbatim); break; + case MD_TEXT_BR: RENDER_VERBATIM(r, (r->image_nesting_level == 0 ? "
        \n" : " ")); break; + case MD_TEXT_SOFTBR: RENDER_VERBATIM(r, (r->image_nesting_level == 0 ? "\n" : " ")); break; + case MD_TEXT_HTML: render_verbatim(r, text, size); break; + case MD_TEXT_ENTITY: render_entity(r, text, size, render_html_escaped); break; + default: render_html_escaped(r, text, size); break; + } + + return 0; +} + +static void +debug_log_callback(const char* msg, void* userdata) +{ + MD_RENDER_HTML* r = (MD_RENDER_HTML*) userdata; + if(r->flags & MD_RENDER_FLAG_DEBUG) + fprintf(stderr, "MD4C: %s\n", msg); +} + +int +md_render_html(const MD_CHAR* input, MD_SIZE input_size, + void (*process_output)(const MD_CHAR*, MD_SIZE, void*), + void* userdata, unsigned parser_flags, unsigned renderer_flags) +{ + MD_RENDER_HTML render = { process_output, userdata, renderer_flags, 0, { 0 } }; + int i; + + MD_PARSER parser = { + 0, + parser_flags, + enter_block_callback, + leave_block_callback, + enter_span_callback, + leave_span_callback, + text_callback, + debug_log_callback, + NULL + }; + + /* Build map of characters which need escaping. 
*/ + for(i = 0; i < 256; i++) { + unsigned char ch = (unsigned char) i; + + if(strchr("\"&<>", ch) != NULL) + render.escape_map[i] |= NEED_HTML_ESC_FLAG; + + if(!ISALNUM(ch) && strchr("-_.+!*(),%#@?=;:/,+$", ch) == NULL) + render.escape_map[i] |= NEED_URL_ESC_FLAG; + } + + return md_parse(input, input_size, &parser, (void*) &render); +} + diff --git a/md2html/render_html.h b/md2html/render_html.h new file mode 100644 index 0000000..968dc8e --- /dev/null +++ b/md2html/render_html.h @@ -0,0 +1,66 @@ +/* + * MD4C: Markdown parser for C + * (http://github.com/mity/md4c) + * + * Copyright (c) 2016-2017 Martin Mitas + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef MD4C_RENDER_HTML_H +#define MD4C_RENDER_HTML_H + +#include "md4c.h" + +#ifdef __cplusplus + extern "C" { +#endif + + +/* If set, debug output from md_parse() is sent to stderr. 
*/ +#define MD_RENDER_FLAG_DEBUG 0x0001 +#define MD_RENDER_FLAG_VERBATIM_ENTITIES 0x0002 + + +/* Render Markdown into HTML. + * + * Note only contents of tag is generated. Caller must generate + * HTML header/footer manually before/after calling md_render_html(). + * + * Params input and input_size specify the Markdown input. + * Callback process_output() gets called with chunks of HTML output. + * (Typical implementation may just output the bytes to file or append to + * some buffer). + * Param userdata is just propgated back to process_output() callback. + * Param parser_flags are flags from md4c.h propagated to md_parse(). + * Param render_flags is bitmask of MD_RENDER_FLAG_xxxx. + * + * Returns -1 on error (if md_parse() fails.) + * Returns 0 on success. + */ +int md_render_html(const MD_CHAR* input, MD_SIZE input_size, + void (*process_output)(const MD_CHAR*, MD_SIZE, void*), + void* userdata, unsigned parser_flags, unsigned renderer_flags); + + +#ifdef __cplusplus + } /* extern "C" { */ +#endif + +#endif /* MD4C_RENDER_HTML_H */ diff --git a/md4c/CMakeLists.txt b/md4c/CMakeLists.txt new file mode 100644 index 0000000..e72578d --- /dev/null +++ b/md4c/CMakeLists.txt @@ -0,0 +1,32 @@ +# Be sure to export all symbols in Windows. 
+set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS 1) + +set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -DDEBUG") + +set(md4c_src + md4c.c +) + +add_library(md4c ${md4c_src}) + +set_target_properties(md4c PROPERTIES + VERSION ${MD_VERSION} + SOVERSION ${MD_VERSION_MAJOR} + PUBLIC_HEADER md4c.h +) + +install( + TARGETS md4c + EXPORT md4cConfig + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) + +# Create a pkg-config file +configure_file(md4c.pc.in md4c.pc @ONLY) +install(FILES ${CMAKE_BINARY_DIR}/md4c/md4c.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) + +# And a CMake file +install(EXPORT md4cConfig DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/md4c/) diff --git a/md4c/md4c.c b/md4c/md4c.c new file mode 100644 index 0000000..b0ef739 --- /dev/null +++ b/md4c/md4c.c @@ -0,0 +1,6309 @@ +/* + * MD4C: Markdown parser for C + * (http://github.com/mity/md4c) + * + * Copyright (c) 2016-2020 Martin Mitas + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "md4c.h" + +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + + +/***************************** + *** Miscellaneous Stuff *** + *****************************/ + +#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199409L + /* C89/90 or old compilers in general may not understand "inline". */ + #if defined __GNUC__ + #define inline __inline__ + #elif defined _MSC_VER + #define inline __inline + #else + #define inline + #endif +#endif + +/* Make the UTF-8 support the default. */ +#if !defined MD4C_USE_ASCII && !defined MD4C_USE_UTF8 && !defined MD4C_USE_UTF16 + #define MD4C_USE_UTF8 +#endif + +/* Magic for making wide literals with MD4C_USE_UTF16. */ +#ifdef _T + #undef _T +#endif +#if defined MD4C_USE_UTF16 + #define _T(x) L##x +#else + #define _T(x) x +#endif + +/* Misc. macros. */ +#define SIZEOF_ARRAY(a) (sizeof(a) / sizeof(a[0])) + +#define STRINGIZE_(x) #x +#define STRINGIZE(x) STRINGIZE_(x) + +#ifndef TRUE + #define TRUE 1 + #define FALSE 0 +#endif + + +/************************ + *** Internal Types *** + ************************/ + +/* These are omnipresent so let's save some typing. */ +#define CHAR MD_CHAR +#define SZ MD_SIZE +#define OFF MD_OFFSET + +typedef struct MD_MARK_tag MD_MARK; +typedef struct MD_BLOCK_tag MD_BLOCK; +typedef struct MD_CONTAINER_tag MD_CONTAINER; +typedef struct MD_REF_DEF_tag MD_REF_DEF; + + +/* During analysis of inline marks, we need to manage some "mark chains", + * of (yet unresolved) openers. This structure holds start/end of the chain. + * The chain internals are then realized through MD_MARK::prev and ::next. 
+ */ +typedef struct MD_MARKCHAIN_tag MD_MARKCHAIN; +struct MD_MARKCHAIN_tag { + int head; /* Index of first mark in the chain, or -1 if empty. */ + int tail; /* Index of last mark in the chain, or -1 if empty. */ +}; + +/* Context propagated through all the parsing. */ +typedef struct MD_CTX_tag MD_CTX; +struct MD_CTX_tag { + /* Immutable stuff (parameters of md_parse()). */ + const CHAR* text; + SZ size; + MD_PARSER parser; + void* userdata; + + /* When this is true, it allows some optimizations. */ + int doc_ends_with_newline; + + /* Helper temporary growing buffer. */ + CHAR* buffer; + unsigned alloc_buffer; + + /* Reference definitions. */ + MD_REF_DEF* ref_defs; + int n_ref_defs; + int alloc_ref_defs; + void** ref_def_hashtable; + int ref_def_hashtable_size; + + /* Stack of inline/span markers. + * This is only used for parsing a single block contents but by storing it + * here we may reuse the stack for subsequent blocks; i.e. we have fewer + * (re)allocations. */ + MD_MARK* marks; + int n_marks; + int alloc_marks; + +#if defined MD4C_USE_UTF16 + char mark_char_map[128]; +#else + char mark_char_map[256]; +#endif + + /* For resolving of inline spans. 
*/ + MD_MARKCHAIN mark_chains[13]; +#define PTR_CHAIN ctx->mark_chains[0] +#define TABLECELLBOUNDARIES ctx->mark_chains[1] +#define ASTERISK_OPENERS_extraword_mod3_0 ctx->mark_chains[2] +#define ASTERISK_OPENERS_extraword_mod3_1 ctx->mark_chains[3] +#define ASTERISK_OPENERS_extraword_mod3_2 ctx->mark_chains[4] +#define ASTERISK_OPENERS_intraword_mod3_0 ctx->mark_chains[5] +#define ASTERISK_OPENERS_intraword_mod3_1 ctx->mark_chains[6] +#define ASTERISK_OPENERS_intraword_mod3_2 ctx->mark_chains[7] +#define UNDERSCORE_OPENERS ctx->mark_chains[8] +#define TILDE_OPENERS_1 ctx->mark_chains[9] +#define TILDE_OPENERS_2 ctx->mark_chains[10] +#define BRACKET_OPENERS ctx->mark_chains[11] +#define DOLLAR_OPENERS ctx->mark_chains[12] +#define OPENERS_CHAIN_FIRST 2 +#define OPENERS_CHAIN_LAST 12 + + int n_table_cell_boundaries; + + /* For resolving links. */ + int unresolved_link_head; + int unresolved_link_tail; + + /* For resolving raw HTML. */ + OFF html_comment_horizon; + OFF html_proc_instr_horizon; + OFF html_decl_horizon; + OFF html_cdata_horizon; + + /* For block analysis. + * Notes: + * -- It holds MD_BLOCK as well as MD_LINE structures. After each + * MD_BLOCK, its (multiple) MD_LINE(s) follow. + * -- For MD_BLOCK_HTML and MD_BLOCK_CODE, MD_VERBATIMLINE(s) are used + * instead of MD_LINE(s). + */ + void* block_bytes; + MD_BLOCK* current_block; + int n_block_bytes; + int alloc_block_bytes; + + /* For container block analysis. */ + MD_CONTAINER* containers; + int n_containers; + int alloc_containers; + + /* Minimal indentation to call the block "indented code block". */ + unsigned code_indent_offset; + + /* Contextual info for line analysis. */ + SZ code_fence_length; /* For checking closing fence length. */ + int html_block_type; /* For checking closing raw HTML condition. 
*/ + int last_line_has_list_loosening_effect; + int last_list_item_starts_with_two_blank_lines; +}; + +enum MD_LINETYPE_tag { + MD_LINE_BLANK, + MD_LINE_HR, + MD_LINE_ATXHEADER, + MD_LINE_SETEXTHEADER, + MD_LINE_SETEXTUNDERLINE, + MD_LINE_INDENTEDCODE, + MD_LINE_FENCEDCODE, + MD_LINE_HTML, + MD_LINE_TEXT, + MD_LINE_TABLE, + MD_LINE_TABLEUNDERLINE +}; +typedef enum MD_LINETYPE_tag MD_LINETYPE; + +typedef struct MD_LINE_ANALYSIS_tag MD_LINE_ANALYSIS; +struct MD_LINE_ANALYSIS_tag { + MD_LINETYPE type : 16; + unsigned data : 16; + OFF beg; + OFF end; + unsigned indent; /* Indentation level. */ +}; + +typedef struct MD_LINE_tag MD_LINE; +struct MD_LINE_tag { + OFF beg; + OFF end; +}; + +typedef struct MD_VERBATIMLINE_tag MD_VERBATIMLINE; +struct MD_VERBATIMLINE_tag { + OFF beg; + OFF end; + OFF indent; +}; + + +/******************* + *** Debugging *** + *******************/ + +#define MD_LOG(msg) \ + do { \ + if(ctx->parser.debug_log != NULL) \ + ctx->parser.debug_log((msg), ctx->userdata); \ + } while(0) + +#ifdef DEBUG + #define MD_ASSERT(cond) \ + do { \ + if(!(cond)) { \ + MD_LOG(__FILE__ ":" STRINGIZE(__LINE__) ": " \ + "Assertion '" STRINGIZE(cond) "' failed."); \ + exit(1); \ + } \ + } while(0) + + #define MD_UNREACHABLE() MD_ASSERT(1 == 0) +#else + #ifdef __GNUC__ + #define MD_ASSERT(cond) do { if(!(cond)) __builtin_unreachable(); } while(0) + #define MD_UNREACHABLE() do { __builtin_unreachable(); } while(0) + #elif defined _MSC_VER && _MSC_VER > 120 + #define MD_ASSERT(cond) do { __assume(cond); } while(0) + #define MD_UNREACHABLE() do { __assume(0); } while(0) + #else + #define MD_ASSERT(cond) do {} while(0) + #define MD_UNREACHABLE() do {} while(0) + #endif +#endif + + +/***************** + *** Helpers *** + *****************/ + +/* Character accessors. */ +#define CH(off) (ctx->text[(off)]) +#define STR(off) (ctx->text + (off)) + +/* Check whether the pointer points into ctx->text. 
*/ +#define IS_INPUT_STR(ptr) (ctx->text <= (ptr) && (ptr) < (ctx->text + ctx->size)) + +/* Character classification. + * Note we assume ASCII compatibility of code points < 128 here. */ +#define ISIN_(ch, ch_min, ch_max) ((ch_min) <= (unsigned)(ch) && (unsigned)(ch) <= (ch_max)) +#define ISANYOF_(ch, palette) (md_strchr((palette), (ch)) != NULL) +#define ISANYOF2_(ch, ch1, ch2) ((ch) == (ch1) || (ch) == (ch2)) +#define ISANYOF3_(ch, ch1, ch2, ch3) ((ch) == (ch1) || (ch) == (ch2) || (ch) == (ch3)) +#define ISASCII_(ch) ((unsigned)(ch) <= 127) +#define ISBLANK_(ch) (ISANYOF2_((ch), _T(' '), _T('\t'))) +#define ISNEWLINE_(ch) (ISANYOF2_((ch), _T('\r'), _T('\n'))) +#define ISWHITESPACE_(ch) (ISBLANK_(ch) || ISANYOF2_((ch), _T('\v'), _T('\f'))) +#define ISCNTRL_(ch) ((unsigned)(ch) <= 31 || (unsigned)(ch) == 127) +#define ISPUNCT_(ch) (ISIN_(ch, 33, 47) || ISIN_(ch, 58, 64) || ISIN_(ch, 91, 96) || ISIN_(ch, 123, 126)) +#define ISUPPER_(ch) (ISIN_(ch, _T('A'), _T('Z'))) +#define ISLOWER_(ch) (ISIN_(ch, _T('a'), _T('z'))) +#define ISALPHA_(ch) (ISUPPER_(ch) || ISLOWER_(ch)) +#define ISDIGIT_(ch) (ISIN_(ch, _T('0'), _T('9'))) +#define ISXDIGIT_(ch) (ISDIGIT_(ch) || ISIN_(ch, _T('A'), _T('F')) || ISIN_(ch, _T('a'), _T('f'))) +#define ISALNUM_(ch) (ISALPHA_(ch) || ISDIGIT_(ch)) + +#define ISANYOF(off, palette) ISANYOF_(CH(off), (palette)) +#define ISANYOF2(off, ch1, ch2) ISANYOF2_(CH(off), (ch1), (ch2)) +#define ISANYOF3(off, ch1, ch2, ch3) ISANYOF3_(CH(off), (ch1), (ch2), (ch3)) +#define ISASCII(off) ISASCII_(CH(off)) +#define ISBLANK(off) ISBLANK_(CH(off)) +#define ISNEWLINE(off) ISNEWLINE_(CH(off)) +#define ISWHITESPACE(off) ISWHITESPACE_(CH(off)) +#define ISCNTRL(off) ISCNTRL_(CH(off)) +#define ISPUNCT(off) ISPUNCT_(CH(off)) +#define ISUPPER(off) ISUPPER_(CH(off)) +#define ISLOWER(off) ISLOWER_(CH(off)) +#define ISALPHA(off) ISALPHA_(CH(off)) +#define ISDIGIT(off) ISDIGIT_(CH(off)) +#define ISXDIGIT(off) ISXDIGIT_(CH(off)) +#define ISALNUM(off) ISALNUM_(CH(off)) + + 
+#if defined MD4C_USE_UTF16
+    #define md_strchr wcschr
+#else
+    #define md_strchr strchr
+#endif
+
+
+/* Case insensitive check of string equality.
+ *
+ * Compares the first n characters of s1 and s2; ASCII lower-case letters are
+ * folded to upper-case before the comparison, all other characters have to
+ * match exactly. Returns TRUE when the strings are equal, FALSE otherwise.
+ */
+static inline int
+md_ascii_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
+{
+    OFF i;
+    for(i = 0; i < n; i++) {
+        CHAR ch1 = s1[i];
+        CHAR ch2 = s2[i];
+
+        if(ISLOWER_(ch1))
+            ch1 += ('A'-'a');
+        if(ISLOWER_(ch2))
+            ch2 += ('A'-'a');
+        if(ch1 != ch2)
+            return FALSE;
+    }
+    return TRUE;
+}
+
+/* Case sensitive check of string equality of the first n characters.
+ * Returns non-zero when the strings are equal.
+ */
+static inline int
+md_ascii_eq(const CHAR* s1, const CHAR* s2, SZ n)
+{
+    return memcmp(s1, s2, n * sizeof(CHAR)) == 0;
+}
+
+/* Pass the given string to the text() callback, but replace every NUL
+ * character with a separate text chunk of type MD_TEXT_NULLCHAR.
+ *
+ * Runs of ordinary characters are reported with the given type. Returns 0 on
+ * success, or the first non-zero value returned by the callback.
+ */
+static int
+md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ size)
+{
+    OFF off;
+    int ret = 0;
+
+    while(1) {
+        /* Find the end of the run of ordinary (non-NUL) characters. */
+        off = 0;
+        while(off < size  &&  str[off] != _T('\0'))
+            off++;
+
+        if(off > 0) {
+            ret = ctx->parser.text(type, str, off, ctx->userdata);
+            if(ret != 0)
+                return ret;
+
+            str += off;
+            size -= off;
+        }
+
+        if(size == 0)
+            return 0;
+
+        /* Now str[0] is the NUL character. */
+        ret = ctx->parser.text(MD_TEXT_NULLCHAR, _T(""), 1, ctx->userdata);
+        if(ret != 0)
+            return ret;
+
+        /* Consume the NUL itself. Without this, the next chunk of ordinary
+         * text would start at the NUL and the raw '\0' would be reported to
+         * the callback in addition to its replacement. */
+        str++;
+        size--;
+    }
+}
+
+
+/* Evaluate func, store the result in the local variable 'ret' and jump to the
+ * local label 'abort' if it is negative. */
+#define MD_CHECK(func)                                                      \
+    do {                                                                    \
+        ret = (func);                                                       \
+        if(ret < 0)                                                         \
+            goto abort;                                                     \
+    } while(0)
+
+
+/* Make sure ctx->buffer can hold at least sz; grow it if needed. On failure
+ * sets 'ret' to -1 and jumps to the local label 'abort'.
+ *
+ * NOTE: sz is handed to realloc() unscaled, i.e. it is a size in bytes, not
+ * a count of CHARs. The argument is evaluated more than once. */
+#define MD_TEMP_BUFFER(sz)                                                  \
+    do {                                                                    \
+        if((sz) > ctx->alloc_buffer) {                                      \
+            CHAR* new_buffer;                                               \
+            SZ new_size = ((sz) + (sz) / 2 + 128) & ~127;                   \
+                                                                            \
+            new_buffer = (CHAR*) realloc(ctx->buffer, new_size);            \
+            if(new_buffer == NULL) {                                        \
+                MD_LOG("realloc() failed.");                                \
+                ret = -1;                                                   \
+                goto abort;                                                 \
+            }                                                               \
+                                                                            \
+            ctx->buffer = new_buffer;                                       \
+            ctx->alloc_buffer = new_size;                                   \
+        }                                                                   \
+    } while(0)
+
+
+/* Call the enter_block() callback; any non-zero return value aborts. */
+#define MD_ENTER_BLOCK(type, arg)                                           \
+    do {                                                                    \
+        ret = ctx->parser.enter_block((type), (arg), ctx->userdata);        \
+        if(ret != 0) {                                                      \
+            MD_LOG("Aborted from enter_block() callback.");                 \
+            goto abort;                                                     \
+        }                                                                   \
+    } while(0)
+
+/* Call the leave_block() callback; any non-zero return value aborts. */
+#define MD_LEAVE_BLOCK(type, arg)                                           \
+    do {                                                                    \
+        ret = ctx->parser.leave_block((type), (arg), ctx->userdata);        \
+        if(ret != 0) {                                                      \
+            MD_LOG("Aborted from leave_block() callback.");                 \
+            goto abort;                                                     \
+        }                                                                   \
+    } while(0)
+
+#define MD_ENTER_SPAN(type, arg) \ + do { \ + ret = ctx->parser.enter_span((type), (arg), ctx->userdata); \ + if(ret != 0) { \ + MD_LOG("Aborted from enter_span() callback."); \ + goto abort; \ + } \ + } while(0) + +#define MD_LEAVE_SPAN(type, arg) \ + do { \ + ret = ctx->parser.leave_span((type), (arg), ctx->userdata); \ + if(ret != 0) { \ + MD_LOG("Aborted from leave_span() callback."); \ + goto abort; \ + } \ + } while(0) + +#define MD_TEXT(type, str, size) \ + do { \ + if(size > 0) { \ + ret = ctx->parser.text((type), (str), (size), ctx->userdata); \ + if(ret != 0) { \ + MD_LOG("Aborted from text() callback."); \ + goto abort; \ + } \ + } \ + } while(0) + +#define MD_TEXT_INSECURE(type, str, size) \ + do { \ + if(size > 0) { \ + ret = md_text_with_null_replacement(ctx, type, str, size); \ + if(ret != 0) { \ + MD_LOG("Aborted from text() callback."); \ + goto abort; \ + } \ + } \ + } while(0) + + + +/************************* + *** Unicode Support *** + *************************/ + +typedef struct MD_UNICODE_FOLD_INFO_tag MD_UNICODE_FOLD_INFO; +struct MD_UNICODE_FOLD_INFO_tag { + unsigned codepoints[3]; + int n_codepoints; +}; + + +#if defined MD4C_USE_UTF16 || defined MD4C_USE_UTF8 + /* Binary search over sorted "map" of codepoints. Consecutive sequences + * of codepoints may be encoded in the map by just using the + * (MIN_CODEPOINT | 0x40000000) and (MAX_CODEPOINT | 0x80000000). + * + * Returns index of the found record in the map (in the case of ranges, + * the minimal value is used); or -1 on failure. */ + static int + md_unicode_bsearch__(unsigned codepoint, const unsigned* map, size_t map_size) + { + int beg, end; + int pivot_beg, pivot_end; + + beg = 0; + end = (int) map_size-1; + while(beg <= end) { + /* Pivot may be a range, not just a single value. 
*/ + pivot_beg = pivot_end = (beg + end) / 2; + if(map[pivot_end] & 0x40000000) + pivot_end++; + if(map[pivot_beg] & 0x80000000) + pivot_beg--; + + if(codepoint < (map[pivot_beg] & 0x00ffffff)) + end = pivot_beg - 1; + else if(codepoint > (map[pivot_end] & 0x00ffffff)) + beg = pivot_end + 1; + else + return pivot_beg; + } + + return -1; + } + + static int + md_is_unicode_whitespace__(unsigned codepoint) + { +#define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000) +#define S(cp) (cp) + /* Unicode "Zs" category. + * (generated by scripts/build_whitespace_map.py) */ + static const unsigned WHITESPACE_MAP[] = { + S(0x0020), S(0x00a0), S(0x1680), R(0x2000,0x200a), S(0x202f), S(0x205f), S(0x3000) + }; +#undef R +#undef S + + /* The ASCII ones are the most frequently used ones, also CommonMark + * specification requests few more in this range. */ + if(codepoint <= 0x7f) + return ISWHITESPACE_(codepoint); + + return (md_unicode_bsearch__(codepoint, WHITESPACE_MAP, SIZEOF_ARRAY(WHITESPACE_MAP)) >= 0); + } + + static int + md_is_unicode_punct__(unsigned codepoint) + { +#define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000) +#define S(cp) (cp) + /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories. 
+ * (generated by scripts/build_punct_map.py) */ + static const unsigned PUNCT_MAP[] = { + R(0x0021,0x0023), R(0x0025,0x002a), R(0x002c,0x002f), R(0x003a,0x003b), R(0x003f,0x0040), + R(0x005b,0x005d), S(0x005f), S(0x007b), S(0x007d), S(0x00a1), S(0x00a7), S(0x00ab), R(0x00b6,0x00b7), + S(0x00bb), S(0x00bf), S(0x037e), S(0x0387), R(0x055a,0x055f), R(0x0589,0x058a), S(0x05be), S(0x05c0), + S(0x05c3), S(0x05c6), R(0x05f3,0x05f4), R(0x0609,0x060a), R(0x060c,0x060d), S(0x061b), R(0x061e,0x061f), + R(0x066a,0x066d), S(0x06d4), R(0x0700,0x070d), R(0x07f7,0x07f9), R(0x0830,0x083e), S(0x085e), + R(0x0964,0x0965), S(0x0970), S(0x09fd), S(0x0a76), S(0x0af0), S(0x0c77), S(0x0c84), S(0x0df4), S(0x0e4f), + R(0x0e5a,0x0e5b), R(0x0f04,0x0f12), S(0x0f14), R(0x0f3a,0x0f3d), S(0x0f85), R(0x0fd0,0x0fd4), + R(0x0fd9,0x0fda), R(0x104a,0x104f), S(0x10fb), R(0x1360,0x1368), S(0x1400), S(0x166e), R(0x169b,0x169c), + R(0x16eb,0x16ed), R(0x1735,0x1736), R(0x17d4,0x17d6), R(0x17d8,0x17da), R(0x1800,0x180a), + R(0x1944,0x1945), R(0x1a1e,0x1a1f), R(0x1aa0,0x1aa6), R(0x1aa8,0x1aad), R(0x1b5a,0x1b60), + R(0x1bfc,0x1bff), R(0x1c3b,0x1c3f), R(0x1c7e,0x1c7f), R(0x1cc0,0x1cc7), S(0x1cd3), R(0x2010,0x2027), + R(0x2030,0x2043), R(0x2045,0x2051), R(0x2053,0x205e), R(0x207d,0x207e), R(0x208d,0x208e), + R(0x2308,0x230b), R(0x2329,0x232a), R(0x2768,0x2775), R(0x27c5,0x27c6), R(0x27e6,0x27ef), + R(0x2983,0x2998), R(0x29d8,0x29db), R(0x29fc,0x29fd), R(0x2cf9,0x2cfc), R(0x2cfe,0x2cff), S(0x2d70), + R(0x2e00,0x2e2e), R(0x2e30,0x2e4f), R(0x3001,0x3003), R(0x3008,0x3011), R(0x3014,0x301f), S(0x3030), + S(0x303d), S(0x30a0), S(0x30fb), R(0xa4fe,0xa4ff), R(0xa60d,0xa60f), S(0xa673), S(0xa67e), + R(0xa6f2,0xa6f7), R(0xa874,0xa877), R(0xa8ce,0xa8cf), R(0xa8f8,0xa8fa), S(0xa8fc), R(0xa92e,0xa92f), + S(0xa95f), R(0xa9c1,0xa9cd), R(0xa9de,0xa9df), R(0xaa5c,0xaa5f), R(0xaade,0xaadf), R(0xaaf0,0xaaf1), + S(0xabeb), R(0xfd3e,0xfd3f), R(0xfe10,0xfe19), R(0xfe30,0xfe52), R(0xfe54,0xfe61), S(0xfe63), S(0xfe68), + 
R(0xfe6a,0xfe6b), R(0xff01,0xff03), R(0xff05,0xff0a), R(0xff0c,0xff0f), R(0xff1a,0xff1b), + R(0xff1f,0xff20), R(0xff3b,0xff3d), S(0xff3f), S(0xff5b), S(0xff5d), R(0xff5f,0xff65), R(0x10100,0x10102), + S(0x1039f), S(0x103d0), S(0x1056f), S(0x10857), S(0x1091f), S(0x1093f), R(0x10a50,0x10a58), S(0x10a7f), + R(0x10af0,0x10af6), R(0x10b39,0x10b3f), R(0x10b99,0x10b9c), R(0x10f55,0x10f59), R(0x11047,0x1104d), + R(0x110bb,0x110bc), R(0x110be,0x110c1), R(0x11140,0x11143), R(0x11174,0x11175), R(0x111c5,0x111c8), + S(0x111cd), S(0x111db), R(0x111dd,0x111df), R(0x11238,0x1123d), S(0x112a9), R(0x1144b,0x1144f), + S(0x1145b), S(0x1145d), S(0x114c6), R(0x115c1,0x115d7), R(0x11641,0x11643), R(0x11660,0x1166c), + R(0x1173c,0x1173e), S(0x1183b), S(0x119e2), R(0x11a3f,0x11a46), R(0x11a9a,0x11a9c), R(0x11a9e,0x11aa2), + R(0x11c41,0x11c45), R(0x11c70,0x11c71), R(0x11ef7,0x11ef8), S(0x11fff), R(0x12470,0x12474), + R(0x16a6e,0x16a6f), S(0x16af5), R(0x16b37,0x16b3b), S(0x16b44), R(0x16e97,0x16e9a), S(0x16fe2), + S(0x1bc9f), R(0x1da87,0x1da8b), R(0x1e95e,0x1e95f) + }; +#undef R +#undef S + + /* The ASCII ones are the most frequently used ones, also CommonMark + * specification requests few more in this range. */ + if(codepoint <= 0x7f) + return ISPUNCT_(codepoint); + + return (md_unicode_bsearch__(codepoint, PUNCT_MAP, SIZEOF_ARRAY(PUNCT_MAP)) >= 0); + } + + static void + md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info) + { +#define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000) +#define S(cp) (cp) + /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories. 
+ * (generated by scripts/build_punct_map.py) */ + static const unsigned FOLD_MAP_1[] = { + R(0x0041,0x005a), S(0x00b5), R(0x00c0,0x00d6), R(0x00d8,0x00de), R(0x0100,0x012e), R(0x0132,0x0136), + R(0x0139,0x0147), R(0x014a,0x0176), S(0x0178), R(0x0179,0x017d), S(0x017f), S(0x0181), S(0x0182), + S(0x0186), S(0x0187), S(0x0189), S(0x018b), S(0x018e), S(0x018f), S(0x0190), S(0x0191), S(0x0193), + S(0x0194), S(0x0196), S(0x0197), S(0x0198), S(0x019c), S(0x019d), S(0x019f), R(0x01a0,0x01a4), S(0x01a6), + S(0x01a7), S(0x01a9), S(0x01ac), S(0x01ae), S(0x01af), S(0x01b1), S(0x01b3), S(0x01b7), S(0x01b8), + S(0x01bc), S(0x01c4), S(0x01c5), S(0x01c7), S(0x01c8), S(0x01ca), R(0x01cb,0x01db), R(0x01de,0x01ee), + S(0x01f1), S(0x01f2), S(0x01f6), S(0x01f7), R(0x01f8,0x021e), S(0x0220), R(0x0222,0x0232), S(0x023a), + S(0x023b), S(0x023d), S(0x023e), S(0x0241), S(0x0243), S(0x0244), S(0x0245), R(0x0246,0x024e), S(0x0345), + S(0x0370), S(0x0376), S(0x037f), S(0x0386), R(0x0388,0x038a), S(0x038c), S(0x038e), R(0x0391,0x03a1), + R(0x03a3,0x03ab), S(0x03c2), S(0x03cf), S(0x03d0), S(0x03d1), S(0x03d5), S(0x03d6), R(0x03d8,0x03ee), + S(0x03f0), S(0x03f1), S(0x03f4), S(0x03f5), S(0x03f7), S(0x03f9), S(0x03fa), R(0x03fd,0x03ff), + R(0x0400,0x040f), R(0x0410,0x042f), R(0x0460,0x0480), R(0x048a,0x04be), S(0x04c0), R(0x04c1,0x04cd), + R(0x04d0,0x052e), R(0x0531,0x0556), R(0x10a0,0x10c5), S(0x10c7), S(0x10cd), R(0x13f8,0x13fd), S(0x1c80), + S(0x1c81), S(0x1c82), S(0x1c83), S(0x1c85), S(0x1c86), S(0x1c87), S(0x1c88), R(0x1c90,0x1cba), + R(0x1cbd,0x1cbf), R(0x1e00,0x1e94), S(0x1e9b), R(0x1ea0,0x1efe), R(0x1f08,0x1f0f), R(0x1f18,0x1f1d), + R(0x1f28,0x1f2f), R(0x1f38,0x1f3f), R(0x1f48,0x1f4d), S(0x1f59), S(0x1f5b), S(0x1f5d), S(0x1f5f), + R(0x1f68,0x1f6f), S(0x1fb8), S(0x1fba), S(0x1fbe), R(0x1fc8,0x1fcb), S(0x1fd8), S(0x1fda), S(0x1fe8), + S(0x1fea), S(0x1fec), S(0x1ff8), S(0x1ffa), S(0x2126), S(0x212a), S(0x212b), S(0x2132), R(0x2160,0x216f), + S(0x2183), R(0x24b6,0x24cf), R(0x2c00,0x2c2e), 
S(0x2c60), S(0x2c62), S(0x2c63), S(0x2c64), + R(0x2c67,0x2c6b), S(0x2c6d), S(0x2c6e), S(0x2c6f), S(0x2c70), S(0x2c72), S(0x2c75), S(0x2c7e), + R(0x2c80,0x2ce2), S(0x2ceb), S(0x2cf2), R(0xa640,0xa66c), R(0xa680,0xa69a), R(0xa722,0xa72e), + R(0xa732,0xa76e), S(0xa779), S(0xa77d), R(0xa77e,0xa786), S(0xa78b), S(0xa78d), S(0xa790), + R(0xa796,0xa7a8), S(0xa7aa), S(0xa7ab), S(0xa7ac), S(0xa7ad), S(0xa7ae), S(0xa7b0), S(0xa7b1), S(0xa7b2), + S(0xa7b3), R(0xa7b4,0xa7be), S(0xa7c2), S(0xa7c4), S(0xa7c5), S(0xa7c6), R(0xab70,0xabbf), + R(0xff21,0xff3a), R(0x10400,0x10427), R(0x104b0,0x104d3), R(0x10c80,0x10cb2), R(0x118a0,0x118bf), + R(0x16e40,0x16e5f), R(0x1e900,0x1e921) + }; + static const unsigned FOLD_MAP_1_DATA[] = { + 0x0061, 0x007a, 0x03bc, 0x00e0, 0x00f6, 0x00f8, 0x00fe, 0x0101, 0x012f, 0x0133, 0x0137, 0x013a, 0x0148, + 0x014b, 0x0177, 0x00ff, 0x017a, 0x017e, 0x0073, 0x0253, 0x0183, 0x0254, 0x0188, 0x0256, 0x018c, 0x01dd, + 0x0259, 0x025b, 0x0192, 0x0260, 0x0263, 0x0269, 0x0268, 0x0199, 0x026f, 0x0272, 0x0275, 0x01a1, 0x01a5, + 0x0280, 0x01a8, 0x0283, 0x01ad, 0x0288, 0x01b0, 0x028a, 0x01b4, 0x0292, 0x01b9, 0x01bd, 0x01c6, 0x01c6, + 0x01c9, 0x01c9, 0x01cc, 0x01cc, 0x01dc, 0x01df, 0x01ef, 0x01f3, 0x01f3, 0x0195, 0x01bf, 0x01f9, 0x021f, + 0x019e, 0x0223, 0x0233, 0x2c65, 0x023c, 0x019a, 0x2c66, 0x0242, 0x0180, 0x0289, 0x028c, 0x0247, 0x024f, + 0x03b9, 0x0371, 0x0377, 0x03f3, 0x03ac, 0x03ad, 0x03af, 0x03cc, 0x03cd, 0x03b1, 0x03c1, 0x03c3, 0x03cb, + 0x03c3, 0x03d7, 0x03b2, 0x03b8, 0x03c6, 0x03c0, 0x03d9, 0x03ef, 0x03ba, 0x03c1, 0x03b8, 0x03b5, 0x03f8, + 0x03f2, 0x03fb, 0x037b, 0x037d, 0x0450, 0x045f, 0x0430, 0x044f, 0x0461, 0x0481, 0x048b, 0x04bf, 0x04cf, + 0x04c2, 0x04ce, 0x04d1, 0x052f, 0x0561, 0x0586, 0x2d00, 0x2d25, 0x2d27, 0x2d2d, 0x13f0, 0x13f5, 0x0432, + 0x0434, 0x043e, 0x0441, 0x0442, 0x044a, 0x0463, 0xa64b, 0x10d0, 0x10fa, 0x10fd, 0x10ff, 0x1e01, 0x1e95, + 0x1e61, 0x1ea1, 0x1eff, 0x1f00, 0x1f07, 0x1f10, 0x1f15, 0x1f20, 0x1f27, 0x1f30, 0x1f37, 0x1f40, 0x1f45, + 
0x1f51, 0x1f53, 0x1f55, 0x1f57, 0x1f60, 0x1f67, 0x1fb0, 0x1f70, 0x03b9, 0x1f72, 0x1f75, 0x1fd0, 0x1f76, + 0x1fe0, 0x1f7a, 0x1fe5, 0x1f78, 0x1f7c, 0x03c9, 0x006b, 0x00e5, 0x214e, 0x2170, 0x217f, 0x2184, 0x24d0, + 0x24e9, 0x2c30, 0x2c5e, 0x2c61, 0x026b, 0x1d7d, 0x027d, 0x2c68, 0x2c6c, 0x0251, 0x0271, 0x0250, 0x0252, + 0x2c73, 0x2c76, 0x023f, 0x2c81, 0x2ce3, 0x2cec, 0x2cf3, 0xa641, 0xa66d, 0xa681, 0xa69b, 0xa723, 0xa72f, + 0xa733, 0xa76f, 0xa77a, 0x1d79, 0xa77f, 0xa787, 0xa78c, 0x0265, 0xa791, 0xa797, 0xa7a9, 0x0266, 0x025c, + 0x0261, 0x026c, 0x026a, 0x029e, 0x0287, 0x029d, 0xab53, 0xa7b5, 0xa7bf, 0xa7c3, 0xa794, 0x0282, 0x1d8e, + 0x13a0, 0x13ef, 0xff41, 0xff5a, 0x10428, 0x1044f, 0x104d8, 0x104fb, 0x10cc0, 0x10cf2, 0x118c0, 0x118df, + 0x16e60, 0x16e7f, 0x1e922, 0x1e943 + }; + static const unsigned FOLD_MAP_2[] = { + S(0x00df), S(0x0130), S(0x0149), S(0x01f0), S(0x0587), S(0x1e96), S(0x1e97), S(0x1e98), S(0x1e99), + S(0x1e9a), S(0x1e9e), S(0x1f50), R(0x1f80,0x1f87), R(0x1f88,0x1f8f), R(0x1f90,0x1f97), R(0x1f98,0x1f9f), + R(0x1fa0,0x1fa7), R(0x1fa8,0x1faf), S(0x1fb2), S(0x1fb3), S(0x1fb4), S(0x1fb6), S(0x1fbc), S(0x1fc2), + S(0x1fc3), S(0x1fc4), S(0x1fc6), S(0x1fcc), S(0x1fd6), S(0x1fe4), S(0x1fe6), S(0x1ff2), S(0x1ff3), + S(0x1ff4), S(0x1ff6), S(0x1ffc), S(0xfb00), S(0xfb01), S(0xfb02), S(0xfb05), S(0xfb06), S(0xfb13), + S(0xfb14), S(0xfb15), S(0xfb16), S(0xfb17) + }; + static const unsigned FOLD_MAP_2_DATA[] = { + 0x0073,0x0073, 0x0069,0x0307, 0x02bc,0x006e, 0x006a,0x030c, 0x0565,0x0582, 0x0068,0x0331, 0x0074,0x0308, + 0x0077,0x030a, 0x0079,0x030a, 0x0061,0x02be, 0x0073,0x0073, 0x03c5,0x0313, 0x1f00,0x03b9, 0x1f07,0x03b9, + 0x1f00,0x03b9, 0x1f07,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f60,0x03b9, + 0x1f67,0x03b9, 0x1f60,0x03b9, 0x1f67,0x03b9, 0x1f70,0x03b9, 0x03b1,0x03b9, 0x03ac,0x03b9, 0x03b1,0x0342, + 0x03b1,0x03b9, 0x1f74,0x03b9, 0x03b7,0x03b9, 0x03ae,0x03b9, 0x03b7,0x0342, 0x03b7,0x03b9, 0x03b9,0x0342, + 0x03c1,0x0313, 0x03c5,0x0342, 
0x1f7c,0x03b9, 0x03c9,0x03b9, 0x03ce,0x03b9, 0x03c9,0x0342, 0x03c9,0x03b9, + 0x0066,0x0066, 0x0066,0x0069, 0x0066,0x006c, 0x0073,0x0074, 0x0073,0x0074, 0x0574,0x0576, 0x0574,0x0565, + 0x0574,0x056b, 0x057e,0x0576, 0x0574,0x056d + }; + static const unsigned FOLD_MAP_3[] = { + S(0x0390), S(0x03b0), S(0x1f52), S(0x1f54), S(0x1f56), S(0x1fb7), S(0x1fc7), S(0x1fd2), S(0x1fd3), + S(0x1fd7), S(0x1fe2), S(0x1fe3), S(0x1fe7), S(0x1ff7), S(0xfb03), S(0xfb04) + }; + static const unsigned FOLD_MAP_3_DATA[] = { + 0x03b9,0x0308,0x0301, 0x03c5,0x0308,0x0301, 0x03c5,0x0313,0x0300, 0x03c5,0x0313,0x0301, + 0x03c5,0x0313,0x0342, 0x03b1,0x0342,0x03b9, 0x03b7,0x0342,0x03b9, 0x03b9,0x0308,0x0300, + 0x03b9,0x0308,0x0301, 0x03b9,0x0308,0x0342, 0x03c5,0x0308,0x0300, 0x03c5,0x0308,0x0301, + 0x03c5,0x0308,0x0342, 0x03c9,0x0342,0x03b9, 0x0066,0x0066,0x0069, 0x0066,0x0066,0x006c + }; +#undef R +#undef S + static const struct { + const unsigned* map; + const unsigned* data; + size_t map_size; + int n_codepoints; + } FOLD_MAP_LIST[] = { + { FOLD_MAP_1, FOLD_MAP_1_DATA, SIZEOF_ARRAY(FOLD_MAP_1), 1 }, + { FOLD_MAP_2, FOLD_MAP_2_DATA, SIZEOF_ARRAY(FOLD_MAP_2), 2 }, + { FOLD_MAP_3, FOLD_MAP_3_DATA, SIZEOF_ARRAY(FOLD_MAP_3), 3 } + }; + + int i; + + /* Fast path for ASCII characters. */ + if(codepoint <= 0x7f) { + info->codepoints[0] = codepoint; + if(ISUPPER_(codepoint)) + info->codepoints[0] += 'a' - 'A'; + info->n_codepoints = 1; + return; + } + + /* Try to locate the codepoint in any of the maps. */ + for(i = 0; i < (int) SIZEOF_ARRAY(FOLD_MAP_LIST); i++) { + int index; + + index = md_unicode_bsearch__(codepoint, FOLD_MAP_LIST[i].map, FOLD_MAP_LIST[i].map_size); + if(index >= 0) { + /* Found the mapping. 
*/
+                int n_codepoints = FOLD_MAP_LIST[i].n_codepoints;
+                const unsigned* map = FOLD_MAP_LIST[i].map;
+                const unsigned* codepoints = FOLD_MAP_LIST[i].data + (index * n_codepoints);
+
+                memcpy(info->codepoints, codepoints, sizeof(unsigned) * n_codepoints);
+                info->n_codepoints = n_codepoints;
+
+                if(FOLD_MAP_LIST[i].map[index] != codepoint) {
+                    /* The found mapping maps whole range of codepoints,
+                     * i.e. we have to offset info->codepoints[0] accordingly. */
+                    if((map[index] & 0x00ffffff)+1 == codepoints[0]) {
+                        /* Alternating type of the range. */
+                        info->codepoints[0] = codepoint + ((codepoint & 0x1) == (map[index] & 0x1) ? 1 : 0);
+                    } else {
+                        /* Range to range kind of mapping. */
+                        info->codepoints[0] += (codepoint - (map[index] & 0x00ffffff));
+                    }
+                }
+
+                return;
+            }
+        }
+
+        /* No mapping found. Map the codepoint to itself. */
+        info->codepoints[0] = codepoint;
+        info->n_codepoints = 1;
+    }
+#endif
+
+
+#if defined MD4C_USE_UTF16
+    #define IS_UTF16_SURROGATE_HI(word)     (((WORD)(word) & 0xfc00) == 0xd800)
+    #define IS_UTF16_SURROGATE_LO(word)     (((WORD)(word) & 0xfc00) == 0xdc00)
+    #define UTF16_DECODE_SURROGATE(hi, lo)  (0x10000 + ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0)))
+
+    /* Decode the codepoint starting at str[0].
+     *
+     * A high surrogate followed by a low surrogate is combined into a single
+     * codepoint; anything else (including an unpaired surrogate) is returned
+     * as is. If p_size is not NULL, it receives the number of code units
+     * consumed (1 or 2). */
+    static unsigned
+    md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size)
+    {
+        if(IS_UTF16_SURROGATE_HI(str[0])) {
+            if(1 < str_size && IS_UTF16_SURROGATE_LO(str[1])) {
+                if(p_size != NULL)
+                    *p_size = 2;
+                return UTF16_DECODE_SURROGATE(str[0], str[1]);
+            }
+        }
+
+        if(p_size != NULL)
+            *p_size = 1;
+        return str[0];
+    }
+
+    /* Decode the codepoint which ends just before the offset 'off'.
+     *
+     * NOTE: Reads CH(off-1), so the caller has to pass off > 0 (presumably
+     * guaranteed by the call sites, as for the UTF-8 counterpart; verify). */
+    static unsigned
+    md_decode_utf16le_before__(MD_CTX* ctx, OFF off)
+    {
+        /* A surrogate pair occupies the two code units before 'off'; that
+         * needs off >= 2 (the pair may start at the very beginning). */
+        if(off >= 2 && IS_UTF16_SURROGATE_HI(CH(off-2)) && IS_UTF16_SURROGATE_LO(CH(off-1)))
+            return UTF16_DECODE_SURROGATE(CH(off-2), CH(off-1));
+
+        /* Otherwise it is the single code unit preceding 'off' (not the one
+         * at 'off'). */
+        return CH(off-1);
+    }
+
+    /* No whitespace uses surrogates, so no decoding needed here. 
*/ + #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint) + #define ISUNICODEWHITESPACE(off) md_is_unicode_whitespace__(CH(off)) + #define ISUNICODEWHITESPACEBEFORE(off) md_is_unicode_whitespace__(CH((off)-1)) + + #define ISUNICODEPUNCT(off) md_is_unicode_punct__(md_decode_utf16le__(STR(off), ctx->size - (off), NULL)) + #define ISUNICODEPUNCTBEFORE(off) md_is_unicode_punct__(md_decode_utf16le_before__(ctx, off)) + + static inline int + md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size) + { + return md_decode_utf16le__(str+off, str_size-off, p_char_size); + } +#elif defined MD4C_USE_UTF8 + #define IS_UTF8_LEAD1(byte) ((unsigned char)(byte) <= 0x7f) + #define IS_UTF8_LEAD2(byte) (((unsigned char)(byte) & 0xe0) == 0xc0) + #define IS_UTF8_LEAD3(byte) (((unsigned char)(byte) & 0xf0) == 0xe0) + #define IS_UTF8_LEAD4(byte) (((unsigned char)(byte) & 0xf8) == 0xf0) + #define IS_UTF8_TAIL(byte) (((unsigned char)(byte) & 0xc0) == 0x80) + + static unsigned + md_decode_utf8__(const CHAR* str, SZ str_size, SZ* p_size) + { + if(!IS_UTF8_LEAD1(str[0])) { + if(IS_UTF8_LEAD2(str[0])) { + if(1 < str_size && IS_UTF8_TAIL(str[1])) { + if(p_size != NULL) + *p_size = 2; + + return (((unsigned int)str[0] & 0x1f) << 6) | + (((unsigned int)str[1] & 0x3f) << 0); + } + } else if(IS_UTF8_LEAD3(str[0])) { + if(2 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2])) { + if(p_size != NULL) + *p_size = 3; + + return (((unsigned int)str[0] & 0x0f) << 12) | + (((unsigned int)str[1] & 0x3f) << 6) | + (((unsigned int)str[2] & 0x3f) << 0); + } + } else if(IS_UTF8_LEAD4(str[0])) { + if(3 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2]) && IS_UTF8_TAIL(str[3])) { + if(p_size != NULL) + *p_size = 4; + + return (((unsigned int)str[0] & 0x07) << 18) | + (((unsigned int)str[1] & 0x3f) << 12) | + (((unsigned int)str[2] & 0x3f) << 6) | + (((unsigned int)str[3] & 0x3f) << 0); + } + } + } + + if(p_size != NULL) + *p_size = 1; + return (unsigned) 
str[0]; + } + + static unsigned + md_decode_utf8_before__(MD_CTX* ctx, OFF off) + { + if(!IS_UTF8_LEAD1(CH(off-1))) { + if(off > 1 && IS_UTF8_LEAD2(CH(off-2)) && IS_UTF8_TAIL(CH(off-1))) + return (((unsigned int)CH(off-2) & 0x1f) << 6) | + (((unsigned int)CH(off-1) & 0x3f) << 0); + + if(off > 2 && IS_UTF8_LEAD3(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1))) + return (((unsigned int)CH(off-3) & 0x0f) << 12) | + (((unsigned int)CH(off-2) & 0x3f) << 6) | + (((unsigned int)CH(off-1) & 0x3f) << 0); + + if(off > 3 && IS_UTF8_LEAD4(CH(off-4)) && IS_UTF8_TAIL(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1))) + return (((unsigned int)CH(off-4) & 0x07) << 18) | + (((unsigned int)CH(off-3) & 0x3f) << 12) | + (((unsigned int)CH(off-2) & 0x3f) << 6) | + (((unsigned int)CH(off-1) & 0x3f) << 0); + } + + return (unsigned) CH(off-1); + } + + #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint) + #define ISUNICODEWHITESPACE(off) md_is_unicode_whitespace__(md_decode_utf8__(STR(off), ctx->size - (off), NULL)) + #define ISUNICODEWHITESPACEBEFORE(off) md_is_unicode_whitespace__(md_decode_utf8_before__(ctx, off)) + + #define ISUNICODEPUNCT(off) md_is_unicode_punct__(md_decode_utf8__(STR(off), ctx->size - (off), NULL)) + #define ISUNICODEPUNCTBEFORE(off) md_is_unicode_punct__(md_decode_utf8_before__(ctx, off)) + + static inline unsigned + md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size) + { + return md_decode_utf8__(str+off, str_size-off, p_char_size); + } +#else + #define ISUNICODEWHITESPACE_(codepoint) ISWHITESPACE_(codepoint) + #define ISUNICODEWHITESPACE(off) ISWHITESPACE(off) + #define ISUNICODEWHITESPACEBEFORE(off) ISWHITESPACE((off)-1) + + #define ISUNICODEPUNCT(off) ISPUNCT(off) + #define ISUNICODEPUNCTBEFORE(off) ISPUNCT((off)-1) + + static inline void + md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info) + { + info->codepoints[0] = codepoint; + if(ISUPPER_(codepoint)) + 
info->codepoints[0] += 'a' - 'A';
+        info->n_codepoints = 1;
+    }
+
+    /* ASCII-only build: every character is exactly one unit long. */
+    static inline unsigned
+    md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_size)
+    {
+        /* Tolerate p_size == NULL as the UTF-8 and UTF-16 decoders do. */
+        if(p_size != NULL)
+            *p_size = 1;
+        return (unsigned) str[off];
+    }
+#endif
+
+
+/*************************************
+ ***  Helper string manipulations  ***
+ *************************************/
+
+/* Fill buffer with copy of the string between 'beg' and 'end' but replace any
+ * line breaks with given replacement character.
+ *
+ * 'lines' has to start with the line containing 'beg'. The count of
+ * characters written is stored into *p_size.
+ *
+ * NOTE: Caller is responsible to make sure the buffer is large enough.
+ * (Given the output is always shorter than input, (end - beg) is good idea
+ * what the caller should allocate.)
+ */
+static void
+md_merge_lines(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines,
+               CHAR line_break_replacement_char, CHAR* buffer, SZ* p_size)
+{
+    CHAR* ptr = buffer;
+    int line_index = 0;
+    OFF off = beg;
+
+    /* The loop below is terminated by reaching 'end', not by the line count. */
+    (void) n_lines;
+
+    while(1) {
+        const MD_LINE* line = &lines[line_index];
+        OFF line_end = line->end;
+        if(end < line_end)
+            line_end = end;
+
+        /* Copy the (rest of the) current line. */
+        while(off < line_end) {
+            *ptr = CH(off);
+            ptr++;
+            off++;
+        }
+
+        if(off >= end) {
+            *p_size = (SZ)(ptr - buffer);
+            return;
+        }
+
+        /* Replace the line break and continue with the next line. */
+        *ptr = line_break_replacement_char;
+        ptr++;
+
+        line_index++;
+        off = lines[line_index].beg;
+    }
+}
+
+/* Wrapper of md_merge_lines() which allocates new buffer for the output string. 
+ */ +static int +md_merge_lines_alloc(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines, + CHAR line_break_replacement_char, CHAR** p_str, SZ* p_size) +{ + CHAR* buffer; + + buffer = (CHAR*) malloc(sizeof(CHAR) * (end - beg)); + if(buffer == NULL) { + MD_LOG("malloc() failed."); + return -1; + } + + md_merge_lines(ctx, beg, end, lines, n_lines, + line_break_replacement_char, buffer, p_size); + + *p_str = buffer; + return 0; +} + +static OFF +md_skip_unicode_whitespace(const CHAR* label, OFF off, SZ size) +{ + SZ char_size; + unsigned codepoint; + + while(off < size) { + codepoint = md_decode_unicode(label, off, size, &char_size); + if(!ISUNICODEWHITESPACE_(codepoint) && !ISNEWLINE_(label[off])) + break; + off += char_size; + } + + return off; +} + + +/****************************** + *** Recognizing raw HTML *** + ******************************/ + +/* md_is_html_tag() may be called when processing inlines (inline raw HTML) + * or when breaking document to blocks (checking for start of HTML block type 7). + * + * When breaking document to blocks, we do not yet know line boundaries, but + * in that case the whole tag has to live on a single line. We distinguish this + * by n_lines == 0. + */ +static int +md_is_html_tag(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end) +{ + int attr_state; + OFF off = beg; + OFF line_end = (n_lines > 0) ? lines[0].end : ctx->size; + int i = 0; + + MD_ASSERT(CH(beg) == _T('<')); + + if(off + 1 >= line_end) + return FALSE; + off++; + + /* For parsing attributes, we need a little state automaton below. + * State -1: no attributes are allowed. + * State 0: attribute could follow after some whitespace. + * State 1: after a whitespace (attribute name may follow). + * State 2: after attribute name ('=' MAY follow). + * State 3: after '=' (value specification MUST follow). + * State 41: in middle of unquoted attribute value. + * State 42: in middle of single-quoted attribute value. 
+ * State 43: in middle of double-quoted attribute value. + */ + attr_state = 0; + + if(CH(off) == _T('/')) { + /* Closer tag "". No attributes may be present. */ + attr_state = -1; + off++; + } + + /* Tag name */ + if(off >= line_end || !ISALPHA(off)) + return FALSE; + off++; + while(off < line_end && (ISALNUM(off) || CH(off) == _T('-'))) + off++; + + /* (Optional) attributes (if not closer), (optional) '/' (if not closer) + * and final '>'. */ + while(1) { + while(off < line_end && !ISNEWLINE(off)) { + if(attr_state > 40) { + if(attr_state == 41 && (ISBLANK(off) || ISANYOF(off, _T("\"'=<>`")))) { + attr_state = 0; + off--; /* Put the char back for re-inspection in the new state. */ + } else if(attr_state == 42 && CH(off) == _T('\'')) { + attr_state = 0; + } else if(attr_state == 43 && CH(off) == _T('"')) { + attr_state = 0; + } + off++; + } else if(ISWHITESPACE(off)) { + if(attr_state == 0) + attr_state = 1; + off++; + } else if(attr_state <= 2 && CH(off) == _T('>')) { + /* End. */ + goto done; + } else if(attr_state <= 2 && CH(off) == _T('/') && off+1 < line_end && CH(off+1) == _T('>')) { + /* End with digraph '/>' */ + off++; + goto done; + } else if((attr_state == 1 || attr_state == 2) && (ISALPHA(off) || CH(off) == _T('_') || CH(off) == _T(':'))) { + off++; + /* Attribute name */ + while(off < line_end && (ISALNUM(off) || ISANYOF(off, _T("_.:-")))) + off++; + attr_state = 2; + } else if(attr_state == 2 && CH(off) == _T('=')) { + /* Attribute assignment sign */ + off++; + attr_state = 3; + } else if(attr_state == 3) { + /* Expecting start of attribute value. */ + if(CH(off) == _T('"')) + attr_state = 43; + else if(CH(off) == _T('\'')) + attr_state = 42; + else if(!ISANYOF(off, _T("\"'=<>`")) && !ISNEWLINE(off)) + attr_state = 41; + else + return FALSE; + off++; + } else { + /* Anything unexpected. */ + return FALSE; + } + } + + /* We have to be on a single line. See definition of start condition + * of HTML block, type 7. 
*/ + if(n_lines == 0) + return FALSE; + + i++; + if(i >= n_lines) + return FALSE; + + off = lines[i].beg; + line_end = lines[i].end; + + if(attr_state == 0 || attr_state == 41) + attr_state = 1; + + if(off >= max_end) + return FALSE; + } + +done: + if(off >= max_end) + return FALSE; + + *p_end = off+1; + return TRUE; +} + +static int +md_scan_for_html_closer(MD_CTX* ctx, const MD_CHAR* str, MD_SIZE len, + const MD_LINE* lines, int n_lines, + OFF beg, OFF max_end, OFF* p_end, + OFF* p_scan_horizon) +{ + OFF off = beg; + int i = 0; + + if(off < *p_scan_horizon && *p_scan_horizon >= max_end - len) { + /* We have already scanned the range up to the max_end so we know + * there is nothing to see. */ + return FALSE; + } + + while(TRUE) { + while(off + len <= lines[i].end && off + len <= max_end) { + if(md_ascii_eq(STR(off), str, len)) { + /* Success. */ + *p_end = off + len; + return TRUE; + } + off++; + } + + i++; + if(off >= max_end || i >= n_lines) { + /* Failure. */ + *p_scan_horizon = off; + return FALSE; + } + + off = lines[i].beg; + } +} + +static int +md_is_html_comment(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end) +{ + OFF off = beg; + + MD_ASSERT(CH(beg) == _T('<')); + + if(off + 4 >= lines[0].end) + return FALSE; + if(CH(off+1) != _T('!') || CH(off+2) != _T('-') || CH(off+3) != _T('-')) + return FALSE; + off += 4; + + /* ">" and "->" must not follow the opening. */ + if(off < lines[0].end && CH(off) == _T('>')) + return FALSE; + if(off+1 < lines[0].end && CH(off) == _T('-') && CH(off+1) == _T('>')) + return FALSE; + + /* HTML comment must not contain "--", so we scan just for "--" instead + * of "-->" and verify manually that '>' follows. 
*/ + if(md_scan_for_html_closer(ctx, _T("--"), 2, + lines, n_lines, off, max_end, p_end, &ctx->html_comment_horizon)) + { + if(*p_end < max_end && CH(*p_end) == _T('>')) { + *p_end = *p_end + 1; + return TRUE; + } + } + + return FALSE; +} + +static int +md_is_html_processing_instruction(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end) +{ + OFF off = beg; + + if(off + 2 >= lines[0].end) + return FALSE; + if(CH(off+1) != _T('?')) + return FALSE; + off += 2; + + return md_scan_for_html_closer(ctx, _T("?>"), 2, + lines, n_lines, off, max_end, p_end, &ctx->html_proc_instr_horizon); +} + +static int +md_is_html_declaration(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end) +{ + OFF off = beg; + + if(off + 2 >= lines[0].end) + return FALSE; + if(CH(off+1) != _T('!')) + return FALSE; + off += 2; + + /* Declaration name. */ + if(off >= lines[0].end || !ISALPHA(off)) + return FALSE; + off++; + while(off < lines[0].end && ISALPHA(off)) + off++; + if(off < lines[0].end && !ISWHITESPACE(off)) + return FALSE; + + return md_scan_for_html_closer(ctx, _T(">"), 1, + lines, n_lines, off, max_end, p_end, &ctx->html_decl_horizon); +} + +static int +md_is_html_cdata(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end) +{ + static const CHAR open_str[] = _T("= lines[0].end) + return FALSE; + if(memcmp(STR(off), open_str, open_size) != 0) + return FALSE; + off += open_size; + + if(lines[n_lines-1].end < max_end) + max_end = lines[n_lines-1].end - 2; + + return md_scan_for_html_closer(ctx, _T("]]>"), 3, + lines, n_lines, off, max_end, p_end, &ctx->html_cdata_horizon); +} + +static int +md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end) +{ + MD_ASSERT(CH(beg) == _T('<')); + return (md_is_html_tag(ctx, lines, n_lines, beg, max_end, p_end) || + md_is_html_comment(ctx, lines, n_lines, beg, max_end, p_end) || + md_is_html_processing_instruction(ctx, lines, 
n_lines, beg, max_end, p_end) || + md_is_html_declaration(ctx, lines, n_lines, beg, max_end, p_end) || + md_is_html_cdata(ctx, lines, n_lines, beg, max_end, p_end)); +} + + +/**************************** + *** Recognizing Entity *** + ****************************/ + +static int +md_is_hex_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end) +{ + OFF off = beg; + + while(off < max_end && ISXDIGIT_(text[off]) && off - beg <= 8) + off++; + + if(1 <= off - beg && off - beg <= 6) { + *p_end = off; + return TRUE; + } else { + return FALSE; + } +} + +static int +md_is_dec_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end) +{ + OFF off = beg; + + while(off < max_end && ISDIGIT_(text[off]) && off - beg <= 8) + off++; + + if(1 <= off - beg && off - beg <= 7) { + *p_end = off; + return TRUE; + } else { + return FALSE; + } +} + +static int +md_is_named_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end) +{ + OFF off = beg; + + if(off < max_end && ISALPHA_(text[off])) + off++; + else + return FALSE; + + while(off < max_end && ISALNUM_(text[off]) && off - beg <= 48) + off++; + + if(2 <= off - beg && off - beg <= 48) { + *p_end = off; + return TRUE; + } else { + return FALSE; + } +} + +static int +md_is_entity_str(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end) +{ + int is_contents; + OFF off = beg; + + MD_ASSERT(text[off] == _T('&')); + off++; + + if(off+2 < max_end && text[off] == _T('#') && (text[off+1] == _T('x') || text[off+1] == _T('X'))) + is_contents = md_is_hex_entity_contents(ctx, text, off+2, max_end, &off); + else if(off+1 < max_end && text[off] == _T('#')) + is_contents = md_is_dec_entity_contents(ctx, text, off+1, max_end, &off); + else + is_contents = md_is_named_entity_contents(ctx, text, off, max_end, &off); + + if(is_contents && off < max_end && text[off] == _T(';')) { + *p_end = off+1; + return TRUE; + } else { + return FALSE; + } +} + +static inline int 
+md_is_entity(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end) +{ + return md_is_entity_str(ctx, ctx->text, beg, max_end, p_end); +} + + +/****************************** + *** Attribute Management *** + ******************************/ + +typedef struct MD_ATTRIBUTE_BUILD_tag MD_ATTRIBUTE_BUILD; +struct MD_ATTRIBUTE_BUILD_tag { + CHAR* text; + MD_TEXTTYPE* substr_types; + OFF* substr_offsets; + int substr_count; + int substr_alloc; + MD_TEXTTYPE trivial_types[1]; + OFF trivial_offsets[2]; +}; + + +#define MD_BUILD_ATTR_NO_ESCAPES 0x0001 + +static int +md_build_attr_append_substr(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build, + MD_TEXTTYPE type, OFF off) +{ + if(build->substr_count >= build->substr_alloc) { + MD_TEXTTYPE* new_substr_types; + OFF* new_substr_offsets; + + build->substr_alloc = (build->substr_alloc > 0 + ? build->substr_alloc + build->substr_alloc / 2 + : 8); + new_substr_types = (MD_TEXTTYPE*) realloc(build->substr_types, + build->substr_alloc * sizeof(MD_TEXTTYPE)); + if(new_substr_types == NULL) { + MD_LOG("realloc() failed."); + return -1; + } + /* Note +1 to reserve space for final offset (== raw_size). 
*/ + new_substr_offsets = (OFF*) realloc(build->substr_offsets, + (build->substr_alloc+1) * sizeof(OFF)); + if(new_substr_offsets == NULL) { + MD_LOG("realloc() failed."); + free(new_substr_types); + return -1; + } + + build->substr_types = new_substr_types; + build->substr_offsets = new_substr_offsets; + } + + build->substr_types[build->substr_count] = type; + build->substr_offsets[build->substr_count] = off; + build->substr_count++; + return 0; +} + +static void +md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build) +{ + if(build->substr_alloc > 0) { + free(build->text); + free(build->substr_types); + free(build->substr_offsets); + } +} + +static int +md_build_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size, + unsigned flags, MD_ATTRIBUTE* attr, MD_ATTRIBUTE_BUILD* build) +{ + OFF raw_off, off; + int is_trivial; + int ret = 0; + + memset(build, 0, sizeof(MD_ATTRIBUTE_BUILD)); + + /* If there is no backslash and no ampersand, build trivial attribute + * without any malloc(). */ + is_trivial = TRUE; + for(raw_off = 0; raw_off < raw_size; raw_off++) { + if(ISANYOF3_(raw_text[raw_off], _T('\\'), _T('&'), _T('\0'))) { + is_trivial = FALSE; + break; + } + } + + if(is_trivial) { + build->text = (CHAR*) (raw_size ? 
raw_text : NULL); + build->substr_types = build->trivial_types; + build->substr_offsets = build->trivial_offsets; + build->substr_count = 1; + build->substr_alloc = 0; + build->trivial_types[0] = MD_TEXT_NORMAL; + build->trivial_offsets[0] = 0; + build->trivial_offsets[1] = raw_size; + off = raw_size; + } else { + build->text = (CHAR*) malloc(raw_size * sizeof(CHAR)); + if(build->text == NULL) { + MD_LOG("malloc() failed."); + goto abort; + } + + raw_off = 0; + off = 0; + + while(raw_off < raw_size) { + if(raw_text[raw_off] == _T('\0')) { + MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NULLCHAR, off)); + memcpy(build->text + off, raw_text + raw_off, 1); + off++; + raw_off++; + continue; + } + + if(raw_text[raw_off] == _T('&')) { + OFF ent_end; + + if(md_is_entity_str(ctx, raw_text, raw_off, raw_size, &ent_end)) { + MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_ENTITY, off)); + memcpy(build->text + off, raw_text + raw_off, ent_end - raw_off); + off += ent_end - raw_off; + raw_off = ent_end; + continue; + } + } + + if(build->substr_count == 0 || build->substr_types[build->substr_count-1] != MD_TEXT_NORMAL) + MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NORMAL, off)); + + if(!(flags & MD_BUILD_ATTR_NO_ESCAPES) && + raw_text[raw_off] == _T('\\') && raw_off+1 < raw_size && + (ISPUNCT_(raw_text[raw_off+1]) || ISNEWLINE_(raw_text[raw_off+1]))) + raw_off++; + + build->text[off++] = raw_text[raw_off++]; + } + build->substr_offsets[build->substr_count] = off; + } + + attr->text = build->text; + attr->size = off; + attr->substr_offsets = build->substr_offsets; + attr->substr_types = build->substr_types; + return 0; + +abort: + md_free_attribute(ctx, build); + return -1; +} + + +/********************************************* + *** Dictionary of Reference Definitions *** + *********************************************/ + +#define MD_FNV1A_BASE 2166136261U +#define MD_FNV1A_PRIME 16777619U + +static inline unsigned +md_fnv1a(unsigned base, const 
void* data, size_t n) +{ + const unsigned char* buf = (const unsigned char*) data; + unsigned hash = base; + size_t i; + + for(i = 0; i < n; i++) { + hash ^= buf[i]; + hash *= MD_FNV1A_PRIME; + } + + return hash; +} + + +struct MD_REF_DEF_tag { + CHAR* label; + CHAR* title; + unsigned hash; + SZ label_size; + SZ title_size; + OFF dest_beg; + OFF dest_end; +}; + +/* Label equivalence is quite complicated with regards to whitespace and case + * folding. This complicates computing a hash of it as well as direct comparison + * of two labels. */ + +static unsigned +md_link_label_hash(const CHAR* label, SZ size) +{ + unsigned hash = MD_FNV1A_BASE; + OFF off; + unsigned codepoint; + int is_whitespace = FALSE; + + off = md_skip_unicode_whitespace(label, 0, size); + while(off < size) { + SZ char_size; + + codepoint = md_decode_unicode(label, off, size, &char_size); + is_whitespace = ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE_(label[off]); + + if(is_whitespace) { + codepoint = ' '; + hash = md_fnv1a(hash, &codepoint, sizeof(unsigned)); + off = md_skip_unicode_whitespace(label, off, size); + } else { + MD_UNICODE_FOLD_INFO fold_info; + + md_get_unicode_fold_info(codepoint, &fold_info); + hash = md_fnv1a(hash, fold_info.codepoints, fold_info.n_codepoints * sizeof(unsigned)); + off += char_size; + } + } + + return hash; +} + +static OFF +md_link_label_cmp_load_fold_info(const CHAR* label, OFF off, SZ size, + MD_UNICODE_FOLD_INFO* fold_info) +{ + unsigned codepoint; + SZ char_size; + + if(off >= size) { + /* Treat end of a link label as a whitespace. */ + goto whitespace; + } + + if(ISNEWLINE_(label[off])) { + /* Treat new lines as a whitespace. */ + off++; + goto whitespace; + } + + codepoint = md_decode_unicode(label, off, size, &char_size); + off += char_size; + if(ISUNICODEWHITESPACE_(codepoint)) { + /* Treat all whitespace as equivalent */ + goto whitespace; + } + + /* Get real folding info. 
*/ + md_get_unicode_fold_info(codepoint, fold_info); + return off; + +whitespace: + fold_info->codepoints[0] = _T(' '); + fold_info->n_codepoints = 1; + return md_skip_unicode_whitespace(label, off, size); +} + +static int +md_link_label_cmp(const CHAR* a_label, SZ a_size, const CHAR* b_label, SZ b_size) +{ + OFF a_off; + OFF b_off; + int a_reached_end = FALSE; + int b_reached_end = FALSE; + MD_UNICODE_FOLD_INFO a_fi = { 0 }; + MD_UNICODE_FOLD_INFO b_fi = { 0 }; + OFF a_fi_off = 0; + OFF b_fi_off = 0; + int cmp; + + a_off = md_skip_unicode_whitespace(a_label, 0, a_size); + b_off = md_skip_unicode_whitespace(b_label, 0, b_size); + while(!a_reached_end || !b_reached_end) { + /* If needed, load fold info for next char. */ + if(a_fi_off >= a_fi.n_codepoints) { + a_fi_off = 0; + a_off = md_link_label_cmp_load_fold_info(a_label, a_off, a_size, &a_fi); + a_reached_end = (a_off >= a_size); + } + if(b_fi_off >= b_fi.n_codepoints) { + b_fi_off = 0; + b_off = md_link_label_cmp_load_fold_info(b_label, b_off, b_size, &b_fi); + b_reached_end = (b_off >= b_size); + } + + cmp = b_fi.codepoints[b_fi_off] - a_fi.codepoints[a_fi_off]; + if(cmp != 0) + return cmp; + + a_fi_off++; + b_fi_off++; + } + + return 0; +} + +typedef struct MD_REF_DEF_LIST_tag MD_REF_DEF_LIST; +struct MD_REF_DEF_LIST_tag { + int n_ref_defs; + int alloc_ref_defs; + MD_REF_DEF* ref_defs[]; /* Valid items always point into ctx->ref_defs[] */ +}; + +static int +md_ref_def_cmp(const void* a, const void* b) +{ + const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a; + const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b; + + if(a_ref->hash < b_ref->hash) + return -1; + else if(a_ref->hash > b_ref->hash) + return +1; + else + return md_link_label_cmp(a_ref->label, a_ref->label_size, b_ref->label, b_ref->label_size); +} + +static int +md_ref_def_cmp_for_sort(const void* a, const void* b) +{ + int cmp; + + cmp = md_ref_def_cmp(a, b); + + /* Ensure stability of the sorting. 
*/ + if(cmp == 0) { + const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a; + const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b; + + if(a_ref < b_ref) + cmp = -1; + else if(a_ref > b_ref) + cmp = +1; + else + cmp = 0; + } + + return cmp; +} + +static int +md_build_ref_def_hashtable(MD_CTX* ctx) +{ + int i, j; + + if(ctx->n_ref_defs == 0) + return 0; + + ctx->ref_def_hashtable_size = (ctx->n_ref_defs * 5) / 4; + ctx->ref_def_hashtable = malloc(ctx->ref_def_hashtable_size * sizeof(void*)); + if(ctx->ref_def_hashtable == NULL) { + MD_LOG("malloc() failed."); + goto abort; + } + memset(ctx->ref_def_hashtable, 0, ctx->ref_def_hashtable_size * sizeof(void*)); + + /* Each member of ctx->ref_def_hashtable[] can be: + * -- NULL, + * -- pointer to the MD_REF_DEF in ctx->ref_defs[], or + * -- pointer to a MD_REF_DEF_LIST, which holds multiple pointers to + * such MD_REF_DEFs. + */ + for(i = 0; i < ctx->n_ref_defs; i++) { + MD_REF_DEF* def = &ctx->ref_defs[i]; + void* bucket; + MD_REF_DEF_LIST* list; + + def->hash = md_link_label_hash(def->label, def->label_size); + bucket = ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size]; + + if(bucket == NULL) { + /* The bucket is empty. Make it just point to the def. */ + ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = def; + continue; + } + + if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) { + /* The bucket already contains one ref. def. Lets see whether it + * is the same label (ref. def. duplicate) or different one + * (hash conflict). */ + MD_REF_DEF* old_def = (MD_REF_DEF*) bucket; + + if(md_link_label_cmp(def->label, def->label_size, old_def->label, old_def->label_size) == 0) { + /* Duplicate label: Ignore this ref. def. */ + continue; + } + + /* Make the bucket complex, i.e. able to hold more ref. defs. 
*/ + list = (MD_REF_DEF_LIST*) malloc(sizeof(MD_REF_DEF_LIST) + 2 * sizeof(MD_REF_DEF*)); + if(list == NULL) { + MD_LOG("malloc() failed."); + goto abort; + } + list->ref_defs[0] = old_def; + list->ref_defs[1] = def; + list->n_ref_defs = 2; + list->alloc_ref_defs = 2; + ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list; + continue; + } + + /* Append the def to the complex bucket list. + * + * Note in this case we ignore potential duplicates to avoid expensive + * iterating over the complex bucket. Below, we revisit all the complex + * buckets and handle it more cheaply after the complex bucket contents + * is sorted. */ + list = (MD_REF_DEF_LIST*) bucket; + if(list->n_ref_defs >= list->alloc_ref_defs) { + int alloc_ref_defs = list->alloc_ref_defs + list->alloc_ref_defs / 2; + MD_REF_DEF_LIST* list_tmp = (MD_REF_DEF_LIST*) realloc(list, + sizeof(MD_REF_DEF_LIST) + alloc_ref_defs * sizeof(MD_REF_DEF*)); + if(list_tmp == NULL) { + MD_LOG("realloc() failed."); + goto abort; + } + list = list_tmp; + list->alloc_ref_defs = alloc_ref_defs; + ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list; + } + + list->ref_defs[list->n_ref_defs] = def; + list->n_ref_defs++; + } + + /* Sort the complex buckets so we can use bsearch() with them. */ + for(i = 0; i < ctx->ref_def_hashtable_size; i++) { + void* bucket = ctx->ref_def_hashtable[i]; + MD_REF_DEF_LIST* list; + + if(bucket == NULL) + continue; + if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) + continue; + + list = (MD_REF_DEF_LIST*) bucket; + qsort(list->ref_defs, list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp_for_sort); + + /* Disable all duplicates in the complex bucket by forcing all such + * records to point to the 1st such ref. def. I.e. no matter which + * record is found during the lookup, it will always point to the right + * ref. def. in ctx->ref_defs[]. 
*/ + for(j = 1; j < list->n_ref_defs; j++) { + if(md_ref_def_cmp(&list->ref_defs[j-1], &list->ref_defs[j]) == 0) + list->ref_defs[j] = list->ref_defs[j-1]; + } + } + + return 0; + +abort: + return -1; +} + +static void +md_free_ref_def_hashtable(MD_CTX* ctx) +{ + if(ctx->ref_def_hashtable != NULL) { + int i; + + for(i = 0; i < ctx->ref_def_hashtable_size; i++) { + void* bucket = ctx->ref_def_hashtable[i]; + if(bucket == NULL) + continue; + if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) + continue; + free(bucket); + } + + free(ctx->ref_def_hashtable); + } +} + +static const MD_REF_DEF* +md_lookup_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size) +{ + unsigned hash; + void* bucket; + + if(ctx->ref_def_hashtable_size == 0) + return NULL; + + hash = md_link_label_hash(label, label_size); + bucket = ctx->ref_def_hashtable[hash % ctx->ref_def_hashtable_size]; + + if(bucket == NULL) { + return NULL; + } else if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) { + const MD_REF_DEF* def = (MD_REF_DEF*) bucket; + + if(md_link_label_cmp(def->label, def->label_size, label, label_size) == 0) + return def; + else + return NULL; + } else { + MD_REF_DEF_LIST* list = (MD_REF_DEF_LIST*) bucket; + MD_REF_DEF key_buf; + const MD_REF_DEF* key = &key_buf; + const MD_REF_DEF** ret; + + key_buf.label = (CHAR*) label; + key_buf.label_size = label_size; + key_buf.hash = md_link_label_hash(key_buf.label, key_buf.label_size); + + ret = (const MD_REF_DEF**) bsearch(&key, list->ref_defs, + list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp); + if(ret != NULL) + return *ret; + else + return NULL; + } +} + + +/*************************** + *** Recognizing Links *** + ***************************/ + +/* Note this code is partially shared between processing inlines and blocks + * as reference definitions and links share some helper parser functions. 
+ */ + +typedef struct MD_LINK_ATTR_tag MD_LINK_ATTR; +struct MD_LINK_ATTR_tag { + OFF dest_beg; + OFF dest_end; + + CHAR* title; + SZ title_size; + int title_needs_free; +}; + + +static int +md_is_link_label(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, + OFF* p_end, int* p_beg_line_index, int* p_end_line_index, + OFF* p_contents_beg, OFF* p_contents_end) +{ + OFF off = beg; + OFF contents_beg = 0; + OFF contents_end = 0; + int line_index = 0; + int len = 0; + + if(CH(off) != _T('[')) + return FALSE; + off++; + + while(1) { + OFF line_end = lines[line_index].end; + + while(off < line_end) { + if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) { + if(contents_end == 0) { + contents_beg = off; + *p_beg_line_index = line_index; + } + contents_end = off + 2; + off += 2; + } else if(CH(off) == _T('[')) { + return FALSE; + } else if(CH(off) == _T(']')) { + if(contents_beg < contents_end) { + /* Success. */ + *p_contents_beg = contents_beg; + *p_contents_end = contents_end; + *p_end = off+1; + *p_end_line_index = line_index; + return TRUE; + } else { + /* Link label must have some non-whitespace contents. 
*/ + return FALSE; + } + } else { + unsigned codepoint; + SZ char_size; + + codepoint = md_decode_unicode(ctx->text, off, ctx->size, &char_size); + if(!ISUNICODEWHITESPACE_(codepoint)) { + if(contents_end == 0) { + contents_beg = off; + *p_beg_line_index = line_index; + } + contents_end = off + char_size; + } + + off += char_size; + } + + len++; + if(len > 999) + return FALSE; + } + + line_index++; + len++; + if(line_index < n_lines) + off = lines[line_index].beg; + else + break; + } + + return FALSE; +} + +static int +md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, + OFF* p_contents_beg, OFF* p_contents_end) +{ + OFF off = beg; + + if(off >= max_end || CH(off) != _T('<')) + return FALSE; + off++; + + while(off < max_end) { + if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) { + off += 2; + continue; + } + + if(ISNEWLINE(off) || CH(off) == _T('<')) + return FALSE; + + if(CH(off) == _T('>')) { + /* Success. */ + *p_contents_beg = beg+1; + *p_contents_end = off; + *p_end = off+1; + return TRUE; + } + + off++; + } + + return FALSE; +} + +static int +md_is_link_destination_B(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, + OFF* p_contents_beg, OFF* p_contents_end) +{ + OFF off = beg; + int parenthesis_level = 0; + + while(off < max_end) { + if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) { + off += 2; + continue; + } + + if(ISWHITESPACE(off) || ISCNTRL(off)) + break; + + /* Link destination may include balanced pairs of unescaped '(' ')'. + * Note we limit the maximal nesting level by 32 to protect us from + * https://github.com/jgm/cmark/issues/214 */ + if(CH(off) == _T('(')) { + parenthesis_level++; + if(parenthesis_level > 32) + return FALSE; + } else if(CH(off) == _T(')')) { + if(parenthesis_level == 0) + break; + parenthesis_level--; + } + + off++; + } + + if(parenthesis_level != 0 || off == beg) + return FALSE; + + /* Success. 
*/ + *p_contents_beg = beg; + *p_contents_end = off; + *p_end = off; + return TRUE; +} + +static inline int +md_is_link_destination(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, + OFF* p_contents_beg, OFF* p_contents_end) +{ + if(CH(beg) == _T('<')) + return md_is_link_destination_A(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end); + else + return md_is_link_destination_B(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end); +} + +static int +md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, + OFF* p_end, int* p_beg_line_index, int* p_end_line_index, + OFF* p_contents_beg, OFF* p_contents_end) +{ + OFF off = beg; + CHAR closer_char; + int line_index = 0; + + /* White space with up to one line break. */ + while(off < lines[line_index].end && ISWHITESPACE(off)) + off++; + if(off >= lines[line_index].end) { + line_index++; + if(line_index >= n_lines) + return FALSE; + off = lines[line_index].beg; + } + if(off == beg) + return FALSE; + + *p_beg_line_index = line_index; + + /* First char determines how to detect end of it. */ + switch(CH(off)) { + case _T('"'): closer_char = _T('"'); break; + case _T('\''): closer_char = _T('\''); break; + case _T('('): closer_char = _T(')'); break; + default: return FALSE; + } + off++; + + *p_contents_beg = off; + + while(line_index < n_lines) { + OFF line_end = lines[line_index].end; + + while(off < line_end) { + if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) { + off++; + } else if(CH(off) == closer_char) { + /* Success. */ + *p_contents_end = off; + *p_end = off+1; + *p_end_line_index = line_index; + return TRUE; + } else if(closer_char == _T(')') && CH(off) == _T('(')) { + /* ()-style title cannot contain (unescaped '(')) */ + return FALSE; + } + + off++; + } + + line_index++; + } + + return FALSE; +} + +/* Returns 0 if it is not a reference definition. + * + * Returns N > 0 if it is a reference definition. 
N then corresponds to the + * number of lines forming it). In this case the definition is stored for + * resolving any links referring to it. + * + * Returns -1 in case of an error (out of memory). + */ +static int +md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines) +{ + OFF label_contents_beg; + OFF label_contents_end; + int label_contents_line_index = -1; + int label_is_multiline; + CHAR* label = NULL; + SZ label_size; + OFF dest_contents_beg; + OFF dest_contents_end; + OFF title_contents_beg; + OFF title_contents_end; + int title_contents_line_index; + int title_is_multiline; + OFF off; + int line_index = 0; + int tmp_line_index; + MD_REF_DEF* def; + int ret; + + /* Link label. */ + if(!md_is_link_label(ctx, lines, n_lines, lines[0].beg, + &off, &label_contents_line_index, &line_index, + &label_contents_beg, &label_contents_end)) + return FALSE; + label_is_multiline = (label_contents_line_index != line_index); + + /* Colon. */ + if(off >= lines[line_index].end || CH(off) != _T(':')) + return FALSE; + off++; + + /* Optional white space with up to one line break. */ + while(off < lines[line_index].end && ISWHITESPACE(off)) + off++; + if(off >= lines[line_index].end) { + line_index++; + if(line_index >= n_lines) + return FALSE; + off = lines[line_index].beg; + } + + /* Link destination. */ + if(!md_is_link_destination(ctx, off, lines[line_index].end, + &off, &dest_contents_beg, &dest_contents_end)) + return FALSE; + + /* (Optional) title. Note we interpret it as a title only if nothing + * more follows on its last line. */ + if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off, + &off, &title_contents_line_index, &tmp_line_index, + &title_contents_beg, &title_contents_end) + && off >= lines[line_index + tmp_line_index].end) + { + title_is_multiline = (tmp_line_index != title_contents_line_index); + title_contents_line_index += line_index; + line_index += tmp_line_index; + } else { + /* Not a title. 
*/ + title_is_multiline = FALSE; + title_contents_beg = off; + title_contents_end = off; + title_contents_line_index = 0; + } + + /* Nothing more can follow on the last line. */ + if(off < lines[line_index].end) + return FALSE; + + /* Construct label. */ + if(!label_is_multiline) { + label = (CHAR*) STR(label_contents_beg); + label_size = label_contents_end - label_contents_beg; + } else { + MD_CHECK(md_merge_lines_alloc(ctx, label_contents_beg, label_contents_end, + lines + label_contents_line_index, n_lines - label_contents_line_index, + _T(' '), &label, &label_size)); + } + + /* Store the reference definition. */ + if(ctx->n_ref_defs >= ctx->alloc_ref_defs) { + MD_REF_DEF* new_defs; + + ctx->alloc_ref_defs = (ctx->alloc_ref_defs > 0 + ? ctx->alloc_ref_defs + ctx->alloc_ref_defs / 2 + : 16); + new_defs = (MD_REF_DEF*) realloc(ctx->ref_defs, ctx->alloc_ref_defs * sizeof(MD_REF_DEF)); + if(new_defs == NULL) { + MD_LOG("realloc() failed."); + goto abort; + } + + ctx->ref_defs = new_defs; + } + + def = &ctx->ref_defs[ctx->n_ref_defs]; + memset(def, 0, sizeof(MD_REF_DEF)); + + def->label = label; + def->label_size = label_size; + + def->dest_beg = dest_contents_beg; + def->dest_end = dest_contents_end; + + if(title_contents_beg >= title_contents_end) { + def->title = NULL; + def->title_size = 0; + } else if(!title_is_multiline) { + def->title = (CHAR*) STR(title_contents_beg); + def->title_size = title_contents_end - title_contents_beg; + } else { + MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end, + lines + title_contents_line_index, n_lines - title_contents_line_index, + _T('\n'), &def->title, &def->title_size)); + } + + /* Success. */ + ctx->n_ref_defs++; + return line_index + 1; + +abort: + /* Failure. 
*/ + if(!IS_INPUT_STR(label)) + free(label); + return ret; +} + +static int +md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines, + OFF beg, OFF end, MD_LINK_ATTR* attr) +{ + const MD_REF_DEF* def; + const MD_LINE* beg_line; + const MD_LINE* end_line; + CHAR* label; + SZ label_size; + int ret; + + MD_ASSERT(CH(beg) == _T('[') || CH(beg) == _T('!')); + MD_ASSERT(CH(end-1) == _T(']')); + + beg += (CH(beg) == _T('!') ? 2 : 1); + end--; + + /* Find lines corresponding to the beg and end positions. */ + MD_ASSERT(lines[0].beg <= beg); + beg_line = lines; + while(beg >= beg_line->end) + beg_line++; + + MD_ASSERT(end <= lines[n_lines-1].end); + end_line = beg_line; + while(end >= end_line->end) + end_line++; + + if(beg_line != end_line) { + MD_CHECK(md_merge_lines_alloc(ctx, beg, end, beg_line, + n_lines - (beg_line - lines), _T(' '), &label, &label_size)); + } else { + label = (CHAR*) STR(beg); + label_size = end - beg; + } + + def = md_lookup_ref_def(ctx, label, label_size); + if(def != NULL) { + attr->dest_beg = def->dest_beg; + attr->dest_end = def->dest_end; + attr->title = def->title; + attr->title_size = def->title_size; + attr->title_needs_free = FALSE; + } + + if(!IS_INPUT_STR(label)) + free(label); + + ret = (def != NULL); + +abort: + return ret; +} + +static int +md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, int n_lines, + OFF beg, OFF* p_end, MD_LINK_ATTR* attr) +{ + int line_index = 0; + int tmp_line_index; + OFF title_contents_beg; + OFF title_contents_end; + int title_contents_line_index; + int title_is_multiline; + OFF off = beg; + int ret = FALSE; + + while(off >= lines[line_index].end) + line_index++; + + MD_ASSERT(CH(off) == _T('(')); + off++; + + /* Optional white space with up to one line break. 
*/ + while(off < lines[line_index].end && ISWHITESPACE(off)) + off++; + if(off >= lines[line_index].end && ISNEWLINE(off)) { + line_index++; + if(line_index >= n_lines) + return FALSE; + off = lines[line_index].beg; + } + + /* Link destination may be omitted, but only when not also having a title. */ + if(off < ctx->size && CH(off) == _T(')')) { + attr->dest_beg = off; + attr->dest_end = off; + attr->title = NULL; + attr->title_size = 0; + attr->title_needs_free = FALSE; + off++; + *p_end = off; + return TRUE; + } + + /* Link destination. */ + if(!md_is_link_destination(ctx, off, lines[line_index].end, + &off, &attr->dest_beg, &attr->dest_end)) + return FALSE; + + /* (Optional) title. */ + if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off, + &off, &title_contents_line_index, &tmp_line_index, + &title_contents_beg, &title_contents_end)) + { + title_is_multiline = (tmp_line_index != title_contents_line_index); + title_contents_line_index += line_index; + line_index += tmp_line_index; + } else { + /* Not a title. */ + title_is_multiline = FALSE; + title_contents_beg = off; + title_contents_end = off; + title_contents_line_index = 0; + } + + /* Optional whitespace followed with final ')'. 
*/ + while(off < lines[line_index].end && ISWHITESPACE(off)) + off++; + if(off >= lines[line_index].end && ISNEWLINE(off)) { + line_index++; + if(line_index >= n_lines) + return FALSE; + off = lines[line_index].beg; + } + if(CH(off) != _T(')')) + goto abort; + off++; + + if(title_contents_beg >= title_contents_end) { + attr->title = NULL; + attr->title_size = 0; + attr->title_needs_free = FALSE; + } else if(!title_is_multiline) { + attr->title = (CHAR*) STR(title_contents_beg); + attr->title_size = title_contents_end - title_contents_beg; + attr->title_needs_free = FALSE; + } else { + MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end, + lines + title_contents_line_index, n_lines - title_contents_line_index, + _T('\n'), &attr->title, &attr->title_size)); + attr->title_needs_free = TRUE; + } + + *p_end = off; + ret = TRUE; + +abort: + return ret; +} + +static void +md_free_ref_defs(MD_CTX* ctx) +{ + int i; + + for(i = 0; i < ctx->n_ref_defs; i++) { + MD_REF_DEF* def = &ctx->ref_defs[i]; + + if(!IS_INPUT_STR(def->label)) + free(def->label); + if(!IS_INPUT_STR(def->title)) + free(def->title); + } + + free(ctx->ref_defs); +} + + +/****************************************** + *** Processing Inlines (a.k.a Spans) *** + ******************************************/ + +/* We process inlines in few phases: + * + * (1) We go through the block text and collect all significant characters + * which may start/end a span or some other significant position into + * ctx->marks[]. Core of this is what md_collect_marks() does. + * + * We also do some very brief preliminary context-less analysis, whether + * it might be opener or closer (e.g. of an emphasis span). + * + * This speeds the other steps as we do not need to re-iterate over all + * characters anymore. + * + * (2) We analyze each potential mark types, in order by their precedence. 
+ *
+ *       In each md_analyze_XXX() function, we re-iterate list of the marks,
+ *       skipping already resolved regions (in preceding precedences) and try to
+ *       resolve them.
+ *
+ *   (2.1) For trivial marks, which are single (e.g. HTML entity), we just mark
+ *         them as resolved.
+ *
+ *   (2.2) For range-type marks, we analyze whether the mark could be closer
+ *         and, if yes, whether there is some preceding opener it could satisfy.
+ *
+ *         If not we check whether it could be really an opener and if yes, we
+ *         remember it so subsequent closers may resolve it.
+ *
+ *   (3) Finally, when all marks were analyzed, we render the block contents
+ *       by calling MD_RENDERER::text() callback, interrupting by ::enter_span()
+ *       or ::close_span() whenever we reach a resolved mark.
+ */
+
+
+/* The mark structure.
+ *
+ * The member 'ch' tells what kind of mark it is:
+ *
+ * '\\': Maybe escape sequence.
+ * '\0': NULL char.
+ *  '*': Maybe (strong) emphasis start/end.
+ *  '_': Maybe (strong) emphasis start/end.
+ *  '~': Maybe strikethrough start/end (needs MD_FLAG_STRIKETHROUGH).
+ *  '`': Maybe code span start/end.
+ *  '&': Maybe start of entity.
+ *  ';': Maybe end of entity.
+ *  '<': Maybe start of raw HTML or autolink.
+ *  '>': Maybe end of raw HTML or autolink.
+ *  '[': Maybe start of link label or link text.
+ *  '!': Equivalent of '[' for image.
+ *  ']': Maybe end of link label or link text.
+ *  '@': Maybe permissive e-mail auto-link (needs MD_FLAG_PERMISSIVEEMAILAUTOLINKS).
+ *  ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS).
+ *  '.': Maybe permissive WWW auto-link (needs MD_FLAG_PERMISSIVEWWWAUTOLINKS).
+ *  '$': Maybe equation (LaTeX math span) start/end (needs
+ *       MD_FLAG_LATEXMATHSPANS).
+ *  '|': Maybe table cell boundary or wiki link label delimiter (needs
+ *       MD_FLAG_TABLES or MD_FLAG_WIKILINKS).
+ *  'D': Dummy mark, it reserves a space for splitting a previous mark
+ *       (e.g. emphasis) or to make more space for storing some special data
+ *       related to the preceding mark (e.g. link).
+ *
+ * Additionally, md_collect_marks() creates a mark for a non-trivial run of
+ * whitespace (only with MD_FLAG_COLLAPSEWHITESPACE; 'ch' is then the first
+ * whitespace character of the run) and it terminates the mark vector with
+ * a resolved dummy mark whose 'ch' is 127.
+ *
+ * Note that not all instances of these chars in the text imply creation of the
+ * structure. Only those which have (or may have, after we see more context)
+ * the special meaning.
+ *
+ * (Keep this struct as small as possible to fit as much of them into CPU
+ * cache line.)
+ */
+struct MD_MARK_tag {
+    OFF beg;                /* Offset of the beginning of the mark. */
+    OFF end;                /* Offset just after the mark; equal to 'beg' for
+                             * empty marks (e.g. the dummy ones). */
+
+    /* For unresolved openers, 'prev' and 'next' form the chain of open openers
+     * of given type 'ch'.
+     *
+     * During resolving, we disconnect from the chain and point to the
+     * corresponding counterpart so opener points to its closer and vice versa.
+     *
+     * Both are indexes into ctx->marks[]; -1 means "none".
+     */
+    int prev;
+    int next;
+    CHAR ch;                /* The mark type; see the list above. */
+    unsigned char flags;    /* Bitmask of the MD_MARK_xxxx flags below. */
+};
+
+/* Mark flags (these apply to ALL mark types). */
+#define MD_MARK_POTENTIAL_OPENER            0x01  /* Maybe opener. */
+#define MD_MARK_POTENTIAL_CLOSER            0x02  /* Maybe closer. */
+#define MD_MARK_OPENER                      0x04  /* Definitely opener. */
+#define MD_MARK_CLOSER                      0x08  /* Definitely closer. */
+#define MD_MARK_RESOLVED                    0x10  /* Resolved in any definite way. */
+
+/* Mark flags specific for various mark types (so they can share bits). */
+#define MD_MARK_EMPH_INTRAWORD              0x20  /* Helper for the "rule of 3". */
+#define MD_MARK_EMPH_MOD3_0                 0x40
+#define MD_MARK_EMPH_MOD3_1                 0x80
+#define MD_MARK_EMPH_MOD3_2                 (0x40 | 0x80)
+#define MD_MARK_EMPH_MOD3_MASK              (0x40 | 0x80)
+#define MD_MARK_AUTOLINK                    0x20  /* Distinguisher for '<', '>'. */
+#define MD_MARK_VALIDPERMISSIVEAUTOLINK     0x20  /* For permissive autolinks.
*/
+/* Select the chain of unresolved '*' openers for marks having the given
+ * flags. There is one chain for every combination of MD_MARK_EMPH_INTRAWORD
+ * being set or not and of the three MD_MARK_EMPH_MOD3_xxxx values. Any other
+ * flag bits are ignored.
+ */
+static MD_MARKCHAIN*
+md_asterisk_chain(MD_CTX* ctx, unsigned flags)
+{
+    switch(flags & (MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_MASK)) {
+        case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_0:  return &ASTERISK_OPENERS_intraword_mod3_0;
+        case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_1:  return &ASTERISK_OPENERS_intraword_mod3_1;
+        case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_2:  return &ASTERISK_OPENERS_intraword_mod3_2;
+        case MD_MARK_EMPH_MOD3_0:                           return &ASTERISK_OPENERS_extraword_mod3_0;
+        case MD_MARK_EMPH_MOD3_1:                           return &ASTERISK_OPENERS_extraword_mod3_1;
+        case MD_MARK_EMPH_MOD3_2:                           return &ASTERISK_OPENERS_extraword_mod3_2;
+        default:                                            MD_UNREACHABLE();
+    }
+    return NULL;
+}
+/* Return the opener chain which the mark ctx->marks[mark_index] belongs to,
+ * chosen by the mark character (for '*' also by its flags and for '~' by its
+ * length), or NULL if the mark is of a type which has no chain.
+ */
+static MD_MARKCHAIN*
+md_mark_chain(MD_CTX* ctx, int mark_index)
+{
+    MD_MARK* mark = &ctx->marks[mark_index];
+
+    switch(mark->ch) {
+        case _T('*'):   return md_asterisk_chain(ctx, mark->flags);
+        case _T('_'):   return &UNDERSCORE_OPENERS;
+        case _T('~'):   return (mark->end - mark->beg == 1) ? &TILDE_OPENERS_1 : &TILDE_OPENERS_2;
+        case _T('['):   return &BRACKET_OPENERS;
+        case _T('|'):   return &TABLECELLBOUNDARIES;
+        default:        return NULL;
+    }
+}
+/* Reserve one more slot at the end of ctx->marks[] and return a pointer to
+ * it. The slot is not initialized. When the array is full, it is grown by
+ * one half of its capacity (or to 64 slots on the first use).
+ *
+ * Returns NULL if realloc() fails.
+ */
+static MD_MARK*
+md_push_mark(MD_CTX* ctx)
+{
+    if(ctx->n_marks >= ctx->alloc_marks) {
+        MD_MARK* new_marks;
+
+        ctx->alloc_marks = (ctx->alloc_marks > 0
+                ? ctx->alloc_marks + ctx->alloc_marks / 2
+                : 64);
+        new_marks = realloc(ctx->marks, ctx->alloc_marks * sizeof(MD_MARK));
+        if(new_marks == NULL) {
+            MD_LOG("realloc() failed.");
+            return NULL;
+        }
+
+        ctx->marks = new_marks;
+    }
+
+    return &ctx->marks[ctx->n_marks++];
+}
+/* Helper macros for pushing a new mark. They expect the calling function
+ * to provide the variables 'ctx', 'mark' and 'ret' and the label 'abort':
+ * on allocation failure 'ret' is set to -1 and control jumps to 'abort'.
+ * PUSH_MARK() additionally initializes the new mark as an unlinked one
+ * (prev and next are -1).
+ */
+#define PUSH_MARK_()                                                    \
+        do {                                                            \
+            mark = md_push_mark(ctx);                                   \
+            if(mark == NULL) {                                          \
+                ret = -1;                                               \
+                goto abort;                                             \
+            }                                                           \
+        } while(0)
+
+#define PUSH_MARK(ch_, beg_, end_, flags_)                              \
+        do {                                                            \
+            PUSH_MARK_();                                               \
+            mark->beg = (beg_);                                         \
+            mark->end = (end_);                                         \
+            mark->prev = -1;                                            \
+            mark->next = -1;                                            \
+            mark->ch = (char)(ch_);                                     \
+            mark->flags = (flags_);                                     \
+        } while(0)
+
+/* Append the mark ctx->marks[mark_index] at the tail of the given chain.
+ */
+static void
+md_mark_chain_append(MD_CTX* ctx, MD_MARKCHAIN* chain, int mark_index)
+{
+    if(chain->tail >= 0)
+        ctx->marks[chain->tail].next = mark_index;
+    else
+        chain->head = mark_index;
+
+    ctx->marks[mark_index].prev = chain->tail;
+    ctx->marks[mark_index].next = -1;
+    chain->tail = mark_index;
+}
+
+/* Sometimes, we need to store a pointer into the mark. It is quite rare
+ * so we do not bother to make MD_MARK use union, and it can only happen
+ * for dummy marks.
+ *
+ * The pointer is copied over the beginning of the mark structure and it can
+ * be read back with md_mark_get_ptr(). */
+static inline void
+md_mark_store_ptr(MD_CTX* ctx, int mark_index, void* ptr)
+{
+    MD_MARK* mark = &ctx->marks[mark_index];
+    MD_ASSERT(mark->ch == 'D');
+
+    /* Check only members beg and end are misused for this. */
+    MD_ASSERT(sizeof(void*) <= 2 * sizeof(OFF));
+    memcpy(mark, &ptr, sizeof(void*));
+}
+/* Read back the pointer stored into the dummy mark ctx->marks[mark_index]
+ * by md_mark_store_ptr().
+ */
+static inline void*
+md_mark_get_ptr(MD_CTX* ctx, int mark_index)
+{
+    void* ptr;
+    MD_MARK* mark = &ctx->marks[mark_index];
+    MD_ASSERT(mark->ch == 'D');
+    memcpy(&ptr, mark, sizeof(void*));
+    return ptr;
+}
+/* Resolve the marks at opener_index and closer_index as a pair: the opener
+ * is unlinked from the chain (unless chain is NULL), then the two marks are
+ * pointed at each other and flagged as a resolved opener and closer.
+ */
+static void
+md_resolve_range(MD_CTX* ctx, MD_MARKCHAIN* chain, int opener_index, int closer_index)
+{
+    MD_MARK* opener = &ctx->marks[opener_index];
+    MD_MARK* closer = &ctx->marks[closer_index];
+
+    /* Remove opener from the list of openers. */
+    if(chain != NULL) {
+        if(opener->prev >= 0)
+            ctx->marks[opener->prev].next = opener->next;
+        else
+            chain->head = opener->next;
+
+        if(opener->next >= 0)
+            ctx->marks[opener->next].prev = opener->prev;
+        else
+            chain->tail = opener->prev;
+    }
+
+    /* Interconnect opener and closer and mark both as resolved. */
+    opener->next = closer_index;
+    opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
+    closer->prev = opener_index;
+    closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
+}
+
+/* Values of the parameter 'how' of md_rollback(). */
+#define MD_ROLLBACK_ALL         0
+#define MD_ROLLBACK_CROSSING    1
+
+/* In the range ctx->marks[opener_index] ... [closer_index], undo some or all
+ * resolvings accordingly to these rules:
+ *
+ * (1) All openers BEFORE the range corresponding to any closer inside the
+ *     range are un-resolved and they are re-added to their respective chains
+ *     of unresolved openers. This ensures we can reuse the opener for closers
+ *     AFTER the range.
+ *
+ * (2) If 'how' is MD_ROLLBACK_ALL, then ALL resolved marks inside the range
+ *     are discarded.
+ *
+ * (3) If 'how' is MD_ROLLBACK_CROSSING, only closers with openers handled
+ *     in (1) are discarded. I.e. pairs of openers and closers which are both
+ *     inside the range are retained as well as any unpaired marks.
+ *
+ * The marks at opener_index and closer_index themselves are not touched;
+ * only the marks strictly between them are visited.
+ */
+static void
+md_rollback(MD_CTX* ctx, int opener_index, int closer_index, int how)
+{
+    int i;
+    int mark_index;
+
+    /* Cut all unresolved openers at the mark index. */
+    for(i = OPENERS_CHAIN_FIRST; i < OPENERS_CHAIN_LAST+1; i++) {
+        MD_MARKCHAIN* chain = &ctx->mark_chains[i];
+
+        while(chain->tail >= opener_index)
+            chain->tail = ctx->marks[chain->tail].prev;
+
+        if(chain->tail >= 0)
+            ctx->marks[chain->tail].next = -1;
+        else
+            chain->head = -1;
+    }
+
+    /* Go backwards so that unresolved openers are re-added into their
+     * respective chains, in the right order. */
+    mark_index = closer_index - 1;
+    while(mark_index > opener_index) {
+        MD_MARK* mark = &ctx->marks[mark_index];
+        int mark_flags = mark->flags;
+        int discard_flag = (how == MD_ROLLBACK_ALL);
+
+        if(mark->flags & MD_MARK_CLOSER) {
+            int mark_opener_index = mark->prev;
+
+            /* Undo opener BEFORE the range.
+             *
+             * NOTE(review): the chain is looked up for opener_index (the
+             * opener of the range being rolled back), while the mark appended
+             * to it is mark_opener_index. Rule (1) above speaks about the
+             * openers' "respective chains" -- confirm the lookup is intended
+             * when the two marks are of different types. */
+            if(mark_opener_index < opener_index) {
+                MD_MARK* mark_opener = &ctx->marks[mark_opener_index];
+                MD_MARKCHAIN* chain;
+
+                mark_opener->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
+                chain = md_mark_chain(ctx, opener_index);
+                if(chain != NULL) {
+                    md_mark_chain_append(ctx, chain, mark_opener_index);
+                    discard_flag = 1;
+                }
+            }
+        }
+
+        /* And reset our flags. */
+        if(discard_flag)
+            mark->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
+
+        /* Jump as far as we can over unresolved or non-interesting marks.
+         * (The decision uses mark_flags, i.e. the flags as they were before
+         * the reset above.) */
+        switch(how) {
+            case MD_ROLLBACK_CROSSING:
+                if((mark_flags & MD_MARK_CLOSER) && mark->prev > opener_index) {
+                    /* If we are closer with opener INSIDE the range, there may
+                     * not be any other crosser inside the subrange. */
+                    mark_index = mark->prev;
+                    break;
+                }
+                /* Pass through. */
+            default:
+                mark_index--;
+                break;
+        }
+    }
+}
+/* Fill ctx->mark_char_map[], the lookup table used by md_collect_marks() to
+ * skip quickly over characters which cannot start any mark: the entry is set
+ * to 1 for every character which may form a mark, given the parser flags.
+ */
+static void
+md_build_mark_char_map(MD_CTX* ctx)
+{
+    memset(ctx->mark_char_map, 0, sizeof(ctx->mark_char_map));
+
+    ctx->mark_char_map['\\'] = 1;
+    ctx->mark_char_map['*'] = 1;
+    ctx->mark_char_map['_'] = 1;
+    ctx->mark_char_map['`'] = 1;
+    ctx->mark_char_map['&'] = 1;
+    ctx->mark_char_map[';'] = 1;
+    ctx->mark_char_map['<'] = 1;
+    ctx->mark_char_map['>'] = 1;
+    ctx->mark_char_map['['] = 1;
+    ctx->mark_char_map['!'] = 1;
+    ctx->mark_char_map[']'] = 1;
+    ctx->mark_char_map['\0'] = 1;
+
+    if(ctx->parser.flags & MD_FLAG_STRIKETHROUGH)
+        ctx->mark_char_map['~'] = 1;
+
+    if(ctx->parser.flags & MD_FLAG_LATEXMATHSPANS)
+        ctx->mark_char_map['$'] = 1;
+
+    if(ctx->parser.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS)
+        ctx->mark_char_map['@'] = 1;
+
+    if(ctx->parser.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS)
+        ctx->mark_char_map[':'] = 1;
+
+    if(ctx->parser.flags & MD_FLAG_PERMISSIVEWWWAUTOLINKS)
+        ctx->mark_char_map['.'] = 1;
+
+    if((ctx->parser.flags & MD_FLAG_TABLES) || (ctx->parser.flags & MD_FLAG_WIKILINKS))
+        ctx->mark_char_map['|'] = 1;
+
+    if(ctx->parser.flags & MD_FLAG_COLLAPSEWHITESPACE) {
+        int i;
+
+        for(i = 0; i < (int) sizeof(ctx->mark_char_map); i++) {
+            if(ISWHITESPACE_(i))
+                ctx->mark_char_map[i] = 1;
+        }
+    }
+}
+
+/* We limit code span marks to lower then 32 backticks. This solves the
+ * pathologic case of too many openers, each of different length: Their
+ * resolving would be then O(n^2).
*/ +#define CODESPAN_MARK_MAXLEN 32 + +static int +md_is_code_span(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, + OFF* p_opener_beg, OFF* p_opener_end, + OFF* p_closer_beg, OFF* p_closer_end, + OFF last_potential_closers[CODESPAN_MARK_MAXLEN], + int* p_reached_paragraph_end) +{ + OFF opener_beg = beg; + OFF opener_end; + OFF closer_beg; + OFF closer_end; + SZ mark_len; + OFF line_end; + int has_space_after_opener = FALSE; + int has_eol_after_opener = FALSE; + int has_space_before_closer = FALSE; + int has_eol_before_closer = FALSE; + int has_only_space = TRUE; + int line_index = 0; + + line_end = lines[0].end; + opener_end = opener_beg; + while(opener_end < line_end && CH(opener_end) == _T('`')) + opener_end++; + has_space_after_opener = (opener_end < line_end && CH(opener_end) == _T(' ')); + has_eol_after_opener = (opener_end == line_end); + + /* The caller needs to know end of the opening mark even if we fail. */ + *p_opener_end = opener_end; + + mark_len = opener_end - opener_beg; + if(mark_len > CODESPAN_MARK_MAXLEN) + return FALSE; + + /* Check whether we already know there is no closer of this length. + * If so, re-scan does no sense. This fixes issue #59. */ + if(last_potential_closers[mark_len-1] >= lines[n_lines-1].end || + (*p_reached_paragraph_end && last_potential_closers[mark_len-1] < opener_end)) + return FALSE; + + closer_beg = opener_end; + closer_end = opener_end; + + /* Find closer mark. */ + while(TRUE) { + while(closer_beg < line_end && CH(closer_beg) != _T('`')) { + if(CH(closer_beg) != _T(' ')) + has_only_space = FALSE; + closer_beg++; + } + closer_end = closer_beg; + while(closer_end < line_end && CH(closer_end) == _T('`')) + closer_end++; + + if(closer_end - closer_beg == mark_len) { + /* Success. 
*/ + has_space_before_closer = (closer_beg > lines[line_index].beg && CH(closer_beg-1) == _T(' ')); + has_eol_before_closer = (closer_beg == lines[line_index].beg); + break; + } + + if(closer_end - closer_beg > 0) { + /* We have found a back-tick which is not part of the closer. */ + has_only_space = FALSE; + + /* But if we eventually fail, remember it as a potential closer + * of its own length for future attempts. This mitigates needs for + * rescans. */ + if(closer_end - closer_beg < CODESPAN_MARK_MAXLEN) { + if(closer_beg > last_potential_closers[closer_end - closer_beg - 1]) + last_potential_closers[closer_end - closer_beg - 1] = closer_beg; + } + } + + if(closer_end >= line_end) { + line_index++; + if(line_index >= n_lines) { + /* Reached end of the paragraph and still nothing. */ + *p_reached_paragraph_end = TRUE; + return FALSE; + } + /* Try on the next line. */ + line_end = lines[line_index].end; + closer_beg = lines[line_index].beg; + } else { + closer_beg = closer_end; + } + } + + /* If there is a space or a new line both after and before the opener + * (and if the code span is not made of spaces only), consume one initial + * and one trailing space as part of the marks. */ + if(!has_only_space && + (has_space_after_opener || has_eol_after_opener) && + (has_space_before_closer || has_eol_before_closer)) + { + if(has_space_after_opener) + opener_end++; + else + opener_end = lines[1].beg; + + if(has_space_before_closer) + closer_beg--; + else { + closer_beg = lines[line_index-1].end; + /* We need to eat the preceding "\r\n" but not any line trailing + * spaces. */ + while(closer_beg < ctx->size && ISBLANK(closer_beg)) + closer_beg++; + } + } + + *p_opener_beg = opener_beg; + *p_opener_end = opener_end; + *p_closer_beg = closer_beg; + *p_closer_end = closer_end; + return TRUE; +} + +static int +md_is_autolink_uri(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end) +{ + OFF off = beg+1; + + MD_ASSERT(CH(beg) == _T('<')); + + /* Check for scheme. 
*/ + if(off >= max_end || !ISASCII(off)) + return FALSE; + off++; + while(1) { + if(off >= max_end) + return FALSE; + if(off - beg > 32) + return FALSE; + if(CH(off) == _T(':') && off - beg >= 3) + break; + if(!ISALNUM(off) && CH(off) != _T('+') && CH(off) != _T('-') && CH(off) != _T('.')) + return FALSE; + off++; + } + + /* Check the path after the scheme. */ + while(off < max_end && CH(off) != _T('>')) { + if(ISWHITESPACE(off) || ISCNTRL(off) || CH(off) == _T('<')) + return FALSE; + off++; + } + + if(off >= max_end) + return FALSE; + + MD_ASSERT(CH(off) == _T('>')); + *p_end = off+1; + return TRUE; +} + +static int +md_is_autolink_email(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end) +{ + OFF off = beg + 1; + int label_len; + + MD_ASSERT(CH(beg) == _T('<')); + + /* The code should correspond to this regexp: + /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+ + @[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])? + (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/ + */ + + /* Username (before '@'). */ + while(off < max_end && (ISALNUM(off) || ISANYOF(off, _T(".!#$%&'*+/=?^_`{|}~-")))) + off++; + if(off <= beg+1) + return FALSE; + + /* '@' */ + if(off >= max_end || CH(off) != _T('@')) + return FALSE; + off++; + + /* Labels delimited with '.'; each label is sequence of 1 - 63 alnum + * characters or '-', but '-' is not allowed as first or last char. 
*/ + label_len = 0; + while(off < max_end) { + if(ISALNUM(off)) + label_len++; + else if(CH(off) == _T('-') && label_len > 0) + label_len++; + else if(CH(off) == _T('.') && label_len > 0 && CH(off-1) != _T('-')) + label_len = 0; + else + break; + + if(label_len > 63) + return FALSE; + + off++; + } + + if(label_len <= 0 || off >= max_end || CH(off) != _T('>') || CH(off-1) == _T('-')) + return FALSE; + + *p_end = off+1; + return TRUE; +} + +static int +md_is_autolink(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, int* p_missing_mailto) +{ + if(md_is_autolink_uri(ctx, beg, max_end, p_end)) { + *p_missing_mailto = FALSE; + return TRUE; + } + + if(md_is_autolink_email(ctx, beg, max_end, p_end)) { + *p_missing_mailto = TRUE; + return TRUE; + } + + return FALSE; +} + +static int +md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode) +{ + int i; + int ret = 0; + MD_MARK* mark; + OFF codespan_last_potential_closers[CODESPAN_MARK_MAXLEN] = { 0 }; + int codespan_scanned_till_paragraph_end = FALSE; + + for(i = 0; i < n_lines; i++) { + const MD_LINE* line = &lines[i]; + OFF off = line->beg; + OFF line_end = line->end; + + while(TRUE) { + CHAR ch; + +#ifdef MD4C_USE_UTF16 + /* For UTF-16, mark_char_map[] covers only ASCII. */ + #define IS_MARK_CHAR(off) ((CH(off) < SIZEOF_ARRAY(ctx->mark_char_map)) && \ + (ctx->mark_char_map[(unsigned char) CH(off)])) +#else + /* For 8-bit encodings, mark_char_map[] covers all 256 elements. */ + #define IS_MARK_CHAR(off) (ctx->mark_char_map[(unsigned char) CH(off)]) +#endif + + /* Optimization: Use some loop unrolling. */ + while(off + 3 < line_end && !IS_MARK_CHAR(off+0) && !IS_MARK_CHAR(off+1) + && !IS_MARK_CHAR(off+2) && !IS_MARK_CHAR(off+3)) + off += 4; + while(off < line_end && !IS_MARK_CHAR(off+0)) + off++; + + if(off >= line_end) + break; + + ch = CH(off); + + /* A backslash escape. + * It can go beyond line->end as it may involve escaped new + * line to form a hard break. 
*/ + if(ch == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) { + /* Hard-break cannot be on the last line of the block. */ + if(!ISNEWLINE(off+1) || i+1 < n_lines) + PUSH_MARK(ch, off, off+2, MD_MARK_RESOLVED); + off += 2; + continue; + } + + /* A potential (string) emphasis start/end. */ + if(ch == _T('*') || ch == _T('_')) { + OFF tmp = off+1; + int left_level; /* What precedes: 0 = whitespace; 1 = punctuation; 2 = other char. */ + int right_level; /* What follows: 0 = whitespace; 1 = punctuation; 2 = other char. */ + + while(tmp < line_end && CH(tmp) == ch) + tmp++; + + if(off == line->beg || ISUNICODEWHITESPACEBEFORE(off)) + left_level = 0; + else if(ISUNICODEPUNCTBEFORE(off)) + left_level = 1; + else + left_level = 2; + + if(tmp == line_end || ISUNICODEWHITESPACE(tmp)) + right_level = 0; + else if(ISUNICODEPUNCT(tmp)) + right_level = 1; + else + right_level = 2; + + /* Intra-word underscore doesn't have special meaning. */ + if(ch == _T('_') && left_level == 2 && right_level == 2) { + left_level = 0; + right_level = 0; + } + + if(left_level != 0 || right_level != 0) { + unsigned flags = 0; + + if(left_level > 0 && left_level >= right_level) + flags |= MD_MARK_POTENTIAL_CLOSER; + if(right_level > 0 && right_level >= left_level) + flags |= MD_MARK_POTENTIAL_OPENER; + if(left_level == 2 && right_level == 2) + flags |= MD_MARK_EMPH_INTRAWORD; + + /* For "the rule of three" we need to remember the original + * size of the mark (modulo three), before we potentially + * split the mark when being later resolved partially by some + * shorter closer. */ + switch((tmp - off) % 3) { + case 0: flags |= MD_MARK_EMPH_MOD3_0; break; + case 1: flags |= MD_MARK_EMPH_MOD3_1; break; + case 2: flags |= MD_MARK_EMPH_MOD3_2; break; + } + + PUSH_MARK(ch, off, tmp, flags); + + /* During resolving, multiple asterisks may have to be + * split into independent span start/ends. Consider e.g. + * "**foo* bar*". 
Therefore we push also some empty dummy + * marks to have enough space for that. */ + off++; + while(off < tmp) { + PUSH_MARK('D', off, off, 0); + off++; + } + continue; + } + + off = tmp; + continue; + } + + /* A potential code span start/end. */ + if(ch == _T('`')) { + OFF opener_beg, opener_end; + OFF closer_beg, closer_end; + int is_code_span; + + is_code_span = md_is_code_span(ctx, lines + i, n_lines - i, off, + &opener_beg, &opener_end, &closer_beg, &closer_end, + codespan_last_potential_closers, + &codespan_scanned_till_paragraph_end); + if(is_code_span) { + PUSH_MARK(_T('`'), opener_beg, opener_end, MD_MARK_OPENER | MD_MARK_RESOLVED); + PUSH_MARK(_T('`'), closer_beg, closer_end, MD_MARK_CLOSER | MD_MARK_RESOLVED); + ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1; + ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2; + + off = closer_end; + + /* Advance the current line accordingly. */ + while(off > line_end) { + i++; + line++; + line_end = line->end; + } + continue; + } + + off = opener_end; + continue; + } + + /* A potential entity start. */ + if(ch == _T('&')) { + PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER); + off++; + continue; + } + + /* A potential entity end. */ + if(ch == _T(';')) { + /* We surely cannot be entity unless the previous mark is '&'. */ + if(ctx->n_marks > 0 && ctx->marks[ctx->n_marks-1].ch == _T('&')) + PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER); + + off++; + continue; + } + + /* A potential autolink or raw HTML start/end. */ + if(ch == _T('<')) { + int is_autolink; + OFF autolink_end; + int missing_mailto; + + if(!(ctx->parser.flags & MD_FLAG_NOHTMLSPANS)) { + int is_html; + OFF html_end; + + /* Given the nature of the raw HTML, we have to recognize + * it here. Doing so later in md_analyze_lt_gt() could + * open can of worms of quadratic complexity. 
*/ + is_html = md_is_html_any(ctx, lines + i, n_lines - i, off, + lines[n_lines-1].end, &html_end); + if(is_html) { + PUSH_MARK(_T('<'), off, off, MD_MARK_OPENER | MD_MARK_RESOLVED); + PUSH_MARK(_T('>'), html_end, html_end, MD_MARK_CLOSER | MD_MARK_RESOLVED); + ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1; + ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2; + off = html_end; + + /* Advance the current line accordingly. */ + while(off > line_end) { + i++; + line++; + line_end = line->end; + } + continue; + } + } + + is_autolink = md_is_autolink(ctx, off, lines[n_lines-1].end, + &autolink_end, &missing_mailto); + if(is_autolink) { + PUSH_MARK((missing_mailto ? _T('@') : _T('<')), off, off+1, + MD_MARK_OPENER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK); + PUSH_MARK(_T('>'), autolink_end-1, autolink_end, + MD_MARK_CLOSER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK); + ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1; + ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2; + off = autolink_end; + continue; + } + + off++; + continue; + } + + /* A potential link or its part. */ + if(ch == _T('[') || (ch == _T('!') && off+1 < line_end && CH(off+1) == _T('['))) { + OFF tmp = (ch == _T('[') ? off+1 : off+2); + PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER); + off = tmp; + /* Two dummies to make enough place for data we need if it is + * a link. */ + PUSH_MARK('D', off, off, 0); + PUSH_MARK('D', off, off, 0); + continue; + } + if(ch == _T(']')) { + PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER); + off++; + continue; + } + + /* A potential permissive e-mail autolink. */ + if(ch == _T('@')) { + if(line->beg + 1 <= off && ISALNUM(off-1) && + off + 3 < line->end && ISALNUM(off+1)) + { + PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER); + /* Push a dummy as a reserve for a closer. */ + PUSH_MARK('D', off, off, 0); + } + + off++; + continue; + } + + /* A potential permissive URL autolink. 
*/ + if(ch == _T(':')) { + static struct { + const CHAR* scheme; + SZ scheme_size; + const CHAR* suffix; + SZ suffix_size; + } scheme_map[] = { + /* In the order from the most frequently used, arguably. */ + { _T("http"), 4, _T("//"), 2 }, + { _T("https"), 5, _T("//"), 2 }, + { _T("ftp"), 3, _T("//"), 2 } + }; + int scheme_index; + + for(scheme_index = 0; scheme_index < (int) SIZEOF_ARRAY(scheme_map); scheme_index++) { + const CHAR* scheme = scheme_map[scheme_index].scheme; + const SZ scheme_size = scheme_map[scheme_index].scheme_size; + const CHAR* suffix = scheme_map[scheme_index].suffix; + const SZ suffix_size = scheme_map[scheme_index].suffix_size; + + if(line->beg + scheme_size <= off && md_ascii_eq(STR(off-scheme_size), scheme, scheme_size) && + (line->beg + scheme_size == off || ISWHITESPACE(off-scheme_size-1) || ISANYOF(off-scheme_size-1, _T("*_~(["))) && + off + 1 + suffix_size < line->end && md_ascii_eq(STR(off+1), suffix, suffix_size)) + { + PUSH_MARK(ch, off-scheme_size, off+1+suffix_size, MD_MARK_POTENTIAL_OPENER); + /* Push a dummy as a reserve for a closer. */ + PUSH_MARK('D', off, off, 0); + off += 1 + suffix_size; + continue; + } + } + + off++; + continue; + } + + /* A potential permissive WWW autolink. */ + if(ch == _T('.')) { + if(line->beg + 3 <= off && md_ascii_eq(STR(off-3), _T("www"), 3) && + (line->beg + 3 == off || ISWHITESPACE(off-4) || ISANYOF(off-4, _T("*_~(["))) && + off + 1 < line_end) + { + PUSH_MARK(ch, off-3, off+1, MD_MARK_POTENTIAL_OPENER); + /* Push a dummy as a reserve for a closer. */ + PUSH_MARK('D', off, off, 0); + off++; + continue; + } + + off++; + continue; + } + + /* A potential table cell boundary or wiki link label delimiter. */ + if((table_mode || ctx->parser.flags & MD_FLAG_WIKILINKS) && ch == _T('|')) { + PUSH_MARK(ch, off, off+1, 0); + off++; + continue; + } + + /* A potential strikethrough start/end. 
*/
+            if(ch == _T('~')) {
+                OFF tmp = off+1;
+
+                while(tmp < line_end && CH(tmp) == _T('~'))
+                    tmp++;
+
+                if(tmp - off < 3) {     /* Only runs of one or two '~' become marks. */
+                    unsigned flags = 0;
+
+                    if(tmp < line_end && !ISUNICODEWHITESPACE(tmp))
+                        flags |= MD_MARK_POTENTIAL_OPENER;
+                    if(off > line->beg && !ISUNICODEWHITESPACEBEFORE(off))
+                        flags |= MD_MARK_POTENTIAL_CLOSER;
+                    if(flags != 0)
+                        PUSH_MARK(ch, off, tmp, flags);
+                }
+
+                off = tmp;
+                continue;
+            }
+
+            /* A potential equation start/end. */
+            if(ch == _T('$')) {
+                /* We can have at most two consecutive $ signs,
+                 * where two dollar signs signify a display equation.
+                 * Longer runs are skipped without pushing any mark. */
+                OFF tmp = off+1;
+
+                while(tmp < line_end && CH(tmp) == _T('$'))
+                    tmp++;
+
+                if (tmp - off <= 2)
+                    PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER | MD_MARK_POTENTIAL_CLOSER);
+                off = tmp;
+                continue;
+            }
+
+            /* Turn non-trivial whitespace into single space.
+             * (A lone ' ' character is trivial and gets no mark.) */
+            if(ISWHITESPACE_(ch)) {
+                OFF tmp = off+1;
+
+                while(tmp < line_end && ISWHITESPACE(tmp))
+                    tmp++;
+
+                if(tmp - off > 1 || ch != _T(' '))
+                    PUSH_MARK(ch, off, tmp, MD_MARK_RESOLVED);
+
+                off = tmp;
+                continue;
+            }
+
+            /* NULL character. */
+            if(ch == _T('\0')) {
+                PUSH_MARK(ch, off, off+1, MD_MARK_RESOLVED);
+                off++;
+                continue;
+            }
+
+            off++;
+        }
+    }
+
+    /* Add a dummy mark at the end of the mark vector to simplify
+     * process_inlines(). It uses the character code 127 and is placed at
+     * ctx->size, already flagged as resolved. */
+    PUSH_MARK(127, ctx->size, ctx->size, MD_MARK_RESOLVED);
+
+abort:
+    return ret;
+}
+
+static void
+md_analyze_bracket(MD_CTX* ctx, int mark_index)
+{
+    /* We cannot really resolve links here as for that we would need
+     * more context. E.g. a following pair of brackets (reference link),
+     * or enclosing pair of brackets (if the inner is the link, the outer
+     * one cannot be.)
+     *
+     * Therefore we here only construct a list of resolved '[' ']' pairs
+     * ordered by position of the closer. This allows us to analyze what is
+     * or is not link in the right order, from inside to outside in case
+     * of nested brackets.
+     *
+     * The resolving itself is deferred into md_resolve_links().
+     *
+     * In short: a mark flagged MD_MARK_POTENTIAL_OPENER is only appended to
+     * BRACKET_OPENERS; any other mark is paired with the current tail of
+     * that chain (the most recently appended opener), if there is one.
+     */
+
+    MD_MARK* mark = &ctx->marks[mark_index];
+
+    if(mark->flags & MD_MARK_POTENTIAL_OPENER) {
+        md_mark_chain_append(ctx, &BRACKET_OPENERS, mark_index);
+        return;
+    }
+
+    if(BRACKET_OPENERS.tail >= 0) {
+        /* Pop the opener from the chain. */
+        int opener_index = BRACKET_OPENERS.tail;
+        MD_MARK* opener = &ctx->marks[opener_index];
+        if(opener->prev >= 0)
+            ctx->marks[opener->prev].next = -1;
+        else
+            BRACKET_OPENERS.head = -1;
+        BRACKET_OPENERS.tail = opener->prev;
+
+        /* Interconnect the opener and closer. */
+        opener->next = mark_index;
+        mark->prev = opener_index;
+
+        /* Add the pair into chain of potential links for md_resolve_links().
+         * Note we misuse opener->prev for this as opener->next points to its
+         * closer. */
+        if(ctx->unresolved_link_tail >= 0)
+            ctx->marks[ctx->unresolved_link_tail].prev = opener_index;
+        else
+            ctx->unresolved_link_head = opener_index;
+        ctx->unresolved_link_tail = opener_index;
+        opener->prev = -1;      /* This pair is now the end of that chain. */
+    }
+}
+
+/* Forward declaration. */
+static void md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
+                                     int mark_beg, int mark_end);
+
+static int
+md_resolve_links(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
+{
+    int opener_index = ctx->unresolved_link_head;
+    OFF last_link_beg = 0;
+    OFF last_link_end = 0;
+    OFF last_img_beg = 0;
+    OFF last_img_end = 0;
+
+    while(opener_index >= 0) {
+        MD_MARK* opener = &ctx->marks[opener_index];
+        int closer_index = opener->next;
+        MD_MARK* closer = &ctx->marks[closer_index];
+        int next_index = opener->prev;  /* Next pair in the chain built by md_analyze_bracket(). */
+        MD_MARK* next_opener;
+        MD_MARK* next_closer;
+        MD_LINK_ATTR attr;
+        int is_link = FALSE;
+
+        if(next_index >= 0) {
+            next_opener = &ctx->marks[next_index];
+            next_closer = &ctx->marks[next_opener->next];
+        } else {
+            next_opener = NULL;
+            next_closer = NULL;
+        }
+
+        /* If nested ("[ [ ] ]"), we need to make sure that:
+         *   - The outer does not end inside of (...) belonging to the inner.
+         *   - The outer cannot be link if the inner is link (i.e. not image).
+ * + * (Note we here analyze from inner to outer as the marks are ordered + * by closer->beg.) + */ + if((opener->beg < last_link_beg && closer->end < last_link_end) || + (opener->beg < last_img_beg && closer->end < last_img_end) || + (opener->beg < last_link_end && opener->ch == '[')) + { + opener_index = next_index; + continue; + } + + /* Recognize and resolve wiki links. + * Wiki-links maybe '[[destination]]' or '[[destination|label]]'. + */ + if ((ctx->parser.flags & MD_FLAG_WIKILINKS) && + (opener->end - opener->beg == 1) && /* not image */ + next_opener != NULL && /* double '[' opener */ + next_opener->ch == '[' && + (next_opener->beg == opener->beg - 1) && + (next_opener->end - next_opener->beg == 1) && + next_closer != NULL && /* double ']' closer */ + next_closer->ch == ']' && + (next_closer->beg == closer->beg + 1) && + (next_closer->end - next_closer->beg == 1)) + { + MD_MARK* delim = NULL; + int delim_index; + OFF dest_beg, dest_end; + + is_link = TRUE; + + /* We don't allow destination to be longer then 100 characters. + * Lets scan to see whether there is '|'. (If not then the whole + * wiki-link has to be below the 100 characters.) */ + delim_index = opener_index + 1; + while(delim_index < closer_index) { + MD_MARK* m = &ctx->marks[delim_index]; + if(m->ch == '|') { + delim = m; + break; + } + if(m->ch != 'D' && m->beg - opener->end > 100) + break; + delim_index++; + } + dest_beg = opener->end; + dest_end = (delim != NULL) ? delim->beg : closer->beg; + if(dest_end - dest_beg == 0 || dest_end - dest_beg > 100) + is_link = FALSE; + + /* There may not be any new line in the destination. 
*/ + if(is_link) { + OFF off; + for(off = dest_beg; off < dest_end; off++) { + if(ISNEWLINE(off)) { + is_link = FALSE; + break; + } + } + } + + if(is_link) { + if(delim != NULL) { + if(delim->end < closer->beg) { + opener->end = delim->beg; + } else { + /* The pipe is just before the closer: [[foo|]] */ + closer->beg = delim->beg; + delim = NULL; + } + } + + opener->beg = next_opener->beg; + opener->next = closer_index; + opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED; + + closer->end = next_closer->end; + closer->prev = opener_index; + closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED; + + last_link_beg = opener->beg; + last_link_end = closer->end; + + if(delim != NULL) { + delim->flags |= MD_MARK_RESOLVED; + md_rollback(ctx, opener_index, delim_index, MD_ROLLBACK_ALL); + md_analyze_link_contents(ctx, lines, n_lines, opener_index+1, closer_index); + } else { + md_rollback(ctx, opener_index, closer_index, MD_ROLLBACK_ALL); + } + + opener_index = next_opener->prev; + continue; + } + } + + if(next_opener != NULL && next_opener->beg == closer->end) { + if(next_closer->beg > closer->end + 1) { + /* Might be full reference link. */ + is_link = md_is_link_reference(ctx, lines, n_lines, next_opener->beg, next_closer->end, &attr); + } else { + /* Might be shortcut reference link. */ + is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr); + } + + if(is_link < 0) + return -1; + + if(is_link) { + /* Eat the 2nd "[...]". */ + closer->end = next_closer->end; + } + } else { + if(closer->end < ctx->size && CH(closer->end) == _T('(')) { + /* Might be inline link. */ + OFF inline_link_end = UINT_MAX; + + is_link = md_is_inline_link_spec(ctx, lines, n_lines, closer->end, &inline_link_end, &attr); + if(is_link < 0) + return -1; + + /* Check the closing ')' is not inside an already resolved range + * (i.e. a range with a higher priority), e.g. a code span. 
*/ + if(is_link) { + int i = closer_index + 1; + + while(i < ctx->n_marks) { + MD_MARK* mark = &ctx->marks[i]; + + if(mark->beg >= inline_link_end) + break; + if((mark->flags & (MD_MARK_OPENER | MD_MARK_RESOLVED)) == (MD_MARK_OPENER | MD_MARK_RESOLVED)) { + if(ctx->marks[mark->next].beg >= inline_link_end) { + /* Cancel the link status. */ + if(!IS_INPUT_STR(attr.title)) + free(attr.title); + is_link = FALSE; + break; + } + + i = mark->next + 1; + } else { + i++; + } + } + } + + if(is_link) { + /* Eat the "(...)" */ + closer->end = inline_link_end; + } + } + + if(!is_link) { + /* Might be collapsed reference link. */ + is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr); + if(is_link < 0) + return -1; + } + } + + if(is_link) { + /* Resolve the brackets as a link. */ + opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED; + closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED; + + /* If it is a link, we store the destination and title in the two + * dummy marks after the opener. */ + MD_ASSERT(ctx->marks[opener_index+1].ch == 'D'); + ctx->marks[opener_index+1].beg = attr.dest_beg; + ctx->marks[opener_index+1].end = attr.dest_end; + + MD_ASSERT(ctx->marks[opener_index+2].ch == 'D'); + md_mark_store_ptr(ctx, opener_index+2, attr.title); + /* The title might or might not have been allocated for us. */ + if(attr.title_needs_free) + md_mark_chain_append(ctx, &PTR_CHAIN, opener_index+2); + ctx->marks[opener_index+2].prev = attr.title_size; + + if(opener->ch == '[') { + last_link_beg = opener->beg; + last_link_end = closer->end; + } else { + last_img_beg = opener->beg; + last_img_end = closer->end; + } + + md_analyze_link_contents(ctx, lines, n_lines, opener_index+1, closer_index); + } + + opener_index = next_index; + } + + return 0; +} + +/* Analyze whether the mark '&' starts a HTML entity. + * If so, update its flags as well as flags of corresponding closer ';'. 
*/
+static void
+md_analyze_entity(MD_CTX* ctx, int mark_index)
+{
+    MD_MARK* opener = &ctx->marks[mark_index];
+    MD_MARK* closer;
+    OFF off;
+
+    /* Cannot be entity if there is no closer as the next mark.
+     * (Any other mark between would mean strange character which cannot be
+     * part of the entity.)
+     *
+     * So we can do all the work on '&' and do not call this later for the
+     * closing mark ';'.
+     */
+    if(mark_index + 1 >= ctx->n_marks)
+        return;
+    closer = &ctx->marks[mark_index+1];
+    if(closer->ch != ';')
+        return;
+
+    if(md_is_entity(ctx, opener->beg, closer->end, &off)) {
+        MD_ASSERT(off == closer->end);
+
+        md_resolve_range(ctx, NULL, mark_index, mark_index+1);
+        opener->end = closer->end;      /* The opener now spans the whole entity. */
+    }
+}
+
+static void
+md_analyze_table_cell_boundary(MD_CTX* ctx, int mark_index)
+{   /* Flag the '|' mark as resolved, append it to the TABLECELLBOUNDARIES
+     * chain and count it in ctx->n_table_cell_boundaries. */
+    MD_MARK* mark = &ctx->marks[mark_index];
+    mark->flags |= MD_MARK_RESOLVED;
+
+    md_mark_chain_append(ctx, &TABLECELLBOUNDARIES, mark_index);
+    ctx->n_table_cell_boundaries++;
+}
+
+/* Split a longer mark into two. The new mark takes the given count of
+ * characters. May only be called if an adequate number of dummy 'D' marks
+ * follows.
+ *
+ * The new mark takes the last n characters of the original one (which gets
+ * shortened by n). It is a copy of the original mark stored into the dummy
+ * at index mark_index + (original length - n); that index is returned.
+ */
+static int
+md_split_emph_mark(MD_CTX* ctx, int mark_index, SZ n)
+{
+    MD_MARK* mark = &ctx->marks[mark_index];
+    int new_mark_index = mark_index + (mark->end - mark->beg - n);
+    MD_MARK* dummy = &ctx->marks[new_mark_index];
+
+    MD_ASSERT(mark->end - mark->beg > n);
+    MD_ASSERT(dummy->ch == 'D');
+
+    memcpy(dummy, mark, sizeof(MD_MARK));
+    mark->end -= n;
+    dummy->beg = mark->end;
+
+    return new_mark_index;
+}
+
+static void
+md_analyze_emph(MD_CTX* ctx, int mark_index)
+{
+    MD_MARK* mark = &ctx->marks[mark_index];
+    MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
+
+    /* If we can be a closer, try to resolve with the preceding opener.
*/
+    if(mark->flags & MD_MARK_POTENTIAL_CLOSER) {
+        MD_MARK* opener = NULL;
+        int opener_index;       /* Only meaningful when opener != NULL. */
+
+        if(mark->ch == _T('*')) {
+            MD_MARKCHAIN* opener_chains[6];
+            int i, n_opener_chains;
+            unsigned flags = mark->flags;
+
+            /* Apply the "rule of three": collect only those opener chains
+             * this closer may match, depending on its MD_MARK_EMPH_MOD3_MASK
+             * bits and on its MD_MARK_EMPH_INTRAWORD flag. */
+            n_opener_chains = 0;
+            opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_0;
+            if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
+                opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_1;
+            if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
+                opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_2;
+            opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_0;
+            if(!(flags & MD_MARK_EMPH_INTRAWORD) || (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
+                opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_1;
+            if(!(flags & MD_MARK_EMPH_INTRAWORD) || (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
+                opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_2;
+
+            /* Opener is the most recent mark from the allowed chains,
+             * i.e. the chain tail with the greatest end offset. */
+            for(i = 0; i < n_opener_chains; i++) {
+                if(opener_chains[i]->tail >= 0) {
+                    int tmp_index = opener_chains[i]->tail;
+                    MD_MARK* tmp_mark = &ctx->marks[tmp_index];
+                    if(opener == NULL || tmp_mark->end > opener->end) {
+                        opener_index = tmp_index;
+                        opener = tmp_mark;
+                    }
+                }
+            }
+        } else {
+            /* Simple emph. mark: just take the tail of our own chain. */
+            if(chain->tail >= 0) {
+                opener_index = chain->tail;
+                opener = &ctx->marks[opener_index];
+            }
+        }
+
+        /* Resolve, if we have found matching opener. */
+        if(opener != NULL) {
+            SZ opener_size = opener->end - opener->beg;
+            SZ closer_size = mark->end - mark->beg;
+            MD_MARKCHAIN* opener_chain = md_mark_chain(ctx, opener_index);
+
+            /* Make both marks equally long: split off the surplus of the
+             * longer one. A split-off opener part is appended to the chain
+             * and becomes the opener we resolve with. */
+            if(opener_size > closer_size) {
+                opener_index = md_split_emph_mark(ctx, opener_index, closer_size);
+                md_mark_chain_append(ctx, opener_chain, opener_index);
+            } else if(opener_size < closer_size) {
+                md_split_emph_mark(ctx, mark_index, closer_size - opener_size);
+            }
+
+            md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING);
+            md_resolve_range(ctx, opener_chain, opener_index, mark_index);
+            return;
+        }
+    }
+
+    /* If we could not resolve as closer, we may yet be an opener. */
+    if(mark->flags & MD_MARK_POTENTIAL_OPENER)
+        md_mark_chain_append(ctx, chain, mark_index);
+}
+
+static void
+md_analyze_tilde(MD_CTX* ctx, int mark_index)
+{
+    MD_MARK* mark = &ctx->marks[mark_index];
+    MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
+
+    /* We attempt to be Github Flavored Markdown compatible here. GFM accepts
+     * only tildes sequences of length 1 and 2, and the length of the opener
+     * and closer has to match.
+     *
+     * NOTE(review): the lengths are not compared in this function; the
+     * closer is simply resolved with the head of its chain. Presumably the
+     * matching is meant to be ensured by md_mark_chain() - TODO confirm. */
+
+    if((mark->flags & MD_MARK_POTENTIAL_CLOSER) && chain->head >= 0) {
+        int opener_index = chain->head;
+
+        md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING);
+        md_resolve_range(ctx, chain, opener_index, mark_index);
+        return;
+    }
+
+    if(mark->flags & MD_MARK_POTENTIAL_OPENER)
+        md_mark_chain_append(ctx, chain, mark_index);
+}
+
+static void
+md_analyze_dollar(MD_CTX* ctx, int mark_index)
+{
+    /* This should mimic the way inline equations work in LaTeX, so there
+     * can only ever be one item in the chain (i.e. the dollars can't be
+     * nested). This is basically the same as the md_analyze_tilde function,
+     * except that we require matching openers and closers to be of the same
+     * length.
+ * + * E.g.: $abc$$def$$ => abc (display equation) def (end equation) */ + if(DOLLAR_OPENERS.head >= 0) { + /* If the potential closer has a non-matching number of $, discard */ + MD_MARK* open = &ctx->marks[DOLLAR_OPENERS.head]; + MD_MARK* close = &ctx->marks[mark_index]; + + int opener_index = DOLLAR_OPENERS.head; + md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_ALL); + if (open->end - open->beg == close->end - close->beg) { + /* We are the matching closer */ + md_resolve_range(ctx, &DOLLAR_OPENERS, opener_index, mark_index); + } else { + /* We don't match the opener, so discard old opener and insert as opener */ + md_mark_chain_append(ctx, &DOLLAR_OPENERS, mark_index); + } + } else { + /* No unmatched openers, so we are opener */ + md_mark_chain_append(ctx, &DOLLAR_OPENERS, mark_index); + } +} + +static void +md_analyze_permissive_url_autolink(MD_CTX* ctx, int mark_index) +{ + MD_MARK* opener = &ctx->marks[mark_index]; + int closer_index = mark_index + 1; + MD_MARK* closer = &ctx->marks[closer_index]; + MD_MARK* next_resolved_mark; + OFF off = opener->end; + int n_dots = FALSE; + int has_underscore_in_last_seg = FALSE; + int has_underscore_in_next_to_last_seg = FALSE; + int n_opened_parenthesis = 0; + + /* Check for domain. */ + while(off < ctx->size) { + if(ISALNUM(off) || CH(off) == _T('-')) { + off++; + } else if(CH(off) == _T('.')) { + /* We must see at least one period. */ + n_dots++; + has_underscore_in_next_to_last_seg = has_underscore_in_last_seg; + has_underscore_in_last_seg = FALSE; + off++; + } else if(CH(off) == _T('_')) { + /* No underscore may be present in the last two domain segments. */ + has_underscore_in_last_seg = TRUE; + off++; + } else { + break; + } + } + if(off > opener->end && CH(off-1) == _T('.')) { + off--; + n_dots--; + } + if(off <= opener->end || n_dots == 0 || has_underscore_in_next_to_last_seg || has_underscore_in_last_seg) + return; + + /* Check for path. 
*/ + next_resolved_mark = closer + 1; + while(next_resolved_mark->ch == 'D' || !(next_resolved_mark->flags & MD_MARK_RESOLVED)) + next_resolved_mark++; + while(off < next_resolved_mark->beg && CH(off) != _T('<') && !ISWHITESPACE(off) && !ISNEWLINE(off)) { + /* Parenthesis must be balanced. */ + if(CH(off) == _T('(')) { + n_opened_parenthesis++; + } else if(CH(off) == _T(')')) { + if(n_opened_parenthesis > 0) + n_opened_parenthesis--; + else + break; + } + + off++; + } + /* These cannot be last char In such case they are more likely normal + * punctuation. */ + if(ISANYOF(off-1, _T("?!.,:*_~"))) + off--; + + /* Ok. Lets call it auto-link. Adapt opener and create closer to zero + * length so all the contents becomes the link text. */ + MD_ASSERT(closer->ch == 'D'); + opener->end = opener->beg; + closer->ch = opener->ch; + closer->beg = off; + closer->end = off; + md_resolve_range(ctx, NULL, mark_index, closer_index); +} + +/* The permissive autolinks do not have to be enclosed in '<' '>' but we + * instead impose stricter rules what is understood as an e-mail address + * here. Actually any non-alphanumeric characters with exception of '.' + * are prohibited both in username and after '@'. */ +static void +md_analyze_permissive_email_autolink(MD_CTX* ctx, int mark_index) +{ + MD_MARK* opener = &ctx->marks[mark_index]; + int closer_index; + MD_MARK* closer; + OFF beg = opener->beg; + OFF end = opener->end; + int dot_count = 0; + + MD_ASSERT(CH(beg) == _T('@')); + + /* Scan for name before '@'. */ + while(beg > 0 && (ISALNUM(beg-1) || ISANYOF(beg-1, _T(".-_+")))) + beg--; + + /* Scan for domain after '@'. */ + while(end < ctx->size && (ISALNUM(end) || ISANYOF(end, _T(".-_")))) { + if(CH(end) == _T('.')) + dot_count++; + end++; + } + if(CH(end-1) == _T('.')) { /* Final '.' not part of it. */ + dot_count--; + end--; + } + else if(ISANYOF2(end-1, _T('-'), _T('_'))) /* These are forbidden at the end. 
*/
+        return;
+    if(CH(end-1) == _T('@') || dot_count == 0)
+        return;
+
+    /* Ok. Let's call it auto-link. Adapt opener and create closer to zero
+     * length so all the contents becomes the link text. */
+    closer_index = mark_index + 1;
+    closer = &ctx->marks[closer_index];
+    MD_ASSERT(closer->ch == 'D');
+
+    opener->beg = beg;
+    opener->end = beg;
+    closer->ch = opener->ch;
+    closer->beg = end;
+    closer->end = end;
+    md_resolve_range(ctx, NULL, mark_index, closer_index);
+}
+
+static inline void
+md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
+                 int mark_beg, int mark_end, const CHAR* mark_chars)
+{   /* Walk the marks with indexes mark_beg <= i < mark_end and pass every
+     * unresolved mark whose character is listed in mark_chars to its
+     * analyzer. (lines and n_lines are not used in this function.) */
+    int i = mark_beg;
+
+    while(i < mark_end) {
+        MD_MARK* mark = &ctx->marks[i];
+
+        /* Skip resolved spans. For a resolved opener we jump right behind
+         * its closer, skipping also everything in between. */
+        if(mark->flags & MD_MARK_RESOLVED) {
+            if(mark->flags & MD_MARK_OPENER) {
+                MD_ASSERT(i < mark->next);
+                i = mark->next + 1;
+            } else {
+                i++;
+            }
+            continue;
+        }
+
+        /* Skip marks we do not want to deal with. */
+        if(!ISANYOF_(mark->ch, mark_chars)) {
+            i++;
+            continue;
+        }
+
+        /* Analyze the mark. */
+        switch(mark->ch) {
+            case '[':   /* Pass through. */
+            case '!':   /* Pass through. */
+            case ']':   md_analyze_bracket(ctx, i); break;
+            case '&':   md_analyze_entity(ctx, i); break;
+            case '|':   md_analyze_table_cell_boundary(ctx, i); break;
+            case '_':   /* Pass through. */
+            case '*':   md_analyze_emph(ctx, i); break;
+            case '~':   md_analyze_tilde(ctx, i); break;
+            case '$':   md_analyze_dollar(ctx, i); break;
+            case '.':   /* Pass through. */
+            case ':':   md_analyze_permissive_url_autolink(ctx, i); break;
+            case '@':   md_analyze_permissive_email_autolink(ctx, i); break;
+        }
+
+        i++;
+    }
+}
+
+/* Analyze marks (build ctx->marks).
+ *
+ * Returns the value of the last MD_CHECK()-ed call; table_mode is passed to
+ * md_collect_marks() and selects the table cell boundary analysis below. */
+static int
+md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
+{
+    int ret;
+
+    /* Reset the previously collected stack of marks. */
+    ctx->n_marks = 0;
+
+    /* Collect all marks.
*/
+    MD_CHECK(md_collect_marks(ctx, lines, n_lines, table_mode));
+
+    /* We analyze marks in few groups to handle their precedence. */
+    /* (1) Entities; code spans; autolinks; raw HTML.
+     * (Only the '&' marks are analyzed by this call.) */
+    md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("&"));
+
+    /* (2) Links. */
+    md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("[]!"));
+    MD_CHECK(md_resolve_links(ctx, lines, n_lines));
+    BRACKET_OPENERS.head = -1;
+    BRACKET_OPENERS.tail = -1;
+    ctx->unresolved_link_head = -1;
+    ctx->unresolved_link_tail = -1;
+
+    if(table_mode) {
+        /* (3) Analyze table cell boundaries.
+         * Note we reset TABLECELLBOUNDARIES chain prior to the call md_analyze_marks(),
+         * not after, because caller may need it. */
+        MD_ASSERT(n_lines == 1);
+        TABLECELLBOUNDARIES.head = -1;
+        TABLECELLBOUNDARIES.tail = -1;
+        ctx->n_table_cell_boundaries = 0;
+        md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("|"));
+        return ret;     /* Step (4) is not performed in the table mode. */
+    }
+
+    /* (4) Emphasis and strong emphasis; permissive autolinks. */
+    md_analyze_link_contents(ctx, lines, n_lines, 0, ctx->n_marks);
+
+abort:
+    return ret;
+}
+
+static void
+md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
+                         int mark_beg, int mark_end)
+{   /* Analyze the '*', '_', '~', '$', '@', ':' and '.' marks in the given
+     * range of marks and then reset all the opener chains, so that no
+     * opener collected here can be matched outside of the range. */
+    int i;
+
+    md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("*_~$@:."));
+
+    for(i = OPENERS_CHAIN_FIRST; i <= OPENERS_CHAIN_LAST; i++) {
+        ctx->mark_chains[i].head = -1;
+        ctx->mark_chains[i].tail = -1;
+    }
+}
+
+static int
+md_enter_leave_span_a(MD_CTX* ctx, int enter, MD_SPANTYPE type,
+                      const CHAR* dest, SZ dest_size, int prohibit_escapes_in_dest,
+                      const CHAR* title, SZ title_size)
+{   /* Build the href and title attributes from dest and title and enter
+     * (enter != 0) or leave (enter == 0) the span of the given type with
+     * them. Both attribute builds are released before returning. */
+    MD_ATTRIBUTE_BUILD href_build = { 0 };
+    MD_ATTRIBUTE_BUILD title_build = { 0 };
+    MD_SPAN_A_DETAIL det;
+    int ret = 0;
+
+    /* Note we here rely on fact that MD_SPAN_A_DETAIL and
+     * MD_SPAN_IMG_DETAIL are binary-compatible. */
+    memset(&det, 0, sizeof(MD_SPAN_A_DETAIL));
+    MD_CHECK(md_build_attribute(ctx, dest, dest_size,
+                    (prohibit_escapes_in_dest ? MD_BUILD_ATTR_NO_ESCAPES : 0),
+                    &det.href, &href_build));
+    MD_CHECK(md_build_attribute(ctx, title, title_size, 0, &det.title, &title_build));
+
+    if(enter)
+        MD_ENTER_SPAN(type, &det);
+    else
+        MD_LEAVE_SPAN(type, &det);
+
+abort:
+    md_free_attribute(ctx, &href_build);
+    md_free_attribute(ctx, &title_build);
+    return ret;
+}
+
+static int
+md_enter_leave_span_wikilink(MD_CTX* ctx, int enter, const CHAR* target, SZ target_size)
+{   /* The same as md_enter_leave_span_a(), but for MD_SPAN_WIKILINK and
+     * its single target attribute. */
+    MD_ATTRIBUTE_BUILD target_build = { 0 };
+    MD_SPAN_WIKILINK_DETAIL det;
+    int ret = 0;
+
+    memset(&det, 0, sizeof(MD_SPAN_WIKILINK_DETAIL));
+    MD_CHECK(md_build_attribute(ctx, target, target_size, 0, &det.target, &target_build));
+
+    if (enter)
+        MD_ENTER_SPAN(MD_SPAN_WIKILINK, &det);
+    else
+        MD_LEAVE_SPAN(MD_SPAN_WIKILINK, &det);
+
+abort:
+    md_free_attribute(ctx, &target_build);
+    return ret;
+}
+
+
+/* Render the output, according to the analyzed ctx->marks. */
+static int
+md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
+{
+    MD_TEXTTYPE text_type;
+    const MD_LINE* line = lines;
+    MD_MARK* prev_mark = NULL;
+    MD_MARK* mark;
+    OFF off = lines[0].beg;
+    OFF end = lines[n_lines-1].end;
+    int enforce_hardbreak = 0;
+    int ret = 0;
+
+    /* Find first resolved mark. Note there is always at least one resolved
+     * mark, the dummy last one after the end of the latest line we actually
+     * never really reach. This saves us of a lot of special checks and cases
+     * in this function. */
+    mark = ctx->marks;
+    while(!(mark->flags & MD_MARK_RESOLVED))
+        mark++;
+
+    text_type = MD_TEXT_NORMAL;
+
+    while(1) {
+        /* Process the text up to the next mark or end-of-line. */
+        OFF tmp = (line->end < mark->beg ? line->end : mark->beg);
+        if(tmp > off) {
+            MD_TEXT(text_type, STR(off), tmp - off);
+            off = tmp;
+        }
+
+        /* If reached the mark, process it and move to next one. */
+        if(off >= mark->beg) {
+            switch(mark->ch) {
+                case '\\':      /* Backslash escape.
*/
+                    if(ISNEWLINE(mark->beg+1))
+                        enforce_hardbreak = 1;
+                    else
+                        MD_TEXT(text_type, STR(mark->beg+1), 1);
+                    break;
+
+                case ' ':       /* Non-trivial space. */
+                    MD_TEXT(text_type, _T(" "), 1);
+                    break;
+
+                case '`':       /* Code span. */
+                    if(mark->flags & MD_MARK_OPENER) {
+                        MD_ENTER_SPAN(MD_SPAN_CODE, NULL);
+                        text_type = MD_TEXT_CODE;
+                    } else {
+                        MD_LEAVE_SPAN(MD_SPAN_CODE, NULL);
+                        text_type = MD_TEXT_NORMAL;
+                    }
+                    break;
+
+                case '_':       /* Underline (or emphasis if we fall through). */
+                    if(ctx->parser.flags & MD_FLAG_UNDERLINE) {
+                        /* One MD_SPAN_U per each '_' character of the mark. */
+                        if(mark->flags & MD_MARK_OPENER) {
+                            while(off < mark->end) {
+                                MD_ENTER_SPAN(MD_SPAN_U, NULL);
+                                off++;
+                            }
+                        } else {
+                            while(off < mark->end) {
+                                MD_LEAVE_SPAN(MD_SPAN_U, NULL);
+                                off++;
+                            }
+                        }
+                        break;
+                    }
+                    /* Fall through. */
+
+                case '*':       /* Emphasis, strong emphasis. */
+                    /* An odd mark length yields one MD_SPAN_EM; every
+                     * remaining pair of characters one MD_SPAN_STRONG. The
+                     * closer emits them in the reverse order. */
+                    if(mark->flags & MD_MARK_OPENER) {
+                        if((mark->end - off) % 2) {
+                            MD_ENTER_SPAN(MD_SPAN_EM, NULL);
+                            off++;
+                        }
+                        while(off + 1 < mark->end) {
+                            MD_ENTER_SPAN(MD_SPAN_STRONG, NULL);
+                            off += 2;
+                        }
+                    } else {
+                        while(off + 1 < mark->end) {
+                            MD_LEAVE_SPAN(MD_SPAN_STRONG, NULL);
+                            off += 2;
+                        }
+                        if((mark->end - off) % 2) {
+                            MD_LEAVE_SPAN(MD_SPAN_EM, NULL);
+                            off++;
+                        }
+                    }
+                    break;
+
+                case '~':       /* Strikethrough. */
+                    if(mark->flags & MD_MARK_OPENER)
+                        MD_ENTER_SPAN(MD_SPAN_DEL, NULL);
+                    else
+                        MD_LEAVE_SPAN(MD_SPAN_DEL, NULL);
+                    break;
+
+                case '$':       /* Equation; an odd mark length means inline, even means display. */
+                    if(mark->flags & MD_MARK_OPENER) {
+                        MD_ENTER_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
+                        text_type = MD_TEXT_LATEXMATH;
+                    } else {
+                        MD_LEAVE_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
+                        text_type = MD_TEXT_NORMAL;
+                    }
+                    break;
+
+                case '[':       /* Link, wiki link, image. */
+                case '!':
+                case ']':
+                {
+                    const MD_MARK* opener = (mark->ch != ']' ? mark : &ctx->marks[mark->prev]);
+                    const MD_MARK* closer = &ctx->marks[opener->next];
+                    const MD_MARK* dest_mark;
+                    const MD_MARK* title_mark;
+
+                    /* Wiki link: both marks are at least two characters
+                     * long. The opener is longer than that if it covers
+                     * also the target (i.e. an explicit label follows). */
+                    if ((opener->ch == '[' && closer->ch == ']') &&
+                        opener->end - opener->beg >= 2 &&
+                        closer->end - closer->beg >= 2)
+                    {
+                        int has_label = (opener->end - opener->beg > 2);
+                        SZ target_sz;
+
+                        if(has_label)
+                            target_sz = opener->end - (opener->beg+2);
+                        else
+                            target_sz = closer->beg - opener->end;
+
+                        MD_CHECK(md_enter_leave_span_wikilink(ctx, (mark->ch != ']'),
+                                 has_label ? STR(opener->beg+2) : STR(opener->end),
+                                 target_sz));
+
+                        break;
+                    }
+
+                    /* Ordinary link or image: destination and title are kept
+                     * in the two dummy marks following the opener. */
+                    dest_mark = opener+1;
+                    MD_ASSERT(dest_mark->ch == 'D');
+                    title_mark = opener+2;
+                    MD_ASSERT(title_mark->ch == 'D');
+
+                    MD_CHECK(md_enter_leave_span_a(ctx, (mark->ch != ']'),
+                                (opener->ch == '!' ? MD_SPAN_IMG : MD_SPAN_A),
+                                STR(dest_mark->beg), dest_mark->end - dest_mark->beg, FALSE,
+                                md_mark_get_ptr(ctx, title_mark - ctx->marks), title_mark->prev));
+
+                    /* link/image closer may span multiple lines. */
+                    if(mark->ch == ']') {
+                        while(mark->end > line->end)
+                            line++;
+                    }
+
+                    break;
+                }
+
+                case '<':
+                case '>':       /* Autolink or raw HTML. */
+                    if(!(mark->flags & MD_MARK_AUTOLINK)) {
+                        /* Raw HTML. */
+                        if(mark->flags & MD_MARK_OPENER)
+                            text_type = MD_TEXT_HTML;
+                        else
+                            text_type = MD_TEXT_NORMAL;
+                        break;
+                    }
+                    /* Pass through, if auto-link. */
+
+                case '@':       /* Permissive e-mail autolink. */
+                case ':':       /* Permissive URL autolink. */
+                case '.':       /* Permissive WWW autolink. */
+                {
+                    MD_MARK* opener = ((mark->flags & MD_MARK_OPENER) ? mark : &ctx->marks[mark->prev]);
+                    MD_MARK* closer = &ctx->marks[opener->next];
+                    const CHAR* dest = STR(opener->end);
+                    SZ dest_size = closer->beg - opener->end;
+
+                    /* For permissive auto-links we do not know closer mark
+                     * position at the time of md_collect_marks(), therefore
+                     * it can be out-of-order in ctx->marks[].
+                     *
+                     * With this flag, we make sure that we output the closer
+                     * only if we processed the opener. */
+                    if(mark->flags & MD_MARK_OPENER)
+                        closer->flags |= MD_MARK_VALIDPERMISSIVEAUTOLINK;
+
+                    if(opener->ch == '@' || opener->ch == '.') {
+                        dest_size += 7;         /* Room for the 7-character scheme prefix. */
+                        MD_TEMP_BUFFER(dest_size * sizeof(CHAR));
+                        memcpy(ctx->buffer,
+                                (opener->ch == '@' ? _T("mailto:") : _T("http://")),
+                                7 * sizeof(CHAR));
+                        memcpy(ctx->buffer + 7, dest, (dest_size-7) * sizeof(CHAR));
+                        dest = ctx->buffer;
+                    }
+
+                    if(closer->flags & MD_MARK_VALIDPERMISSIVEAUTOLINK)
+                        MD_CHECK(md_enter_leave_span_a(ctx, (mark->flags & MD_MARK_OPENER),
+                                    MD_SPAN_A, dest, dest_size, TRUE, NULL, 0));
+                    break;
+                }
+
+                case '&':       /* Entity. */
+                    MD_TEXT(MD_TEXT_ENTITY, STR(mark->beg), mark->end - mark->beg);
+                    break;
+
+                case '\0':      /* Size 1: the empty literal's terminator is the text. */
+                    MD_TEXT(MD_TEXT_NULLCHAR, _T(""), 1);
+                    break;
+
+                case 127:       /* The dummy mark terminating ctx->marks[]. */
+                    goto abort;
+            }
+
+            off = mark->end;
+
+            /* Move to next resolved mark. (Resolved marks beginning before
+             * the current offset are skipped as well.) */
+            prev_mark = mark;
+            mark++;
+            while(!(mark->flags & MD_MARK_RESOLVED) || mark->beg < off)
+                mark++;
+        }
+
+        /* If reached end of line, move to next one. */
+        if(off >= line->end) {
+            /* If it is the last line, we are done. */
+            if(off >= end)
+                break;
+
+            if(text_type == MD_TEXT_CODE || text_type == MD_TEXT_LATEXMATH) {
+                OFF tmp;
+
+                MD_ASSERT(prev_mark != NULL);
+                MD_ASSERT(ISANYOF2_(prev_mark->ch, '`', '$') && (prev_mark->flags & MD_MARK_OPENER));
+                MD_ASSERT(ISANYOF2_(mark->ch, '`', '$') && (mark->flags & MD_MARK_CLOSER));
+
+                /* Inside a code span, trailing line whitespace has to be
+                 * outputted. */
+                tmp = off;
+                while(off < ctx->size && ISBLANK(off))
+                    off++;
+                if(off > tmp)
+                    MD_TEXT(text_type, STR(tmp), off-tmp);
+
+                /* and new lines are transformed into single spaces. */
+                if(prev_mark->end < off && off < mark->beg)
+                    MD_TEXT(text_type, _T(" "), 1);
+            } else if(text_type == MD_TEXT_HTML) {
+                /* Inside raw HTML, we output the new line verbatim, including
+                 * any trailing spaces.
*/ + OFF tmp = off; + + while(tmp < end && ISBLANK(tmp)) + tmp++; + if(tmp > off) + MD_TEXT(MD_TEXT_HTML, STR(off), tmp - off); + MD_TEXT(MD_TEXT_HTML, _T("\n"), 1); + } else { + /* Output soft or hard line break. */ + MD_TEXTTYPE break_type = MD_TEXT_SOFTBR; + + if(text_type == MD_TEXT_NORMAL) { + if(enforce_hardbreak) + break_type = MD_TEXT_BR; + else if((CH(line->end) == _T(' ') && CH(line->end+1) == _T(' '))) + break_type = MD_TEXT_BR; + } + + MD_TEXT(break_type, _T("\n"), 1); + } + + /* Move to the next line. */ + line++; + off = line->beg; + + enforce_hardbreak = 0; + } + } + +abort: + return ret; +} + + +/*************************** + *** Processing Tables *** + ***************************/ + +static void +md_analyze_table_alignment(MD_CTX* ctx, OFF beg, OFF end, MD_ALIGN* align, int n_align) +{ + static const MD_ALIGN align_map[] = { MD_ALIGN_DEFAULT, MD_ALIGN_LEFT, MD_ALIGN_RIGHT, MD_ALIGN_CENTER }; + OFF off = beg; + + while(n_align > 0) { + int index = 0; /* index into align_map[] */ + + while(CH(off) != _T('-')) + off++; + if(off > beg && CH(off-1) == _T(':')) + index |= 1; + while(off < end && CH(off) == _T('-')) + off++; + if(off < end && CH(off) == _T(':')) + index |= 2; + + *align = align_map[index]; + align++; + n_align--; + } + +} + +/* Forward declaration. 
*/
+static int md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines);
+
+static int
+md_process_table_cell(MD_CTX* ctx, MD_BLOCKTYPE cell_type, MD_ALIGN align, OFF beg, OFF end)
+{   /* Output one table cell: trim whitespace at both ends of the range
+     * given by beg and end, and process the rest as a one-line normal
+     * block nested in a block of the given cell type. */
+    MD_LINE line;
+    MD_BLOCK_TD_DETAIL det;
+    int ret = 0;
+
+    while(beg < end && ISWHITESPACE(beg))
+        beg++;
+    while(end > beg && ISWHITESPACE(end-1))
+        end--;
+
+    det.align = align;
+    line.beg = beg;
+    line.end = end;
+
+    MD_ENTER_BLOCK(cell_type, &det);
+    MD_CHECK(md_process_normal_block_contents(ctx, &line, 1));
+    MD_LEAVE_BLOCK(cell_type, &det);
+
+abort:
+    return ret;
+}
+
+static int
+md_process_table_row(MD_CTX* ctx, MD_BLOCKTYPE cell_type, OFF beg, OFF end,
+                     const MD_ALIGN* align, int col_count)
+{   /* Output one table row as MD_BLOCK_TR with exactly col_count cells of
+     * the given cell type; align[] provides the alignment of each column. */
+    MD_LINE line;
+    OFF* pipe_offs = NULL;
+    int i, j, k, n;
+    int ret = 0;
+
+    line.beg = beg;
+    line.end = end;
+
+    /* Break the line into table cells by identifying pipe characters who
+     * form the cell boundary. */
+    MD_CHECK(md_analyze_inlines(ctx, &line, 1, TRUE));
+
+    /* We have to remember the cell boundaries in local buffer because
+     * ctx->marks[] shall be reused during cell contents processing.
+     *
+     * The buffer holds the row beginning, the end offset of every boundary
+     * mark, and finally end+1. Hence the "+ 2" below, and hence a cell
+     * spans from pipe_offs[i] to pipe_offs[i+1]-1. */
+    n = ctx->n_table_cell_boundaries + 2;
+    pipe_offs = (OFF*) malloc(n * sizeof(OFF));
+    if(pipe_offs == NULL) {
+        MD_LOG("malloc() failed.");
+        ret = -1;
+        goto abort;
+    }
+    j = 0;
+    pipe_offs[j++] = beg;
+    for(i = TABLECELLBOUNDARIES.head; i >= 0; i = ctx->marks[i].next) {
+        MD_MARK* mark = &ctx->marks[i];
+        pipe_offs[j++] = mark->end;
+    }
+    pipe_offs[j++] = end+1;
+
+    /* Process cells. (Empty ranges are skipped; cells exceeding col_count
+     * are ignored.) */
+    MD_ENTER_BLOCK(MD_BLOCK_TR, NULL);
+    k = 0;
+    for(i = 0; i < j-1 && k < col_count; i++) {
+        if(pipe_offs[i] < pipe_offs[i+1]-1)
+            MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], pipe_offs[i], pipe_offs[i+1]-1));
+    }
+    /* Make sure we call enough table cells even if the current table contains
+     * too few of them. */
+    while(k < col_count)
+        MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], 0, 0));
+    MD_LEAVE_BLOCK(MD_BLOCK_TR, NULL);
+
+abort:
+    free(pipe_offs);
+
+    /* Free any temporary memory blocks stored within some dummy marks. */
+    for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next)
+        free(md_mark_get_ptr(ctx, i));
+    PTR_CHAIN.head = -1;
+    PTR_CHAIN.tail = -1;
+
+    return ret;
+}
+
+static int
+md_process_table_block_contents(MD_CTX* ctx, int col_count, const MD_LINE* lines, int n_lines)
+{   /* Output the whole table: lines[0] as the MD_BLOCK_THEAD row, lines[1]
+     * only as the source of column alignments, and all the following lines
+     * as MD_BLOCK_TBODY rows. */
+    MD_ALIGN* align;
+    int i;
+    int ret = 0;
+
+    /* At least two lines have to be present: The column headers and the line
+     * with the underlines. */
+    MD_ASSERT(n_lines >= 2);
+
+    align = malloc(col_count * sizeof(MD_ALIGN));
+    if(align == NULL) {
+        MD_LOG("malloc() failed.");
+        ret = -1;
+        goto abort;
+    }
+
+    md_analyze_table_alignment(ctx, lines[1].beg, lines[1].end, align, col_count);
+
+    MD_ENTER_BLOCK(MD_BLOCK_THEAD, NULL);
+    MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TH,
+                        lines[0].beg, lines[0].end, align, col_count));
+    MD_LEAVE_BLOCK(MD_BLOCK_THEAD, NULL);
+
+    MD_ENTER_BLOCK(MD_BLOCK_TBODY, NULL);
+    for(i = 2; i < n_lines; i++) {
+        MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TD,
+                        lines[i].beg, lines[i].end, align, col_count));
+    }
+    MD_LEAVE_BLOCK(MD_BLOCK_TBODY, NULL);
+
+abort:
+    free(align);
+    return ret;
+}
+
+
+/**************************
+ ***  Processing Block  ***
+ **************************/
+
+#define MD_BLOCK_CONTAINER_OPENER   0x01
+#define MD_BLOCK_CONTAINER_CLOSER   0x02
+#define MD_BLOCK_CONTAINER          (MD_BLOCK_CONTAINER_OPENER | MD_BLOCK_CONTAINER_CLOSER)
+#define MD_BLOCK_LOOSE_LIST         0x04
+#define MD_BLOCK_SETEXT_HEADER      0x08
+
+struct MD_BLOCK_tag {
+    MD_BLOCKTYPE type  :  8;
+    unsigned flags     :  8;    /* Bitmask of the MD_BLOCK_xxxx flags above. */
+
+    /* MD_BLOCK_H:      Header level (1 - 6)
+     * MD_BLOCK_CODE:   Non-zero if fenced, zero if indented.
+     * MD_BLOCK_LI:     Task mark character (0 if not task list item, 'x', 'X' or ' ').
+     * MD_BLOCK_TABLE:  Column count (as determined by the table underline).
+ */ + unsigned data : 16; + + /* Leaf blocks: Count of lines (MD_LINE or MD_VERBATIMLINE) on the block. + * MD_BLOCK_LI: Task mark offset in the input doc. + * MD_BLOCK_OL: Start item number. + */ + unsigned n_lines; +}; + +struct MD_CONTAINER_tag { + CHAR ch; + unsigned is_loose : 8; + unsigned is_task : 8; + unsigned start; + unsigned mark_indent; + unsigned contents_indent; + OFF block_byte_off; + OFF task_mark_off; +}; + + +static int +md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines) +{ + int i; + int ret; + + MD_CHECK(md_analyze_inlines(ctx, lines, n_lines, FALSE)); + MD_CHECK(md_process_inlines(ctx, lines, n_lines)); + +abort: + /* Free any temporary memory blocks stored within some dummy marks. */ + for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next) + free(md_mark_get_ptr(ctx, i)); + PTR_CHAIN.head = -1; + PTR_CHAIN.tail = -1; + + return ret; +} + +static int +md_process_verbatim_block_contents(MD_CTX* ctx, MD_TEXTTYPE text_type, const MD_VERBATIMLINE* lines, int n_lines) +{ + static const CHAR indent_chunk_str[] = _T(" "); + static const SZ indent_chunk_size = SIZEOF_ARRAY(indent_chunk_str) - 1; + + int i; + int ret = 0; + + for(i = 0; i < n_lines; i++) { + const MD_VERBATIMLINE* line = &lines[i]; + int indent = line->indent; + + MD_ASSERT(indent >= 0); + + /* Output code indentation. */ + while(indent > (int) SIZEOF_ARRAY(indent_chunk_str)) { + MD_TEXT(text_type, indent_chunk_str, indent_chunk_size); + indent -= SIZEOF_ARRAY(indent_chunk_str); + } + if(indent > 0) + MD_TEXT(text_type, indent_chunk_str, indent); + + /* Output the code line itself. */ + MD_TEXT_INSECURE(text_type, STR(line->beg), line->end - line->beg); + + /* Enforce end-of-line. */ + MD_TEXT(text_type, _T("\n"), 1); + } + +abort: + return ret; +} + +static int +md_process_code_block_contents(MD_CTX* ctx, int is_fenced, const MD_VERBATIMLINE* lines, int n_lines) +{ + if(is_fenced) { + /* Skip the first line in case of fenced code: It is the fence. 
+ * (Only the starting fence is present due to logic in md_analyze_line().) */ + lines++; + n_lines--; + } else { + /* Ignore blank lines at start/end of indented code block. */ + while(n_lines > 0 && lines[0].beg == lines[0].end) { + lines++; + n_lines--; + } + while(n_lines > 0 && lines[n_lines-1].beg == lines[n_lines-1].end) { + n_lines--; + } + } + + if(n_lines == 0) + return 0; + + return md_process_verbatim_block_contents(ctx, MD_TEXT_CODE, lines, n_lines); +} + +static int +md_setup_fenced_code_detail(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_CODE_DETAIL* det, + MD_ATTRIBUTE_BUILD* info_build, MD_ATTRIBUTE_BUILD* lang_build) +{ + const MD_VERBATIMLINE* fence_line = (const MD_VERBATIMLINE*)(block + 1); + OFF beg = fence_line->beg; + OFF end = fence_line->end; + OFF lang_end; + CHAR fence_ch = CH(fence_line->beg); + int ret = 0; + + /* Skip the fence itself. */ + while(beg < ctx->size && CH(beg) == fence_ch) + beg++; + /* Trim initial spaces. */ + while(beg < ctx->size && CH(beg) == _T(' ')) + beg++; + + /* Trim trailing spaces. */ + while(end > beg && CH(end-1) == _T(' ')) + end--; + + /* Build info string attribute. */ + MD_CHECK(md_build_attribute(ctx, STR(beg), end - beg, 0, &det->info, info_build)); + + /* Build info string attribute. 
*/ + lang_end = beg; + while(lang_end < end && !ISWHITESPACE(lang_end)) + lang_end++; + MD_CHECK(md_build_attribute(ctx, STR(beg), lang_end - beg, 0, &det->lang, lang_build)); + + det->fence_char = fence_ch; + +abort: + return ret; +} + +static int +md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block) +{ + union { + MD_BLOCK_H_DETAIL header; + MD_BLOCK_CODE_DETAIL code; + } det; + MD_ATTRIBUTE_BUILD info_build; + MD_ATTRIBUTE_BUILD lang_build; + int is_in_tight_list; + int clean_fence_code_detail = FALSE; + int ret = 0; + + memset(&det, 0, sizeof(det)); + + if(ctx->n_containers == 0) + is_in_tight_list = FALSE; + else + is_in_tight_list = !ctx->containers[ctx->n_containers-1].is_loose; + + switch(block->type) { + case MD_BLOCK_H: + det.header.level = block->data; + break; + + case MD_BLOCK_CODE: + /* For fenced code block, we may need to set the info string. */ + if(block->data != 0) { + memset(&det.code, 0, sizeof(MD_BLOCK_CODE_DETAIL)); + clean_fence_code_detail = TRUE; + MD_CHECK(md_setup_fenced_code_detail(ctx, block, &det.code, &info_build, &lang_build)); + } + break; + + default: + /* Noop. */ + break; + } + + if(!is_in_tight_list || block->type != MD_BLOCK_P) + MD_ENTER_BLOCK(block->type, (void*) &det); + + /* Process the block contents accordingly to is type. 
*/ + switch(block->type) { + case MD_BLOCK_HR: + /* noop */ + break; + + case MD_BLOCK_CODE: + MD_CHECK(md_process_code_block_contents(ctx, (block->data != 0), + (const MD_VERBATIMLINE*)(block + 1), block->n_lines)); + break; + + case MD_BLOCK_HTML: + MD_CHECK(md_process_verbatim_block_contents(ctx, MD_TEXT_HTML, + (const MD_VERBATIMLINE*)(block + 1), block->n_lines)); + break; + + case MD_BLOCK_TABLE: + MD_CHECK(md_process_table_block_contents(ctx, block->data, + (const MD_LINE*)(block + 1), block->n_lines)); + break; + + default: + MD_CHECK(md_process_normal_block_contents(ctx, + (const MD_LINE*)(block + 1), block->n_lines)); + break; + } + + if(!is_in_tight_list || block->type != MD_BLOCK_P) + MD_LEAVE_BLOCK(block->type, (void*) &det); + +abort: + if(clean_fence_code_detail) { + md_free_attribute(ctx, &info_build); + md_free_attribute(ctx, &lang_build); + } + return ret; +} + +static int +md_process_all_blocks(MD_CTX* ctx) +{ + int byte_off = 0; + int ret = 0; + + /* ctx->containers now is not needed for detection of lists and list items + * so we reuse it for tracking what lists are loose or tight. We rely + * on the fact the vector is large enough to hold the deepest nesting + * level of lists. */ + ctx->n_containers = 0; + + while(byte_off < ctx->n_block_bytes) { + MD_BLOCK* block = (MD_BLOCK*)((char*)ctx->block_bytes + byte_off); + union { + MD_BLOCK_UL_DETAIL ul; + MD_BLOCK_OL_DETAIL ol; + MD_BLOCK_LI_DETAIL li; + } det; + + switch(block->type) { + case MD_BLOCK_UL: + det.ul.is_tight = (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE; + det.ul.mark = (CHAR) block->data; + break; + + case MD_BLOCK_OL: + det.ol.start = block->n_lines; + det.ol.is_tight = (block->flags & MD_BLOCK_LOOSE_LIST) ? 
FALSE : TRUE; + det.ol.mark_delimiter = (CHAR) block->data; + break; + + case MD_BLOCK_LI: + det.li.is_task = (block->data != 0); + det.li.task_mark = (CHAR) block->data; + det.li.task_mark_offset = (OFF) block->n_lines; + break; + + default: + /* noop */ + break; + } + + if(block->flags & MD_BLOCK_CONTAINER) { + if(block->flags & MD_BLOCK_CONTAINER_CLOSER) { + MD_LEAVE_BLOCK(block->type, &det); + + if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL || block->type == MD_BLOCK_QUOTE) + ctx->n_containers--; + } + + if(block->flags & MD_BLOCK_CONTAINER_OPENER) { + MD_ENTER_BLOCK(block->type, &det); + + if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL) { + ctx->containers[ctx->n_containers].is_loose = (block->flags & MD_BLOCK_LOOSE_LIST); + ctx->n_containers++; + } else if(block->type == MD_BLOCK_QUOTE) { + /* This causes that any text in a block quote, even if + * nested inside a tight list item, is wrapped with + *

        ...

        . */ + ctx->containers[ctx->n_containers].is_loose = TRUE; + ctx->n_containers++; + } + } + } else { + MD_CHECK(md_process_leaf_block(ctx, block)); + + if(block->type == MD_BLOCK_CODE || block->type == MD_BLOCK_HTML) + byte_off += block->n_lines * sizeof(MD_VERBATIMLINE); + else + byte_off += block->n_lines * sizeof(MD_LINE); + } + + byte_off += sizeof(MD_BLOCK); + } + + ctx->n_block_bytes = 0; + +abort: + return ret; +} + + +/************************************ + *** Grouping Lines into Blocks *** + ************************************/ + +static void* +md_push_block_bytes(MD_CTX* ctx, int n_bytes) +{ + void* ptr; + + if(ctx->n_block_bytes + n_bytes > ctx->alloc_block_bytes) { + void* new_block_bytes; + + ctx->alloc_block_bytes = (ctx->alloc_block_bytes > 0 + ? ctx->alloc_block_bytes + ctx->alloc_block_bytes / 2 + : 512); + new_block_bytes = realloc(ctx->block_bytes, ctx->alloc_block_bytes); + if(new_block_bytes == NULL) { + MD_LOG("realloc() failed."); + return NULL; + } + + /* Fix the ->current_block after the reallocation. 
*/ + if(ctx->current_block != NULL) { + OFF off_current_block = (char*) ctx->current_block - (char*) ctx->block_bytes; + ctx->current_block = (MD_BLOCK*) ((char*) new_block_bytes + off_current_block); + } + + ctx->block_bytes = new_block_bytes; + } + + ptr = (char*)ctx->block_bytes + ctx->n_block_bytes; + ctx->n_block_bytes += n_bytes; + return ptr; +} + +static int +md_start_new_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* line) +{ + MD_BLOCK* block; + + MD_ASSERT(ctx->current_block == NULL); + + block = (MD_BLOCK*) md_push_block_bytes(ctx, sizeof(MD_BLOCK)); + if(block == NULL) + return -1; + + switch(line->type) { + case MD_LINE_HR: + block->type = MD_BLOCK_HR; + break; + + case MD_LINE_ATXHEADER: + case MD_LINE_SETEXTHEADER: + block->type = MD_BLOCK_H; + break; + + case MD_LINE_FENCEDCODE: + case MD_LINE_INDENTEDCODE: + block->type = MD_BLOCK_CODE; + break; + + case MD_LINE_TEXT: + block->type = MD_BLOCK_P; + break; + + case MD_LINE_HTML: + block->type = MD_BLOCK_HTML; + break; + + case MD_LINE_BLANK: + case MD_LINE_SETEXTUNDERLINE: + case MD_LINE_TABLEUNDERLINE: + default: + MD_UNREACHABLE(); + break; + } + + block->flags = 0; + block->data = line->data; + block->n_lines = 0; + + ctx->current_block = block; + return 0; +} + +/* Eat from start of current (textual) block any reference definitions and + * remember them so we can resolve any links referring to them. + * + * (Reference definitions can only be at start of it as they cannot break + * a paragraph.) + */ +static int +md_consume_link_reference_definitions(MD_CTX* ctx) +{ + MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1); + int n_lines = ctx->current_block->n_lines; + int n = 0; + + /* Compute how many lines at the start of the block form one or more + * reference definitions. */ + while(n < n_lines) { + int n_link_ref_lines; + + n_link_ref_lines = md_is_link_reference_definition(ctx, + lines + n, n_lines - n); + /* Not a reference definition? 
*/ + if(n_link_ref_lines == 0) + break; + + /* We fail if it is the ref. def. but it could not be stored due + * a memory allocation error. */ + if(n_link_ref_lines < 0) + return -1; + + n += n_link_ref_lines; + } + + /* If there was at least one reference definition, we need to remove + * its lines from the block, or perhaps even the whole block. */ + if(n > 0) { + if(n == n_lines) { + /* Remove complete block. */ + ctx->n_block_bytes -= n * sizeof(MD_LINE); + ctx->n_block_bytes -= sizeof(MD_BLOCK); + ctx->current_block = NULL; + } else { + /* Remove just some initial lines from the block. */ + memmove(lines, lines + n, (n_lines - n) * sizeof(MD_LINE)); + ctx->current_block->n_lines -= n; + ctx->n_block_bytes -= n * sizeof(MD_LINE); + } + } + + return 0; +} + +static int +md_end_current_block(MD_CTX* ctx) +{ + int ret = 0; + + if(ctx->current_block == NULL) + return ret; + + /* Check whether there is a reference definition. (We do this here instead + * of in md_analyze_line() because reference definition can take multiple + * lines.) */ + if(ctx->current_block->type == MD_BLOCK_P || + (ctx->current_block->type == MD_BLOCK_H && (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER))) + { + MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1); + if(CH(lines[0].beg) == _T('[')) { + MD_CHECK(md_consume_link_reference_definitions(ctx)); + if(ctx->current_block == NULL) + return ret; + } + } + + if(ctx->current_block->type == MD_BLOCK_H && (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)) { + int n_lines = ctx->current_block->n_lines; + + if(n_lines > 1) { + /* Get rid of the underline. */ + ctx->current_block->n_lines--; + ctx->n_block_bytes -= sizeof(MD_LINE); + } else { + /* Only the underline has left after eating the ref. defs. + * Keep the line as beginning of a new ordinary paragraph. */ + ctx->current_block->type = MD_BLOCK_P; + return 0; + } + } + + /* Mark we are not building any block anymore. 
*/ + ctx->current_block = NULL; + +abort: + return ret; +} + +static int +md_add_line_into_current_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* analysis) +{ + MD_ASSERT(ctx->current_block != NULL); + + if(ctx->current_block->type == MD_BLOCK_CODE || ctx->current_block->type == MD_BLOCK_HTML) { + MD_VERBATIMLINE* line; + + line = (MD_VERBATIMLINE*) md_push_block_bytes(ctx, sizeof(MD_VERBATIMLINE)); + if(line == NULL) + return -1; + + line->indent = analysis->indent; + line->beg = analysis->beg; + line->end = analysis->end; + } else { + MD_LINE* line; + + line = (MD_LINE*) md_push_block_bytes(ctx, sizeof(MD_LINE)); + if(line == NULL) + return -1; + + line->beg = analysis->beg; + line->end = analysis->end; + } + ctx->current_block->n_lines++; + + return 0; +} + +static int +md_push_container_bytes(MD_CTX* ctx, MD_BLOCKTYPE type, unsigned start, + unsigned data, unsigned flags) +{ + MD_BLOCK* block; + int ret = 0; + + MD_CHECK(md_end_current_block(ctx)); + + block = (MD_BLOCK*) md_push_block_bytes(ctx, sizeof(MD_BLOCK)); + if(block == NULL) + return -1; + + block->type = type; + block->flags = flags; + block->data = data; + block->n_lines = start; + +abort: + return ret; +} + + + +/*********************** + *** Line Analysis *** + ***********************/ + +static int +md_is_hr_line(MD_CTX* ctx, OFF beg, OFF* p_end, OFF* p_killer) +{ + OFF off = beg + 1; + int n = 1; + + while(off < ctx->size && (CH(off) == CH(beg) || CH(off) == _T(' ') || CH(off) == _T('\t'))) { + if(CH(off) == CH(beg)) + n++; + off++; + } + + if(n < 3) { + *p_killer = off; + return FALSE; + } + + /* Nothing else can be present on the line. 
*/ + if(off < ctx->size && !ISNEWLINE(off)) { + *p_killer = off; + return FALSE; + } + + *p_end = off; + return TRUE; +} + +static int +md_is_atxheader_line(MD_CTX* ctx, OFF beg, OFF* p_beg, OFF* p_end, unsigned* p_level) +{ + int n; + OFF off = beg + 1; + + while(off < ctx->size && CH(off) == _T('#') && off - beg < 7) + off++; + n = off - beg; + + if(n > 6) + return FALSE; + *p_level = n; + + if(!(ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS) && off < ctx->size && + CH(off) != _T(' ') && CH(off) != _T('\t') && !ISNEWLINE(off)) + return FALSE; + + while(off < ctx->size && CH(off) == _T(' ')) + off++; + *p_beg = off; + *p_end = off; + return TRUE; +} + +static int +md_is_setext_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_level) +{ + OFF off = beg + 1; + + while(off < ctx->size && CH(off) == CH(beg)) + off++; + + /* Optionally, space(s) can follow. */ + while(off < ctx->size && CH(off) == _T(' ')) + off++; + + /* But nothing more is allowed on the line. */ + if(off < ctx->size && !ISNEWLINE(off)) + return FALSE; + + *p_level = (CH(beg) == _T('=') ? 1 : 2); + *p_end = off; + return TRUE; +} + +static int +md_is_table_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_col_count) +{ + OFF off = beg; + int found_pipe = FALSE; + unsigned col_count = 0; + + if(off < ctx->size && CH(off) == _T('|')) { + found_pipe = TRUE; + off++; + while(off < ctx->size && ISWHITESPACE(off)) + off++; + } + + while(1) { + OFF cell_beg; + int delimited = FALSE; + + /* Cell underline ("-----", ":----", "----:" or ":----:") */ + cell_beg = off; + if(off < ctx->size && CH(off) == _T(':')) + off++; + while(off < ctx->size && CH(off) == _T('-')) + off++; + if(off < ctx->size && CH(off) == _T(':')) + off++; + if(off - cell_beg < 3) + return FALSE; + + col_count++; + + /* Pipe delimiter (optional at the end of line). 
*/ + while(off < ctx->size && ISWHITESPACE(off)) + off++; + if(off < ctx->size && CH(off) == _T('|')) { + delimited = TRUE; + found_pipe = TRUE; + off++; + while(off < ctx->size && ISWHITESPACE(off)) + off++; + } + + /* Success, if we reach end of line. */ + if(off >= ctx->size || ISNEWLINE(off)) + break; + + if(!delimited) + return FALSE; + } + + if(!found_pipe) + return FALSE; + + *p_end = off; + *p_col_count = col_count; + return TRUE; +} + +static int +md_is_opening_code_fence(MD_CTX* ctx, OFF beg, OFF* p_end) +{ + OFF off = beg; + + while(off < ctx->size && CH(off) == CH(beg)) + off++; + + /* Fence must have at least three characters. */ + if(off - beg < 3) + return FALSE; + + ctx->code_fence_length = off - beg; + + /* Optionally, space(s) can follow. */ + while(off < ctx->size && CH(off) == _T(' ')) + off++; + + /* Optionally, an info string can follow. */ + while(off < ctx->size && !ISNEWLINE(off)) { + /* Backtick-based fence must not contain '`' in the info string. */ + if(CH(beg) == _T('`') && CH(off) == _T('`')) + return FALSE; + off++; + } + + *p_end = off; + return TRUE; +} + +static int +md_is_closing_code_fence(MD_CTX* ctx, CHAR ch, OFF beg, OFF* p_end) +{ + OFF off = beg; + int ret = FALSE; + + /* Closing fence must have at least the same length and use same char as + * opening one. */ + while(off < ctx->size && CH(off) == ch) + off++; + if(off - beg < ctx->code_fence_length) + goto out; + + /* Optionally, space(s) can follow */ + while(off < ctx->size && CH(off) == _T(' ')) + off++; + + /* But nothing more is allowed on the line. */ + if(off < ctx->size && !ISNEWLINE(off)) + goto out; + + ret = TRUE; + +out: + /* Note we set *p_end even on failure: If we are not closing fence, caller + * would eat the line anyway without any parsing. */ + *p_end = off; + return ret; +} + +/* Returns type of the raw HTML block, or FALSE if it is not HTML block. + * (Refer to CommonMark specification for details about the types.) 
+ */ +static int +md_is_html_block_start_condition(MD_CTX* ctx, OFF beg) +{ + typedef struct TAG_tag TAG; + struct TAG_tag { + const CHAR* name; + unsigned len : 8; + }; + + /* Type 6 is started by a long list of allowed tags. We use two-level + * tree to speed-up the search. */ +#ifdef X + #undef X +#endif +#define X(name) { _T(name), (sizeof(name)-1) / sizeof(CHAR) } +#define Xend { NULL, 0 } + static const TAG t1[] = { X("script"), X("pre"), X("style"), Xend }; + + static const TAG a6[] = { X("address"), X("article"), X("aside"), Xend }; + static const TAG b6[] = { X("base"), X("basefont"), X("blockquote"), X("body"), Xend }; + static const TAG c6[] = { X("caption"), X("center"), X("col"), X("colgroup"), Xend }; + static const TAG d6[] = { X("dd"), X("details"), X("dialog"), X("dir"), + X("div"), X("dl"), X("dt"), Xend }; + static const TAG f6[] = { X("fieldset"), X("figcaption"), X("figure"), X("footer"), + X("form"), X("frame"), X("frameset"), Xend }; + static const TAG h6[] = { X("h1"), X("head"), X("header"), X("hr"), X("html"), Xend }; + static const TAG i6[] = { X("iframe"), Xend }; + static const TAG l6[] = { X("legend"), X("li"), X("link"), Xend }; + static const TAG m6[] = { X("main"), X("menu"), X("menuitem"), Xend }; + static const TAG n6[] = { X("nav"), X("noframes"), Xend }; + static const TAG o6[] = { X("ol"), X("optgroup"), X("option"), Xend }; + static const TAG p6[] = { X("p"), X("param"), Xend }; + static const TAG s6[] = { X("section"), X("source"), X("summary"), Xend }; + static const TAG t6[] = { X("table"), X("tbody"), X("td"), X("tfoot"), X("th"), + X("thead"), X("title"), X("tr"), X("track"), Xend }; + static const TAG u6[] = { X("ul"), Xend }; + static const TAG xx[] = { Xend }; +#undef X + + static const TAG* map6[26] = { + a6, b6, c6, d6, xx, f6, xx, h6, i6, xx, xx, l6, m6, + n6, o6, p6, xx, xx, s6, t6, u6, xx, xx, xx, xx, xx + }; + OFF off = beg + 1; + int i; + + /* Check for type 1: size) { + if(md_ascii_case_eq(STR(off), t1[i].name, 
t1[i].len)) + return 1; + } + } + + /* Check for type 2: "), 3, p_end) ? 2 : FALSE); + + case 3: + return (md_line_contains(ctx, beg, _T("?>"), 2, p_end) ? 3 : FALSE); + + case 4: + return (md_line_contains(ctx, beg, _T(">"), 1, p_end) ? 4 : FALSE); + + case 5: + return (md_line_contains(ctx, beg, _T("]]>"), 3, p_end) ? 5 : FALSE); + + case 6: /* Pass through */ + case 7: + *p_end = beg; + return (ISNEWLINE(beg) ? ctx->html_block_type : FALSE); + + default: + MD_UNREACHABLE(); + } + return FALSE; +} + + +static int +md_is_container_compatible(const MD_CONTAINER* pivot, const MD_CONTAINER* container) +{ + /* Block quote has no "items" like lists. */ + if(container->ch == _T('>')) + return FALSE; + + if(container->ch != pivot->ch) + return FALSE; + if(container->mark_indent > pivot->contents_indent) + return FALSE; + + return TRUE; +} + +static int +md_push_container(MD_CTX* ctx, const MD_CONTAINER* container) +{ + if(ctx->n_containers >= ctx->alloc_containers) { + MD_CONTAINER* new_containers; + + ctx->alloc_containers = (ctx->alloc_containers > 0 + ? ctx->alloc_containers + ctx->alloc_containers / 2 + : 16); + new_containers = realloc(ctx->containers, ctx->alloc_containers * sizeof(MD_CONTAINER)); + if(new_containers == NULL) { + MD_LOG("realloc() failed."); + return -1; + } + + ctx->containers = new_containers; + } + + memcpy(&ctx->containers[ctx->n_containers++], container, sizeof(MD_CONTAINER)); + return 0; +} + +static int +md_enter_child_containers(MD_CTX* ctx, int n_children, unsigned data) +{ + int i; + int ret = 0; + + for(i = ctx->n_containers - n_children; i < ctx->n_containers; i++) { + MD_CONTAINER* c = &ctx->containers[i]; + int is_ordered_list = FALSE; + + switch(c->ch) { + case _T(')'): + case _T('.'): + is_ordered_list = TRUE; + /* Pass through */ + + case _T('-'): + case _T('+'): + case _T('*'): + /* Remember offset in ctx->block_bytes so we can revisit the + * block if we detect it is a loose list. 
*/ + md_end_current_block(ctx); + c->block_byte_off = ctx->n_block_bytes; + + MD_CHECK(md_push_container_bytes(ctx, + (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL), + c->start, data, MD_BLOCK_CONTAINER_OPENER)); + MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI, + c->task_mark_off, + (c->is_task ? CH(c->task_mark_off) : 0), + MD_BLOCK_CONTAINER_OPENER)); + break; + + case _T('>'): + MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0, 0, MD_BLOCK_CONTAINER_OPENER)); + break; + + default: + MD_UNREACHABLE(); + break; + } + } + +abort: + return ret; +} + +static int +md_leave_child_containers(MD_CTX* ctx, int n_keep) +{ + int ret = 0; + + while(ctx->n_containers > n_keep) { + MD_CONTAINER* c = &ctx->containers[ctx->n_containers-1]; + int is_ordered_list = FALSE; + + switch(c->ch) { + case _T(')'): + case _T('.'): + is_ordered_list = TRUE; + /* Pass through */ + + case _T('-'): + case _T('+'): + case _T('*'): + MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI, + c->task_mark_off, (c->is_task ? CH(c->task_mark_off) : 0), + MD_BLOCK_CONTAINER_CLOSER)); + MD_CHECK(md_push_container_bytes(ctx, + (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL), 0, + c->ch, MD_BLOCK_CONTAINER_CLOSER)); + break; + + case _T('>'): + MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0, + 0, MD_BLOCK_CONTAINER_CLOSER)); + break; + + default: + MD_UNREACHABLE(); + break; + } + + ctx->n_containers--; + } + +abort: + return ret; +} + +static int +md_is_container_mark(MD_CTX* ctx, unsigned indent, OFF beg, OFF* p_end, MD_CONTAINER* p_container) +{ + OFF off = beg; + OFF max_end; + + if(indent >= ctx->code_indent_offset) + return FALSE; + + /* Check for block quote mark. */ + if(off < ctx->size && CH(off) == _T('>')) { + off++; + p_container->ch = _T('>'); + p_container->is_loose = FALSE; + p_container->is_task = FALSE; + p_container->mark_indent = indent; + p_container->contents_indent = indent + 1; + *p_end = off; + return TRUE; + } + + /* Check for list item bullet mark. 
*/ + if(off+1 < ctx->size && ISANYOF(off, _T("-+*")) && (ISBLANK(off+1) || ISNEWLINE(off+1))) { + p_container->ch = CH(off); + p_container->is_loose = FALSE; + p_container->is_task = FALSE; + p_container->mark_indent = indent; + p_container->contents_indent = indent + 1; + *p_end = off + 1; + return TRUE; + } + + /* Check for ordered list item marks. */ + max_end = off + 9; + if(max_end > ctx->size) + max_end = ctx->size; + p_container->start = 0; + while(off < max_end && ISDIGIT(off)) { + p_container->start = p_container->start * 10 + CH(off) - _T('0'); + off++; + } + if(off > beg && off+1 < ctx->size && + (CH(off) == _T('.') || CH(off) == _T(')')) && + (ISBLANK(off+1) || ISNEWLINE(off+1))) + { + p_container->ch = CH(off); + p_container->is_loose = FALSE; + p_container->is_task = FALSE; + p_container->mark_indent = indent; + p_container->contents_indent = indent + off - beg + 1; + *p_end = off + 1; + return TRUE; + } + + return FALSE; +} + +static unsigned +md_line_indentation(MD_CTX* ctx, unsigned total_indent, OFF beg, OFF* p_end) +{ + OFF off = beg; + unsigned indent = total_indent; + + while(off < ctx->size && ISBLANK(off)) { + if(CH(off) == _T('\t')) + indent = (indent + 4) & ~3; + else + indent++; + off++; + } + + *p_end = off; + return indent - total_indent; +} + +static const MD_LINE_ANALYSIS md_dummy_blank_line = { MD_LINE_BLANK, 0 }; + +/* Analyze type of the line and find some its properties. This serves as a + * main input for determining type and boundaries of a block. 
*/ +static int +md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end, + const MD_LINE_ANALYSIS* pivot_line, MD_LINE_ANALYSIS* line) +{ + unsigned total_indent = 0; + int n_parents = 0; + int n_brothers = 0; + int n_children = 0; + MD_CONTAINER container = { 0 }; + int prev_line_has_list_loosening_effect = ctx->last_line_has_list_loosening_effect; + OFF off = beg; + OFF hr_killer = 0; + int ret = 0; + + line->indent = md_line_indentation(ctx, total_indent, off, &off); + total_indent += line->indent; + line->beg = off; + + /* Given the indentation and block quote marks '>', determine how many of + * the current containers are our parents. */ + while(n_parents < ctx->n_containers) { + MD_CONTAINER* c = &ctx->containers[n_parents]; + + if(c->ch == _T('>') && line->indent < ctx->code_indent_offset && + off < ctx->size && CH(off) == _T('>')) + { + /* Block quote mark. */ + off++; + total_indent++; + line->indent = md_line_indentation(ctx, total_indent, off, &off); + total_indent += line->indent; + + /* The optional 1st space after '>' is part of the block quote mark. */ + if(line->indent > 0) + line->indent--; + + line->beg = off; + + } else if(c->ch != _T('>') && line->indent >= c->contents_indent) { + /* List. */ + line->indent -= c->contents_indent; + } else { + break; + } + + n_parents++; + } + + if(off >= ctx->size || ISNEWLINE(off)) { + /* Blank line does not need any real indentation to be nested inside + * a list. */ + if(n_brothers + n_children == 0) { + while(n_parents < ctx->n_containers && ctx->containers[n_parents].ch != _T('>')) + n_parents++; + } + } + + while(TRUE) { + /* Check whether we are fenced code continuation. */ + if(pivot_line->type == MD_LINE_FENCEDCODE) { + line->beg = off; + + /* We are another MD_LINE_FENCEDCODE unless we are closing fence + * which we transform into MD_LINE_BLANK. 
*/ + if(line->indent < ctx->code_indent_offset) { + if(md_is_closing_code_fence(ctx, CH(pivot_line->beg), off, &off)) { + line->type = MD_LINE_BLANK; + ctx->last_line_has_list_loosening_effect = FALSE; + break; + } + } + + /* Change indentation accordingly to the initial code fence. */ + if(n_parents == ctx->n_containers) { + if(line->indent > pivot_line->indent) + line->indent -= pivot_line->indent; + else + line->indent = 0; + + line->type = MD_LINE_FENCEDCODE; + break; + } + } + + /* Check whether we are HTML block continuation. */ + if(pivot_line->type == MD_LINE_HTML && ctx->html_block_type > 0) { + int html_block_type; + + html_block_type = md_is_html_block_end_condition(ctx, off, &off); + if(html_block_type > 0) { + MD_ASSERT(html_block_type == ctx->html_block_type); + + /* Make sure this is the last line of the block. */ + ctx->html_block_type = 0; + + /* Some end conditions serve as blank lines at the same time. */ + if(html_block_type == 6 || html_block_type == 7) { + line->type = MD_LINE_BLANK; + line->indent = 0; + break; + } + } + + if(n_parents == ctx->n_containers) { + line->type = MD_LINE_HTML; + break; + } + } + + /* Check for blank line. */ + if(off >= ctx->size || ISNEWLINE(off)) { + if(pivot_line->type == MD_LINE_INDENTEDCODE && n_parents == ctx->n_containers) { + line->type = MD_LINE_INDENTEDCODE; + if(line->indent > ctx->code_indent_offset) + line->indent -= ctx->code_indent_offset; + else + line->indent = 0; + ctx->last_line_has_list_loosening_effect = FALSE; + } else { + line->type = MD_LINE_BLANK; + ctx->last_line_has_list_loosening_effect = (n_parents > 0 && + n_brothers + n_children == 0 && + ctx->containers[n_parents-1].ch != _T('>')); + + #if 1 + /* See https://github.com/mity/md4c/issues/6 + * + * This ugly checking tests we are in (yet empty) list item but not + * its very first line (with the list item mark). 
+ * + * If we are such blank line, then any following non-blank line + * which would be part of this list item actually ends the list + * because "a list item can begin with at most one blank line." + */ + if(n_parents > 0 && ctx->containers[n_parents-1].ch != _T('>') && + n_brothers + n_children == 0 && ctx->current_block == NULL && + ctx->n_block_bytes > (int) sizeof(MD_BLOCK)) + { + MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK)); + if(top_block->type == MD_BLOCK_LI) + ctx->last_list_item_starts_with_two_blank_lines = TRUE; + } + #endif + } + break; + } else { + #if 1 + /* This is 2nd half of the hack. If the flag is set (that is there + * were 2nd blank line at the start of the list item) and we would also + * belonging to such list item, then interrupt the list. */ + ctx->last_line_has_list_loosening_effect = FALSE; + if(ctx->last_list_item_starts_with_two_blank_lines) { + if(n_parents > 0 && ctx->containers[n_parents-1].ch != _T('>') && + n_brothers + n_children == 0 && ctx->current_block == NULL && + ctx->n_block_bytes > (int) sizeof(MD_BLOCK)) + { + MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK)); + if(top_block->type == MD_BLOCK_LI) + n_parents--; + } + + ctx->last_list_item_starts_with_two_blank_lines = FALSE; + } + #endif + } + + /* Check whether we are Setext underline. */ + if(line->indent < ctx->code_indent_offset && pivot_line->type == MD_LINE_TEXT + && (CH(off) == _T('=') || CH(off) == _T('-')) + && (n_parents == ctx->n_containers)) + { + unsigned level; + + if(md_is_setext_underline(ctx, off, &off, &level)) { + line->type = MD_LINE_SETEXTUNDERLINE; + line->data = level; + break; + } + } + + /* Check for thematic break line. 
*/ + if(line->indent < ctx->code_indent_offset && ISANYOF(off, _T("-_*")) && off >= hr_killer) { + if(md_is_hr_line(ctx, off, &off, &hr_killer)) { + line->type = MD_LINE_HR; + break; + } + } + + /* Check for "brother" container. I.e. whether we are another list item + * in already started list. */ + if(n_parents < ctx->n_containers && n_brothers + n_children == 0) { + OFF tmp; + + if(md_is_container_mark(ctx, line->indent, off, &tmp, &container) && + md_is_container_compatible(&ctx->containers[n_parents], &container)) + { + pivot_line = &md_dummy_blank_line; + + off = tmp; + + total_indent += container.contents_indent - container.mark_indent; + line->indent = md_line_indentation(ctx, total_indent, off, &off); + total_indent += line->indent; + line->beg = off; + + /* Some of the following whitespace actually still belongs to the mark. */ + if(off >= ctx->size || ISNEWLINE(off)) { + container.contents_indent++; + } else if(line->indent <= ctx->code_indent_offset) { + container.contents_indent += line->indent; + line->indent = 0; + } else { + container.contents_indent += 1; + line->indent--; + } + + ctx->containers[n_parents].mark_indent = container.mark_indent; + ctx->containers[n_parents].contents_indent = container.contents_indent; + + n_brothers++; + continue; + } + } + + /* Check for indented code. + * Note indented code block cannot interrupt a paragraph. */ + if(line->indent >= ctx->code_indent_offset && + (pivot_line->type == MD_LINE_BLANK || pivot_line->type == MD_LINE_INDENTEDCODE)) + { + line->type = MD_LINE_INDENTEDCODE; + MD_ASSERT(line->indent >= ctx->code_indent_offset); + line->indent -= ctx->code_indent_offset; + line->data = 0; + break; + } + + /* Check for start of a new container block. 
*/ + if(line->indent < ctx->code_indent_offset && + md_is_container_mark(ctx, line->indent, off, &off, &container)) + { + if(pivot_line->type == MD_LINE_TEXT && n_parents == ctx->n_containers && + (off >= ctx->size || ISNEWLINE(off)) && container.ch != _T('>')) + { + /* Noop. List mark followed by a blank line cannot interrupt a paragraph. */ + } else if(pivot_line->type == MD_LINE_TEXT && n_parents == ctx->n_containers && + (container.ch == _T('.') || container.ch == _T(')')) && container.start != 1) + { + /* Noop. Ordered list cannot interrupt a paragraph unless the start index is 1. */ + } else { + total_indent += container.contents_indent - container.mark_indent; + line->indent = md_line_indentation(ctx, total_indent, off, &off); + total_indent += line->indent; + + line->beg = off; + line->data = container.ch; + + /* Some of the following whitespace actually still belongs to the mark. */ + if(off >= ctx->size || ISNEWLINE(off)) { + container.contents_indent++; + } else if(line->indent <= ctx->code_indent_offset) { + container.contents_indent += line->indent; + line->indent = 0; + } else { + container.contents_indent += 1; + line->indent--; + } + + if(n_brothers + n_children == 0) + pivot_line = &md_dummy_blank_line; + + if(n_children == 0) + MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers)); + + n_children++; + MD_CHECK(md_push_container(ctx, &container)); + continue; + } + } + + /* Check whether we are table continuation. */ + if(pivot_line->type == MD_LINE_TABLE && n_parents == ctx->n_containers) { + line->type = MD_LINE_TABLE; + break; + } + + /* Check for ATX header. */ + if(line->indent < ctx->code_indent_offset && CH(off) == _T('#')) { + unsigned level; + + if(md_is_atxheader_line(ctx, off, &line->beg, &off, &level)) { + line->type = MD_LINE_ATXHEADER; + line->data = level; + break; + } + } + + /* Check whether we are starting code fence. 
*/ + if(CH(off) == _T('`') || CH(off) == _T('~')) { + if(md_is_opening_code_fence(ctx, off, &off)) { + line->type = MD_LINE_FENCEDCODE; + line->data = 1; + break; + } + } + + /* Check for start of raw HTML block. */ + if(CH(off) == _T('<') && !(ctx->parser.flags & MD_FLAG_NOHTMLBLOCKS)) + { + ctx->html_block_type = md_is_html_block_start_condition(ctx, off); + + /* HTML block type 7 cannot interrupt paragraph. */ + if(ctx->html_block_type == 7 && pivot_line->type == MD_LINE_TEXT) + ctx->html_block_type = 0; + + if(ctx->html_block_type > 0) { + /* The line itself also may immediately close the block. */ + if(md_is_html_block_end_condition(ctx, off, &off) == ctx->html_block_type) { + /* Make sure this is the last line of the block. */ + ctx->html_block_type = 0; + } + + line->type = MD_LINE_HTML; + break; + } + } + + /* Check for table underline. */ + if((ctx->parser.flags & MD_FLAG_TABLES) && pivot_line->type == MD_LINE_TEXT && + (CH(off) == _T('|') || CH(off) == _T('-') || CH(off) == _T(':')) && + n_parents == ctx->n_containers) + { + unsigned col_count; + + if(ctx->current_block != NULL && ctx->current_block->n_lines == 1 && + md_is_table_underline(ctx, off, &off, &col_count)) + { + line->data = col_count; + line->type = MD_LINE_TABLEUNDERLINE; + break; + } + } + + /* By default, we are normal text line. */ + line->type = MD_LINE_TEXT; + if(pivot_line->type == MD_LINE_TEXT && n_brothers + n_children == 0) { + /* Lazy continuation. */ + n_parents = ctx->n_containers; + } + + /* Check for task mark. */ + if((ctx->parser.flags & MD_FLAG_TASKLISTS) && n_brothers + n_children > 0 && + ISANYOF_(ctx->containers[ctx->n_containers-1].ch, _T("-+*.)"))) + { + OFF tmp = off; + + while(tmp < ctx->size && tmp < off + 3 && ISBLANK(tmp)) + tmp++; + if(tmp + 2 < ctx->size && CH(tmp) == _T('[') && + ISANYOF(tmp+1, _T("xX ")) && CH(tmp+2) == _T(']') && + (tmp + 3 == ctx->size || ISBLANK(tmp+3) || ISNEWLINE(tmp+3))) + { + MD_CONTAINER* task_container = (n_children > 0 ? 
&ctx->containers[ctx->n_containers-1] : &container); + task_container->is_task = TRUE; + task_container->task_mark_off = tmp + 1; + off = tmp + 3; + while(ISWHITESPACE(off)) + off++; + line->beg = off; + } + } + + break; + } + + /* Scan for end of the line. + * + * Note this is quite a bottleneck of the parsing as we here iterate almost + * over compete document. + */ +#if defined __linux__ && !defined MD4C_USE_UTF16 + /* Recent glibc versions have superbly optimized strcspn(), even using + * vectorization if available. */ + if(ctx->doc_ends_with_newline && off < ctx->size) { + while(TRUE) { + off += (OFF) strcspn(STR(off), "\r\n"); + + /* strcspn() can stop on zero terminator; but that can appear + * anywhere in the Markfown input... */ + if(CH(off) == _T('\0')) + off++; + else + break; + } + } else +#endif + { + /* Optimization: Use some loop unrolling. */ + while(off + 3 < ctx->size && !ISNEWLINE(off+0) && !ISNEWLINE(off+1) + && !ISNEWLINE(off+2) && !ISNEWLINE(off+3)) + off += 4; + while(off < ctx->size && !ISNEWLINE(off)) + off++; + } + + /* Set end of the line. */ + line->end = off; + + /* But for ATX header, we should exclude the optional trailing mark. */ + if(line->type == MD_LINE_ATXHEADER) { + OFF tmp = line->end; + while(tmp > line->beg && CH(tmp-1) == _T(' ')) + tmp--; + while(tmp > line->beg && CH(tmp-1) == _T('#')) + tmp--; + if(tmp == line->beg || CH(tmp-1) == _T(' ') || (ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS)) + line->end = tmp; + } + + /* Trim trailing spaces. */ + if(line->type != MD_LINE_INDENTEDCODE && line->type != MD_LINE_FENCEDCODE) { + while(line->end > line->beg && CH(line->end-1) == _T(' ')) + line->end--; + } + + /* Eat also the new line. */ + if(off < ctx->size && CH(off) == _T('\r')) + off++; + if(off < ctx->size && CH(off) == _T('\n')) + off++; + + *p_end = off; + + /* If we belong to a list after seeing a blank line, the list is loose. 
*/ + if(prev_line_has_list_loosening_effect && line->type != MD_LINE_BLANK && n_parents + n_brothers > 0) { + MD_CONTAINER* c = &ctx->containers[n_parents + n_brothers - 1]; + if(c->ch != _T('>')) { + MD_BLOCK* block = (MD_BLOCK*) (((char*)ctx->block_bytes) + c->block_byte_off); + block->flags |= MD_BLOCK_LOOSE_LIST; + } + } + + /* Leave any containers we are not part of anymore. */ + if(n_children == 0 && n_parents + n_brothers < ctx->n_containers) + MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers)); + + /* Enter any container we found a mark for. */ + if(n_brothers > 0) { + MD_ASSERT(n_brothers == 1); + MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI, + ctx->containers[n_parents].task_mark_off, + (ctx->containers[n_parents].is_task ? CH(ctx->containers[n_parents].task_mark_off) : 0), + MD_BLOCK_CONTAINER_CLOSER)); + MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI, + container.task_mark_off, + (container.is_task ? CH(container.task_mark_off) : 0), + MD_BLOCK_CONTAINER_OPENER)); + ctx->containers[n_parents].is_task = container.is_task; + ctx->containers[n_parents].task_mark_off = container.task_mark_off; + } + + if(n_children > 0) + MD_CHECK(md_enter_child_containers(ctx, n_children, line->data)); + +abort: + return ret; +} + +static int +md_process_line(MD_CTX* ctx, const MD_LINE_ANALYSIS** p_pivot_line, MD_LINE_ANALYSIS* line) +{ + const MD_LINE_ANALYSIS* pivot_line = *p_pivot_line; + int ret = 0; + + /* Blank line ends current leaf block. */ + if(line->type == MD_LINE_BLANK) { + MD_CHECK(md_end_current_block(ctx)); + *p_pivot_line = &md_dummy_blank_line; + return 0; + } + + /* Some line types form block on their own. */ + if(line->type == MD_LINE_HR || line->type == MD_LINE_ATXHEADER) { + MD_CHECK(md_end_current_block(ctx)); + + /* Add our single-line block. 
*/ + MD_CHECK(md_start_new_block(ctx, line)); + MD_CHECK(md_add_line_into_current_block(ctx, line)); + MD_CHECK(md_end_current_block(ctx)); + *p_pivot_line = &md_dummy_blank_line; + return 0; + } + + /* MD_LINE_SETEXTUNDERLINE changes meaning of the current block and ends it. */ + if(line->type == MD_LINE_SETEXTUNDERLINE) { + MD_ASSERT(ctx->current_block != NULL); + ctx->current_block->type = MD_BLOCK_H; + ctx->current_block->data = line->data; + ctx->current_block->flags |= MD_BLOCK_SETEXT_HEADER; + MD_CHECK(md_add_line_into_current_block(ctx, line)); + MD_CHECK(md_end_current_block(ctx)); + if(ctx->current_block == NULL) { + *p_pivot_line = &md_dummy_blank_line; + } else { + /* This happens if we have consumed all the body as link ref. defs. + * and downgraded the underline into start of a new paragraph block. */ + line->type = MD_LINE_TEXT; + *p_pivot_line = line; + } + return 0; + } + + /* MD_LINE_TABLEUNDERLINE changes meaning of the current block. */ + if(line->type == MD_LINE_TABLEUNDERLINE) { + MD_ASSERT(ctx->current_block != NULL); + MD_ASSERT(ctx->current_block->n_lines == 1); + ctx->current_block->type = MD_BLOCK_TABLE; + ctx->current_block->data = line->data; + MD_ASSERT(pivot_line != &md_dummy_blank_line); + ((MD_LINE_ANALYSIS*)pivot_line)->type = MD_LINE_TABLE; + MD_CHECK(md_add_line_into_current_block(ctx, line)); + return 0; + } + + /* The current block also ends if the line has different type. */ + if(line->type != pivot_line->type) + MD_CHECK(md_end_current_block(ctx)); + + /* The current line may start a new block. */ + if(ctx->current_block == NULL) { + MD_CHECK(md_start_new_block(ctx, line)); + *p_pivot_line = line; + } + + /* In all other cases the line is just a continuation of the current block. 
*/ + MD_CHECK(md_add_line_into_current_block(ctx, line)); + +abort: + return ret; +} + +static int +md_process_doc(MD_CTX *ctx) +{ + const MD_LINE_ANALYSIS* pivot_line = &md_dummy_blank_line; + MD_LINE_ANALYSIS line_buf[2]; + MD_LINE_ANALYSIS* line = &line_buf[0]; + OFF off = 0; + int ret = 0; + + MD_ENTER_BLOCK(MD_BLOCK_DOC, NULL); + + while(off < ctx->size) { + if(line == pivot_line) + line = (line == &line_buf[0] ? &line_buf[1] : &line_buf[0]); + + MD_CHECK(md_analyze_line(ctx, off, &off, pivot_line, line)); + MD_CHECK(md_process_line(ctx, &pivot_line, line)); + } + + md_end_current_block(ctx); + + MD_CHECK(md_build_ref_def_hashtable(ctx)); + + /* Process all blocks. */ + MD_CHECK(md_leave_child_containers(ctx, 0)); + MD_CHECK(md_process_all_blocks(ctx)); + + MD_LEAVE_BLOCK(MD_BLOCK_DOC, NULL); + +abort: + +#if 0 + /* Output some memory consumption statistics. */ + { + char buffer[256]; + sprintf(buffer, "Alloced %u bytes for block buffer.", + (unsigned)(ctx->alloc_block_bytes)); + MD_LOG(buffer); + + sprintf(buffer, "Alloced %u bytes for containers buffer.", + (unsigned)(ctx->alloc_containers * sizeof(MD_CONTAINER))); + MD_LOG(buffer); + + sprintf(buffer, "Alloced %u bytes for marks buffer.", + (unsigned)(ctx->alloc_marks * sizeof(MD_MARK))); + MD_LOG(buffer); + + sprintf(buffer, "Alloced %u bytes for aux. buffer.", + (unsigned)(ctx->alloc_buffer * sizeof(MD_CHAR))); + MD_LOG(buffer); + } +#endif + + return ret; +} + + +/******************** + *** Public API *** + ********************/ + +int +md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata) +{ + MD_CTX ctx; + int i; + int ret; + + if(parser->abi_version != 0) { + if(parser->debug_log != NULL) + parser->debug_log("Unsupported abi_version.", userdata); + return -1; + } + + /* Setup context structure. 
*/ + memset(&ctx, 0, sizeof(MD_CTX)); + ctx.text = text; + ctx.size = size; + memcpy(&ctx.parser, parser, sizeof(MD_PARSER)); + ctx.userdata = userdata; + ctx.code_indent_offset = (ctx.parser.flags & MD_FLAG_NOINDENTEDCODEBLOCKS) ? (OFF)(-1) : 4; + md_build_mark_char_map(&ctx); + ctx.doc_ends_with_newline = (size > 0 && ISNEWLINE_(text[size-1])); + + /* Reset all unresolved opener mark chains. */ + for(i = 0; i < (int) SIZEOF_ARRAY(ctx.mark_chains); i++) { + ctx.mark_chains[i].head = -1; + ctx.mark_chains[i].tail = -1; + } + ctx.unresolved_link_head = -1; + ctx.unresolved_link_tail = -1; + + /* All the work. */ + ret = md_process_doc(&ctx); + + /* Clean-up. */ + md_free_ref_defs(&ctx); + md_free_ref_def_hashtable(&ctx); + free(ctx.buffer); + free(ctx.marks); + free(ctx.block_bytes); + free(ctx.containers); + + return ret; +} diff --git a/md4c/md4c.h b/md4c/md4c.h new file mode 100644 index 0000000..c2c4311 --- /dev/null +++ b/md4c/md4c.h @@ -0,0 +1,388 @@ +/* + * MD4C: Markdown parser for C + * (http://github.com/mity/md4c) + * + * Copyright (c) 2016-2020 Martin Mitas + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef MD4C_MARKDOWN_H +#define MD4C_MARKDOWN_H + +#ifdef __cplusplus + extern "C" { +#endif + +#if defined MD4C_USE_UTF16 + /* Magic to support UTF-16. Not that in order to use it, you have to define + * the macro MD4C_USE_UTF16 both when building MD4C as well as when + * including this header in your code. */ + #ifdef _WIN32 + #include + typedef WCHAR MD_CHAR; + #else + #error MD4C_USE_UTF16 is only supported on Windows. + #endif +#else + typedef char MD_CHAR; +#endif + +typedef unsigned MD_SIZE; +typedef unsigned MD_OFFSET; + + +/* Block represents a part of document hierarchy structure like a paragraph + * or list item. + */ +typedef enum MD_BLOCKTYPE { + /* ... */ + MD_BLOCK_DOC = 0, + + /*
        <blockquote>...</blockquote>
        */ + MD_BLOCK_QUOTE, + + /*
          <ul>...</ul>
        + * Detail: Structure MD_BLOCK_UL_DETAIL. */ + MD_BLOCK_UL, + + /*
          <ol>...</ol>
        + * Detail: Structure MD_BLOCK_OL_DETAIL. */ + MD_BLOCK_OL, + + /*
        <li>...</li>
        + * Detail: Structure MD_BLOCK_LI_DETAIL. */ + MD_BLOCK_LI, + + /*
        <hr> */ + MD_BLOCK_HR, + + /*

        <h1>...</h1> (for levels up to 6) + * Detail: Structure MD_BLOCK_H_DETAIL. */ + MD_BLOCK_H, + + /*
        <pre><code>...</code></pre>
        + * Note the text lines within code blocks are terminated with '\n' + * instead of explicit MD_TEXT_BR. */ + MD_BLOCK_CODE, + + /* Raw HTML block. This itself does not correspond to any particular HTML + * tag. The contents of it _is_ raw HTML source intended to be put + * in verbatim form to the HTML output. */ + MD_BLOCK_HTML, + + /*

        <p>...</p> */ + MD_BLOCK_P, + + /* <table>...</table>
        and its contents. + * Detail: Structure MD_BLOCK_TD_DETAIL (used with MD_BLOCK_TH and MD_BLOCK_TD) + * Note all of these are used only if extension MD_FLAG_TABLES is enabled. */ + MD_BLOCK_TABLE, + MD_BLOCK_THEAD, + MD_BLOCK_TBODY, + MD_BLOCK_TR, + MD_BLOCK_TH, + MD_BLOCK_TD +} MD_BLOCKTYPE; + +/* Span represents an in-line piece of a document which should be rendered with + * the same font, color and other attributes. A sequence of spans forms a block + * like paragraph or list item. */ +typedef enum MD_SPANTYPE { + /* ... */ + MD_SPAN_EM, + + /* ... */ + MD_SPAN_STRONG, + + /* ... + * Detail: Structure MD_SPAN_A_DETAIL. */ + MD_SPAN_A, + + /* ... + * Detail: Structure MD_SPAN_IMG_DETAIL. + * Note: Image text can contain nested spans and even nested images. + * If rendered into ALT attribute of HTML tag, it's responsibility + * of the renderer to deal with it. + */ + MD_SPAN_IMG, + + /* ... */ + MD_SPAN_CODE, + + /* ... + * Note: Recognized only when MD_FLAG_STRIKETHROUGH is enabled. + */ + MD_SPAN_DEL, + + /* For recognizing inline ($) and display ($$) equations + * Note: Recognized only when MD_FLAG_LATEXMATHSPANS is enabled. + */ + MD_SPAN_LATEXMATH, + MD_SPAN_LATEXMATH_DISPLAY, + + /* Wiki links + * Note: Recognized only when MD_FLAG_WIKILINKS is enabled. + */ + MD_SPAN_WIKILINK, + + /* ... + * Note: Recognized only when MD_FLAG_UNDERLINE is enabled. */ + MD_SPAN_U +} MD_SPANTYPE; + +/* Text is the actual textual contents of span. */ +typedef enum MD_TEXTTYPE { + /* Normal text. */ + MD_TEXT_NORMAL = 0, + + /* NULL character. CommonMark requires replacing NULL character with + * the replacement char U+FFFD, so this allows caller to do that easily. */ + MD_TEXT_NULLCHAR, + + /* Line breaks. + * Note these are not sent from blocks with verbatim output (MD_BLOCK_CODE + * or MD_BLOCK_HTML). In such cases, '\n' is part of the text itself. */ + MD_TEXT_BR, /*
        (hard break) */ + MD_TEXT_SOFTBR, /* '\n' in source text where it is not semantically meaningful (soft break) */ + + /* Entity. + * (a) Named entity, e.g.   + * (Note MD4C does not have a list of known entities. + * Anything matching the regexp /&[A-Za-z][A-Za-z0-9]{1,47};/ is + * treated as a named entity.) + * (b) Numerical entity, e.g. Ӓ + * (c) Hexadecimal entity, e.g. ካ + * + * As MD4C is mostly encoding agnostic, application gets the verbatim + * entity text into the MD_RENDERER::text_callback(). */ + MD_TEXT_ENTITY, + + /* Text in a code block (inside MD_BLOCK_CODE) or inlined code (`code`). + * If it is inside MD_BLOCK_CODE, it includes spaces for indentation and + * '\n' for new lines. MD_TEXT_BR and MD_TEXT_SOFTBR are not sent for this + * kind of text. */ + MD_TEXT_CODE, + + /* Text is a raw HTML. If it is contents of a raw HTML block (i.e. not + * an inline raw HTML), then MD_TEXT_BR and MD_TEXT_SOFTBR are not used. + * The text contains verbatim '\n' for the new lines. */ + MD_TEXT_HTML, + + /* Text is inside an equation. This is processed the same way as inlined code + * spans (`code`). */ + MD_TEXT_LATEXMATH +} MD_TEXTTYPE; + + +/* Alignment enumeration. */ +typedef enum MD_ALIGN { + MD_ALIGN_DEFAULT = 0, /* When unspecified. */ + MD_ALIGN_LEFT, + MD_ALIGN_CENTER, + MD_ALIGN_RIGHT +} MD_ALIGN; + + +/* String attribute. + * + * This wraps strings which are outside of a normal text flow and which are + * propagated within various detailed structures, but which still may contain + * string portions of different types like e.g. entities. + * + * So, for example, lets consider an image has a title attribute string + * set to "foo " bar". (Note the string size is 14.) 
+ * + * Then the attribute MD_SPAN_IMG_DETAIL::title shall provide the following: + * -- [0]: "foo " (substr_types[0] == MD_TEXT_NORMAL; substr_offsets[0] == 0) + * -- [1]: """ (substr_types[1] == MD_TEXT_ENTITY; substr_offsets[1] == 4) + * -- [2]: " bar" (substr_types[2] == MD_TEXT_NORMAL; substr_offsets[2] == 10) + * -- [3]: (n/a) (n/a ; substr_offsets[3] == 14) + * + * Note that these conditions are guaranteed: + * -- substr_offsets[0] == 0 + * -- substr_offsets[LAST+1] == size + * -- Only MD_TEXT_NORMAL, MD_TEXT_ENTITY, MD_TEXT_NULLCHAR substrings can appear. + */ +typedef struct MD_ATTRIBUTE { + const MD_CHAR* text; + MD_SIZE size; + const MD_TEXTTYPE* substr_types; + const MD_OFFSET* substr_offsets; +} MD_ATTRIBUTE; + + +/* Detailed info for MD_BLOCK_UL. */ +typedef struct MD_BLOCK_UL_DETAIL { + int is_tight; /* Non-zero if tight list, zero if loose. */ + MD_CHAR mark; /* Item bullet character in MarkDown source of the list, e.g. '-', '+', '*'. */ +} MD_BLOCK_UL_DETAIL; + +/* Detailed info for MD_BLOCK_OL. */ +typedef struct MD_BLOCK_OL_DETAIL { + unsigned start; /* Start index of the ordered list. */ + int is_tight; /* Non-zero if tight list, zero if loose. */ + MD_CHAR mark_delimiter; /* Character delimiting the item marks in MarkDown source, e.g. '.' or ')' */ +} MD_BLOCK_OL_DETAIL; + +/* Detailed info for MD_BLOCK_LI. */ +typedef struct MD_BLOCK_LI_DETAIL { + int is_task; /* Can be non-zero only with MD_FLAG_TASKLISTS */ + MD_CHAR task_mark; /* If is_task, then one of 'x', 'X' or ' '. Undefined otherwise. */ + MD_OFFSET task_mark_offset; /* If is_task, then offset in the input of the char between '[' and ']'. */ +} MD_BLOCK_LI_DETAIL; + +/* Detailed info for MD_BLOCK_H. */ +typedef struct MD_BLOCK_H_DETAIL { + unsigned level; /* Header level (1 - 6) */ +} MD_BLOCK_H_DETAIL; + +/* Detailed info for MD_BLOCK_CODE. 
*/ +typedef struct MD_BLOCK_CODE_DETAIL { + MD_ATTRIBUTE info; + MD_ATTRIBUTE lang; + MD_CHAR fence_char; /* The character used for fenced code block; or zero for indented code block. */ +} MD_BLOCK_CODE_DETAIL; + +/* Detailed info for MD_BLOCK_TH and MD_BLOCK_TD. */ +typedef struct MD_BLOCK_TD_DETAIL { + MD_ALIGN align; +} MD_BLOCK_TD_DETAIL; + +/* Detailed info for MD_SPAN_A. */ +typedef struct MD_SPAN_A_DETAIL { + MD_ATTRIBUTE href; + MD_ATTRIBUTE title; +} MD_SPAN_A_DETAIL; + +/* Detailed info for MD_SPAN_IMG. */ +typedef struct MD_SPAN_IMG_DETAIL { + MD_ATTRIBUTE src; + MD_ATTRIBUTE title; +} MD_SPAN_IMG_DETAIL; + +/* Detailed info for MD_SPAN_WIKILINK. */ +typedef struct MD_SPAN_WIKILINK { + MD_ATTRIBUTE target; +} MD_SPAN_WIKILINK_DETAIL; + +/* Flags specifying extensions/deviations from CommonMark specification. + * + * By default (when MD_RENDERER::flags == 0), we follow CommonMark specification. + * The following flags may allow some extensions or deviations from it. + */ +#define MD_FLAG_COLLAPSEWHITESPACE 0x0001 /* In MD_TEXT_NORMAL, collapse non-trivial whitespace into single ' ' */ +#define MD_FLAG_PERMISSIVEATXHEADERS 0x0002 /* Do not require space in ATX headers ( ###header ) */ +#define MD_FLAG_PERMISSIVEURLAUTOLINKS 0x0004 /* Recognize URLs as autolinks even without '<', '>' */ +#define MD_FLAG_PERMISSIVEEMAILAUTOLINKS 0x0008 /* Recognize e-mails as autolinks even without '<', '>' and 'mailto:' */ +#define MD_FLAG_NOINDENTEDCODEBLOCKS 0x0010 /* Disable indented code blocks. (Only fenced code works.) */ +#define MD_FLAG_NOHTMLBLOCKS 0x0020 /* Disable raw HTML blocks. */ +#define MD_FLAG_NOHTMLSPANS 0x0040 /* Disable raw HTML (inline). */ +#define MD_FLAG_TABLES 0x0100 /* Enable tables extension. */ +#define MD_FLAG_STRIKETHROUGH 0x0200 /* Enable strikethrough extension. 
*/ +#define MD_FLAG_PERMISSIVEWWWAUTOLINKS 0x0400 /* Enable WWW autolinks (even without any scheme prefix, if they begin with 'www.') */ +#define MD_FLAG_TASKLISTS 0x0800 /* Enable task list extension. */ +#define MD_FLAG_LATEXMATHSPANS 0x1000 /* Enable $ and $$ containing LaTeX equations. */ +#define MD_FLAG_WIKILINKS 0x2000 /* Enable wiki links extension. */ +#define MD_FLAG_UNDERLINE 0x4000 /* Enable underline extension (and disables '_' for normal emphasis). */ + +#define MD_FLAG_PERMISSIVEAUTOLINKS (MD_FLAG_PERMISSIVEEMAILAUTOLINKS | MD_FLAG_PERMISSIVEURLAUTOLINKS | MD_FLAG_PERMISSIVEWWWAUTOLINKS) +#define MD_FLAG_NOHTML (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS) + +/* Convenient sets of flags corresponding to well-known Markdown dialects. + * + * Note we may only support subset of features of the referred dialect. + * The constant just enables those extensions which bring us as close as + * possible given what features we implement. + * + * ABI compatibility note: Meaning of these can change in time as new + * extensions, bringing the dialect closer to the original, are implemented. + */ +#define MD_DIALECT_COMMONMARK 0 +#define MD_DIALECT_GITHUB (MD_FLAG_PERMISSIVEAUTOLINKS | MD_FLAG_TABLES | MD_FLAG_STRIKETHROUGH | MD_FLAG_TASKLISTS) + +/* Renderer structure. + */ +typedef struct MD_PARSER { + /* Reserved. Set to zero. + */ + unsigned abi_version; + + /* Dialect options. Bitmask of MD_FLAG_xxxx values. + */ + unsigned flags; + + /* Caller-provided rendering callbacks. + * + * For some block/span types, more detailed information is provided in a + * type-specific structure pointed by the argument 'detail'. + * + * The last argument of all callbacks, 'userdata', is just propagated from + * md_parse() and is available for any use by the application. + * + * Note any strings provided to the callbacks as their arguments or as + * members of any detail structure are generally not zero-terminated. 
+ * Application has take the respective size information into account. + * + * Callbacks may abort further parsing of the document by returning non-zero. + */ + int (*enter_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/); + int (*leave_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/); + + int (*enter_span)(MD_SPANTYPE /*type*/, void* /*detail*/, void* /*userdata*/); + int (*leave_span)(MD_SPANTYPE /*type*/, void* /*detail*/, void* /*userdata*/); + + int (*text)(MD_TEXTTYPE /*type*/, const MD_CHAR* /*text*/, MD_SIZE /*size*/, void* /*userdata*/); + + /* Debug callback. Optional (may be NULL). + * + * If provided and something goes wrong, this function gets called. + * This is intended for debugging and problem diagnosis for developers; + * it is not intended to provide any errors suitable for displaying to an + * end user. + */ + void (*debug_log)(const char* /*msg*/, void* /*userdata*/); + + /* Reserved. Set to NULL. + */ + void (*syntax)(void); +} MD_PARSER; + + +/* For backward compatibility. Do not use in new code. */ +typedef MD_PARSER MD_RENDERER; + + +/* Parse the Markdown document stored in the string 'text' of size 'size'. + * The renderer provides callbacks to be called during the parsing so the + * caller can render the document on the screen or convert the Markdown + * to another format. + * + * Zero is returned on success. If a runtime error occurs (e.g. a memory + * fails), -1 is returned. If the processing is aborted due any callback + * returning non-zero, md_parse() the return value of the callback is returned. 
+ */ +int md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata); + + +#ifdef __cplusplus + } /* extern "C" { */ +#endif + +#endif /* MD4C_MARKDOWN_H */ diff --git a/md4c/md4c.pc.in b/md4c/md4c.pc.in new file mode 100644 index 0000000..61c78d8 --- /dev/null +++ b/md4c/md4c.pc.in @@ -0,0 +1,12 @@ +prefix=@CMAKE_INSTALL_PREFIX@ +exec_prefix=@CMAKE_INSTALL_PREFIX@ +libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@ +includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@ + +Name: @PROJECT_NAME@ +Description: @PROJECT_DESCRIPTION@ +Version: @PROJECT_VERSION@ + +Requires: +Libs: -L${libdir} -lmd4c +Cflags: -I${includedir} diff --git a/scripts/build_folding_map.py b/scripts/build_folding_map.py new file mode 100644 index 0000000..b27b480 --- /dev/null +++ b/scripts/build_folding_map.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 + +import os +import sys +import textwrap + + +self_path = os.path.dirname(os.path.realpath(__file__)); +f = open(self_path + "/unicode/CaseFolding.txt", "r") + +status_list = [ "C", "F" ] + +folding_list = [ dict(), dict(), dict() ] + +# Filter the foldings for "full" folding. +for line in f: + comment_off = line.find("#") + if comment_off >= 0: + line = line[:comment_off] + line = line.strip() + if not line: + continue + + raw_codepoint, status, raw_mapping, ignored_tail = line.split(";", 3) + if not status.strip() in status_list: + continue + codepoint = int(raw_codepoint.strip(), 16) + mapping = [int(it, 16) for it in raw_mapping.strip().split(" ")] + mapping_len = len(mapping) + + if mapping_len in range(1, 4): + folding_list[mapping_len-1][codepoint] = mapping + else: + assert(False) +f.close() + + +# If we assume that range (index0 ... index-1) makes a range, check that index +# is compatible with it too. 
+# +# We are capable to handle ranges which: +# +# (1) either form consecutive sequence of codepoints and which map that range +# to other consecutive range of codepoints; +# +# (2) or consecutive range of codepoints with step 2 where each codepoint +# CP is mapped to the next codepoint CP+1 +# (e.g. 0x1234 -> 0x1235; 0x1236 -> 0x1238; ...). +# +# (If the mappings have multiple codepoints, only the 1st mapped codepoint is +# considered and all the other ones have to be the same for the whole range.) +def is_range_compatible(folding, codepoint_list, index0, index): + N = index - index0 + codepoint0 = codepoint_list[index0] + codepoint1 = codepoint_list[index0+1] + codepointN = codepoint_list[index] + mapping0 = folding[codepoint0] + mapping1 = folding[codepoint1] + mappingN = folding[codepointN] + + # Check the range type (1): + if codepoint1 - codepoint0 == 1 and codepointN - codepoint0 == N \ + and mapping1[0] - mapping0[0] == 1 and mapping1[1:] == mapping0[1:] \ + and mappingN[0] - mapping0[0] == N and mappingN[1:] == mapping0[1:]: + return True + + # Check the range type (2): + if codepoint1 - codepoint0 == 2 and codepointN - codepoint0 == 2 * N \ + and mapping0[0] - codepoint0 == 1 \ + and mapping1[0] - codepoint1 == 1 and mapping1[1:] == mapping0[1:] \ + and mappingN[0] - codepointN == 1 and mappingN[1:] == mapping0[1:]: + return True + + return False + + +def mapping_str(list, mapping): + return ",".join("0x{:04x}".format(x) for x in mapping) + +for mapping_len in range(1, 4): + folding = folding_list[mapping_len-1] + codepoint_list = list(folding) + + index0 = 0 + count = len(folding) + + records = list() + data_records = list() + + while index0 < count: + index1 = index0 + 1 + while index1 < count and is_range_compatible(folding, codepoint_list, index0, index1): + index1 += 1 + + if index1 - index0 > 2: + # Range of codepoints + records.append("R(0x{:04x},0x{:04x})".format(codepoint_list[index0], codepoint_list[index1-1])) + 
data_records.append(mapping_str(data_records, folding[codepoint_list[index0]])) + data_records.append(mapping_str(data_records, folding[codepoint_list[index1-1]])) + else: + # Single codepoint + records.append("S(0x{:04x})".format(codepoint_list[index0])) + data_records.append(mapping_str(data_records, folding[codepoint_list[index0]])) + + index0 = index1 + + sys.stdout.write("static const unsigned FOLD_MAP_{}[] = {{\n".format(mapping_len)) + sys.stdout.write("\n".join(textwrap.wrap(", ".join(records), 110, + initial_indent = " ", subsequent_indent=" "))) + sys.stdout.write("\n};\n") + + sys.stdout.write("static const unsigned FOLD_MAP_{}_DATA[] = {{\n".format(mapping_len)) + sys.stdout.write("\n".join(textwrap.wrap(", ".join(data_records), 110, + initial_indent = " ", subsequent_indent=" "))) + sys.stdout.write("\n};\n") + + + diff --git a/scripts/build_punct_map.py b/scripts/build_punct_map.py new file mode 100644 index 0000000..13102f2 --- /dev/null +++ b/scripts/build_punct_map.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +import os +import sys +import textwrap + + +self_path = os.path.dirname(os.path.realpath(__file__)); +f = open(self_path + "/unicode/DerivedGeneralCategory.txt", "r") + +codepoint_list = [] +category_list = [ "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" ] + +# Filter codepoints falling in the right category: +for line in f: + comment_off = line.find("#") + if comment_off >= 0: + line = line[:comment_off] + line = line.strip() + if not line: + continue + + char_range, category = line.split(";") + char_range = char_range.strip() + category = category.strip() + + if not category in category_list: + continue + + delim_off = char_range.find("..") + if delim_off >= 0: + codepoint0 = int(char_range[:delim_off], 16) + codepoint1 = int(char_range[delim_off+2:], 16) + for codepoint in range(codepoint0, codepoint1 + 1): + codepoint_list.append(codepoint) + else: + codepoint = int(char_range, 16) + codepoint_list.append(codepoint) +f.close() + + 
+codepoint_list.sort() + + +index0 = 0 +count = len(codepoint_list) + +records = list() +while index0 < count: + index1 = index0 + 1 + while index1 < count and codepoint_list[index1] == codepoint_list[index1-1] + 1: + index1 += 1 + + if index1 - index0 > 1: + # Range of codepoints + records.append("R(0x{:04x},0x{:04x})".format(codepoint_list[index0], codepoint_list[index1-1])) + else: + # Single codepoint + records.append("S(0x{:04x})".format(codepoint_list[index0])) + + index0 = index1 + +sys.stdout.write("static const unsigned PUNCT_MAP[] = {\n") +sys.stdout.write("\n".join(textwrap.wrap(", ".join(records), 110, + initial_indent = " ", subsequent_indent=" "))) +sys.stdout.write("\n};\n\n") diff --git a/scripts/build_whitespace_map.py b/scripts/build_whitespace_map.py new file mode 100644 index 0000000..932b571 --- /dev/null +++ b/scripts/build_whitespace_map.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +import os +import sys +import textwrap + + +self_path = os.path.dirname(os.path.realpath(__file__)); +f = open(self_path + "/unicode/DerivedGeneralCategory.txt", "r") + +codepoint_list = [] +category_list = [ "Zs" ] + +# Filter codepoints falling in the right category: +for line in f: + comment_off = line.find("#") + if comment_off >= 0: + line = line[:comment_off] + line = line.strip() + if not line: + continue + + char_range, category = line.split(";") + char_range = char_range.strip() + category = category.strip() + + if not category in category_list: + continue + + delim_off = char_range.find("..") + if delim_off >= 0: + codepoint0 = int(char_range[:delim_off], 16) + codepoint1 = int(char_range[delim_off+2:], 16) + for codepoint in range(codepoint0, codepoint1 + 1): + codepoint_list.append(codepoint) + else: + codepoint = int(char_range, 16) + codepoint_list.append(codepoint) +f.close() + + +codepoint_list.sort() + + +index0 = 0 +count = len(codepoint_list) + +records = list() +while index0 < count: + index1 = index0 + 1 + while index1 < count and 
codepoint_list[index1] == codepoint_list[index1-1] + 1: + index1 += 1 + + if index1 - index0 > 1: + # Range of codepoints + records.append("R(0x{:04x},0x{:04x})".format(codepoint_list[index0], codepoint_list[index1-1])) + else: + # Single codepoint + records.append("S(0x{:04x})".format(codepoint_list[index0])) + + index0 = index1 + +sys.stdout.write("static const unsigned WHITESPACE_MAP[] = {\n") +sys.stdout.write("\n".join(textwrap.wrap(", ".join(records), 110, + initial_indent = " ", subsequent_indent=" "))) +sys.stdout.write("\n};\n\n") diff --git a/scripts/coverity.sh b/scripts/coverity.sh new file mode 100755 index 0000000..bcf7f14 --- /dev/null +++ b/scripts/coverity.sh @@ -0,0 +1,70 @@ +#!/bin/sh +# +# This script attempts to build the project via the cov-build utility, and prepares +# a package for uploading to the Coverity Scan service. +# +# (See http://scan.coverity.com for more info.) + +set -e + +# Check presence of coverity static analyzer. +if ! which cov-build; then + echo "Utility cov-build not found in PATH." + exit 1 +fi + +# Choose a build system (ninja or GNU make). +if which ninja; then + BUILD_TOOL=ninja + GENERATOR=Ninja +elif which make; then + BUILD_TOOL=make + GENERATOR="MSYS Makefiles" +else + echo "No suitable build system found." + exit 1 +fi + +# Choose a zip tool. +if which 7za; then + MKZIP="7za a -r -mx9" +elif which 7z; then + MKZIP="7z a -r -mx9" +elif which zip; then + MKZIP="zip -r" +else + echo "No suitable zip utility found" + exit 1 +fi + +# Change dir to project root. +cd `dirname "$0"`/.. + +CWD=`pwd` +ROOT_DIR="$CWD" +BUILD_DIR="$CWD/coverity" +OUTPUT="$CWD/cov-int.zip" + +# Sanity checks. +if [ ! -x "$ROOT_DIR/scripts/coverity.sh" ]; then + echo "There is some path mismatch." + exit 1 +fi +if [ -e "$BUILD_DIR" ]; then + echo "Path $BUILD_DIR already exists. Delete it and retry." + exit 1 +fi +if [ -e "$OUTPUT" ]; then + echo "Path $OUTPUT already exists. Delete it and retry."
+ exit 1 +fi + +# Build the project with the Coverity analysis enabled. +mkdir -p "$BUILD_DIR" +cd "$BUILD_DIR" +cmake -G "$GENERATOR" "$ROOT_DIR" +cov-build --dir cov-int "$BUILD_TOOL" +$MKZIP "$OUTPUT" "cov-int" +cd "$ROOT_DIR" +rm -rf "$BUILD_DIR" + diff --git a/scripts/run-tests.sh b/scripts/run-tests.sh new file mode 100755 index 0000000..c00b36a --- /dev/null +++ b/scripts/run-tests.sh @@ -0,0 +1,75 @@ +#!/bin/sh +# +# Run this script from the build directory. + +#set -e + +SELF_DIR=`dirname $0` +PROJECT_DIR="$SELF_DIR/.." +TEST_DIR="$PROJECT_DIR/test" + + +PROGRAM="md2html/md2html" +if [ ! -x "$PROGRAM" ]; then + echo "Cannot find the $PROGRAM." >&2 + echo "You have to run this script from the build directory." >&2 + exit 1 +fi + +if which py >>/dev/null 2>&1; then + PYTHON=py +elif which python3 >>/dev/null 2>&1; then + PYTHON=python3 +elif which python >>/dev/null 2>&1; then + if [ `python --version | awk '{print $2}' | cut -d. -f1` -ge 3 ]; then + PYTHON=python + fi +fi + +echo +echo "CommonMark specification:" +$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/spec.txt" -p "$PROGRAM" + +echo +echo "Code coverage & regressions:" +$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/coverage.txt" -p "$PROGRAM" + +echo +echo "Permissive e-mail autolinks extension:" +$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/permissive-email-autolinks.txt" -p "$PROGRAM --fpermissive-email-autolinks" + +echo +echo "Permissive URL autolinks extension:" +$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/permissive-url-autolinks.txt" -p "$PROGRAM --fpermissive-url-autolinks" + +echo +echo "WWW autolinks extension:" +$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/permissive-www-autolinks.txt" -p "$PROGRAM --fpermissive-www-autolinks" + +echo +echo "Tables extension:" +$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/tables.txt" -p "$PROGRAM --ftables" + +echo +echo "Strikethrough extension:" +$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/strikethrough.txt" -p "$PROGRAM
--fstrikethrough" + +echo +echo "Task lists extension:" +$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/tasklists.txt" -p "$PROGRAM --ftasklists" + +echo +echo "LaTeX extension:" +$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/latex-math.txt" -p "$PROGRAM --flatex-math" + +echo +echo "Wiki links extension:" +$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/wiki-links.txt" -p "$PROGRAM --fwiki-links --ftables" + +echo +echo "Underline extension:" +$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/underline.txt" -p "$PROGRAM --funderline" + +echo +echo "Pathological input:" +$PYTHON "$TEST_DIR/pathological_tests.py" -p "$PROGRAM" diff --git a/scripts/unicode/CaseFolding.txt b/scripts/unicode/CaseFolding.txt new file mode 100644 index 0000000..7eeb915 --- /dev/null +++ b/scripts/unicode/CaseFolding.txt @@ -0,0 +1,1581 @@ +# CaseFolding-12.1.0.txt +# Date: 2019-03-10, 10:53:00 GMT +# © 2019 Unicode®, Inc. +# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. +# For terms of use, see http://www.unicode.org/terms_of_use.html +# +# Unicode Character Database +# For documentation, see http://www.unicode.org/reports/tr44/ +# +# Case Folding Properties +# +# This file is a supplement to the UnicodeData file. +# It provides a case folding mapping generated from the Unicode Character Database. +# If all characters are mapped according to the full mapping below, then +# case differences (according to UnicodeData.txt and SpecialCasing.txt) +# are eliminated. +# +# The data supports both implementations that require simple case foldings +# (where string lengths don't change), and implementations that allow full case folding +# (where string lengths may grow). Note that where they can be supported, the +# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match. +# +# All code points not listed in this file map to themselves. +# +# NOTE: case folding does not preserve normalization formats! 
+# +# For information on case folding, including how to have case folding +# preserve normalization formats, see Section 3.13 Default Case Algorithms in +# The Unicode Standard. +# +# ================================================================================ +# Format +# ================================================================================ +# The entries in this file are in the following machine-readable format: +# +# <code>; <status>; <mapping>; # <name> +# +# The status field is: +# C: common case folding, common mappings shared by both simple and full mappings. +# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces. +# S: simple case folding, mappings to single characters where different from F. +# T: special case for uppercase I and dotted uppercase I +# - For non-Turkic languages, this mapping is normally not used. +# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters. +# Note that the Turkic mappings do not maintain canonical equivalence without additional processing. +# See the discussions of case mapping in the Unicode Standard for more information. +# +# Usage: +# A. To do a simple case folding, use the mappings with status C + S. +# B. To do a full case folding, use the mappings with status C + F. +# +# The mappings with status T can be used or omitted depending on the desired case-folding +# behavior. (The default option is to exclude them.) +# +# ================================================================= + +# Property: Case_Folding + +# All code points not explicitly listed for Case_Folding +# have the value C for the status field, and the code point itself for the mapping field.
+ +# ================================================================= +0041; C; 0061; # LATIN CAPITAL LETTER A +0042; C; 0062; # LATIN CAPITAL LETTER B +0043; C; 0063; # LATIN CAPITAL LETTER C +0044; C; 0064; # LATIN CAPITAL LETTER D +0045; C; 0065; # LATIN CAPITAL LETTER E +0046; C; 0066; # LATIN CAPITAL LETTER F +0047; C; 0067; # LATIN CAPITAL LETTER G +0048; C; 0068; # LATIN CAPITAL LETTER H +0049; C; 0069; # LATIN CAPITAL LETTER I +0049; T; 0131; # LATIN CAPITAL LETTER I +004A; C; 006A; # LATIN CAPITAL LETTER J +004B; C; 006B; # LATIN CAPITAL LETTER K +004C; C; 006C; # LATIN CAPITAL LETTER L +004D; C; 006D; # LATIN CAPITAL LETTER M +004E; C; 006E; # LATIN CAPITAL LETTER N +004F; C; 006F; # LATIN CAPITAL LETTER O +0050; C; 0070; # LATIN CAPITAL LETTER P +0051; C; 0071; # LATIN CAPITAL LETTER Q +0052; C; 0072; # LATIN CAPITAL LETTER R +0053; C; 0073; # LATIN CAPITAL LETTER S +0054; C; 0074; # LATIN CAPITAL LETTER T +0055; C; 0075; # LATIN CAPITAL LETTER U +0056; C; 0076; # LATIN CAPITAL LETTER V +0057; C; 0077; # LATIN CAPITAL LETTER W +0058; C; 0078; # LATIN CAPITAL LETTER X +0059; C; 0079; # LATIN CAPITAL LETTER Y +005A; C; 007A; # LATIN CAPITAL LETTER Z +00B5; C; 03BC; # MICRO SIGN +00C0; C; 00E0; # LATIN CAPITAL LETTER A WITH GRAVE +00C1; C; 00E1; # LATIN CAPITAL LETTER A WITH ACUTE +00C2; C; 00E2; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX +00C3; C; 00E3; # LATIN CAPITAL LETTER A WITH TILDE +00C4; C; 00E4; # LATIN CAPITAL LETTER A WITH DIAERESIS +00C5; C; 00E5; # LATIN CAPITAL LETTER A WITH RING ABOVE +00C6; C; 00E6; # LATIN CAPITAL LETTER AE +00C7; C; 00E7; # LATIN CAPITAL LETTER C WITH CEDILLA +00C8; C; 00E8; # LATIN CAPITAL LETTER E WITH GRAVE +00C9; C; 00E9; # LATIN CAPITAL LETTER E WITH ACUTE +00CA; C; 00EA; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX +00CB; C; 00EB; # LATIN CAPITAL LETTER E WITH DIAERESIS +00CC; C; 00EC; # LATIN CAPITAL LETTER I WITH GRAVE +00CD; C; 00ED; # LATIN CAPITAL LETTER I WITH ACUTE +00CE; C; 00EE; # LATIN CAPITAL LETTER I WITH 
CIRCUMFLEX +00CF; C; 00EF; # LATIN CAPITAL LETTER I WITH DIAERESIS +00D0; C; 00F0; # LATIN CAPITAL LETTER ETH +00D1; C; 00F1; # LATIN CAPITAL LETTER N WITH TILDE +00D2; C; 00F2; # LATIN CAPITAL LETTER O WITH GRAVE +00D3; C; 00F3; # LATIN CAPITAL LETTER O WITH ACUTE +00D4; C; 00F4; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX +00D5; C; 00F5; # LATIN CAPITAL LETTER O WITH TILDE +00D6; C; 00F6; # LATIN CAPITAL LETTER O WITH DIAERESIS +00D8; C; 00F8; # LATIN CAPITAL LETTER O WITH STROKE +00D9; C; 00F9; # LATIN CAPITAL LETTER U WITH GRAVE +00DA; C; 00FA; # LATIN CAPITAL LETTER U WITH ACUTE +00DB; C; 00FB; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX +00DC; C; 00FC; # LATIN CAPITAL LETTER U WITH DIAERESIS +00DD; C; 00FD; # LATIN CAPITAL LETTER Y WITH ACUTE +00DE; C; 00FE; # LATIN CAPITAL LETTER THORN +00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S +0100; C; 0101; # LATIN CAPITAL LETTER A WITH MACRON +0102; C; 0103; # LATIN CAPITAL LETTER A WITH BREVE +0104; C; 0105; # LATIN CAPITAL LETTER A WITH OGONEK +0106; C; 0107; # LATIN CAPITAL LETTER C WITH ACUTE +0108; C; 0109; # LATIN CAPITAL LETTER C WITH CIRCUMFLEX +010A; C; 010B; # LATIN CAPITAL LETTER C WITH DOT ABOVE +010C; C; 010D; # LATIN CAPITAL LETTER C WITH CARON +010E; C; 010F; # LATIN CAPITAL LETTER D WITH CARON +0110; C; 0111; # LATIN CAPITAL LETTER D WITH STROKE +0112; C; 0113; # LATIN CAPITAL LETTER E WITH MACRON +0114; C; 0115; # LATIN CAPITAL LETTER E WITH BREVE +0116; C; 0117; # LATIN CAPITAL LETTER E WITH DOT ABOVE +0118; C; 0119; # LATIN CAPITAL LETTER E WITH OGONEK +011A; C; 011B; # LATIN CAPITAL LETTER E WITH CARON +011C; C; 011D; # LATIN CAPITAL LETTER G WITH CIRCUMFLEX +011E; C; 011F; # LATIN CAPITAL LETTER G WITH BREVE +0120; C; 0121; # LATIN CAPITAL LETTER G WITH DOT ABOVE +0122; C; 0123; # LATIN CAPITAL LETTER G WITH CEDILLA +0124; C; 0125; # LATIN CAPITAL LETTER H WITH CIRCUMFLEX +0126; C; 0127; # LATIN CAPITAL LETTER H WITH STROKE +0128; C; 0129; # LATIN CAPITAL LETTER I WITH TILDE +012A; C; 012B; # LATIN 
CAPITAL LETTER I WITH MACRON +012C; C; 012D; # LATIN CAPITAL LETTER I WITH BREVE +012E; C; 012F; # LATIN CAPITAL LETTER I WITH OGONEK +0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE +0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE +0132; C; 0133; # LATIN CAPITAL LIGATURE IJ +0134; C; 0135; # LATIN CAPITAL LETTER J WITH CIRCUMFLEX +0136; C; 0137; # LATIN CAPITAL LETTER K WITH CEDILLA +0139; C; 013A; # LATIN CAPITAL LETTER L WITH ACUTE +013B; C; 013C; # LATIN CAPITAL LETTER L WITH CEDILLA +013D; C; 013E; # LATIN CAPITAL LETTER L WITH CARON +013F; C; 0140; # LATIN CAPITAL LETTER L WITH MIDDLE DOT +0141; C; 0142; # LATIN CAPITAL LETTER L WITH STROKE +0143; C; 0144; # LATIN CAPITAL LETTER N WITH ACUTE +0145; C; 0146; # LATIN CAPITAL LETTER N WITH CEDILLA +0147; C; 0148; # LATIN CAPITAL LETTER N WITH CARON +0149; F; 02BC 006E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE +014A; C; 014B; # LATIN CAPITAL LETTER ENG +014C; C; 014D; # LATIN CAPITAL LETTER O WITH MACRON +014E; C; 014F; # LATIN CAPITAL LETTER O WITH BREVE +0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE +0152; C; 0153; # LATIN CAPITAL LIGATURE OE +0154; C; 0155; # LATIN CAPITAL LETTER R WITH ACUTE +0156; C; 0157; # LATIN CAPITAL LETTER R WITH CEDILLA +0158; C; 0159; # LATIN CAPITAL LETTER R WITH CARON +015A; C; 015B; # LATIN CAPITAL LETTER S WITH ACUTE +015C; C; 015D; # LATIN CAPITAL LETTER S WITH CIRCUMFLEX +015E; C; 015F; # LATIN CAPITAL LETTER S WITH CEDILLA +0160; C; 0161; # LATIN CAPITAL LETTER S WITH CARON +0162; C; 0163; # LATIN CAPITAL LETTER T WITH CEDILLA +0164; C; 0165; # LATIN CAPITAL LETTER T WITH CARON +0166; C; 0167; # LATIN CAPITAL LETTER T WITH STROKE +0168; C; 0169; # LATIN CAPITAL LETTER U WITH TILDE +016A; C; 016B; # LATIN CAPITAL LETTER U WITH MACRON +016C; C; 016D; # LATIN CAPITAL LETTER U WITH BREVE +016E; C; 016F; # LATIN CAPITAL LETTER U WITH RING ABOVE +0170; C; 0171; # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE +0172; C; 0173; # LATIN CAPITAL LETTER U 
WITH OGONEK +0174; C; 0175; # LATIN CAPITAL LETTER W WITH CIRCUMFLEX +0176; C; 0177; # LATIN CAPITAL LETTER Y WITH CIRCUMFLEX +0178; C; 00FF; # LATIN CAPITAL LETTER Y WITH DIAERESIS +0179; C; 017A; # LATIN CAPITAL LETTER Z WITH ACUTE +017B; C; 017C; # LATIN CAPITAL LETTER Z WITH DOT ABOVE +017D; C; 017E; # LATIN CAPITAL LETTER Z WITH CARON +017F; C; 0073; # LATIN SMALL LETTER LONG S +0181; C; 0253; # LATIN CAPITAL LETTER B WITH HOOK +0182; C; 0183; # LATIN CAPITAL LETTER B WITH TOPBAR +0184; C; 0185; # LATIN CAPITAL LETTER TONE SIX +0186; C; 0254; # LATIN CAPITAL LETTER OPEN O +0187; C; 0188; # LATIN CAPITAL LETTER C WITH HOOK +0189; C; 0256; # LATIN CAPITAL LETTER AFRICAN D +018A; C; 0257; # LATIN CAPITAL LETTER D WITH HOOK +018B; C; 018C; # LATIN CAPITAL LETTER D WITH TOPBAR +018E; C; 01DD; # LATIN CAPITAL LETTER REVERSED E +018F; C; 0259; # LATIN CAPITAL LETTER SCHWA +0190; C; 025B; # LATIN CAPITAL LETTER OPEN E +0191; C; 0192; # LATIN CAPITAL LETTER F WITH HOOK +0193; C; 0260; # LATIN CAPITAL LETTER G WITH HOOK +0194; C; 0263; # LATIN CAPITAL LETTER GAMMA +0196; C; 0269; # LATIN CAPITAL LETTER IOTA +0197; C; 0268; # LATIN CAPITAL LETTER I WITH STROKE +0198; C; 0199; # LATIN CAPITAL LETTER K WITH HOOK +019C; C; 026F; # LATIN CAPITAL LETTER TURNED M +019D; C; 0272; # LATIN CAPITAL LETTER N WITH LEFT HOOK +019F; C; 0275; # LATIN CAPITAL LETTER O WITH MIDDLE TILDE +01A0; C; 01A1; # LATIN CAPITAL LETTER O WITH HORN +01A2; C; 01A3; # LATIN CAPITAL LETTER OI +01A4; C; 01A5; # LATIN CAPITAL LETTER P WITH HOOK +01A6; C; 0280; # LATIN LETTER YR +01A7; C; 01A8; # LATIN CAPITAL LETTER TONE TWO +01A9; C; 0283; # LATIN CAPITAL LETTER ESH +01AC; C; 01AD; # LATIN CAPITAL LETTER T WITH HOOK +01AE; C; 0288; # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK +01AF; C; 01B0; # LATIN CAPITAL LETTER U WITH HORN +01B1; C; 028A; # LATIN CAPITAL LETTER UPSILON +01B2; C; 028B; # LATIN CAPITAL LETTER V WITH HOOK +01B3; C; 01B4; # LATIN CAPITAL LETTER Y WITH HOOK +01B5; C; 01B6; # LATIN CAPITAL 
LETTER Z WITH STROKE +01B7; C; 0292; # LATIN CAPITAL LETTER EZH +01B8; C; 01B9; # LATIN CAPITAL LETTER EZH REVERSED +01BC; C; 01BD; # LATIN CAPITAL LETTER TONE FIVE +01C4; C; 01C6; # LATIN CAPITAL LETTER DZ WITH CARON +01C5; C; 01C6; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON +01C7; C; 01C9; # LATIN CAPITAL LETTER LJ +01C8; C; 01C9; # LATIN CAPITAL LETTER L WITH SMALL LETTER J +01CA; C; 01CC; # LATIN CAPITAL LETTER NJ +01CB; C; 01CC; # LATIN CAPITAL LETTER N WITH SMALL LETTER J +01CD; C; 01CE; # LATIN CAPITAL LETTER A WITH CARON +01CF; C; 01D0; # LATIN CAPITAL LETTER I WITH CARON +01D1; C; 01D2; # LATIN CAPITAL LETTER O WITH CARON +01D3; C; 01D4; # LATIN CAPITAL LETTER U WITH CARON +01D5; C; 01D6; # LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON +01D7; C; 01D8; # LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE +01D9; C; 01DA; # LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON +01DB; C; 01DC; # LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE +01DE; C; 01DF; # LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON +01E0; C; 01E1; # LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON +01E2; C; 01E3; # LATIN CAPITAL LETTER AE WITH MACRON +01E4; C; 01E5; # LATIN CAPITAL LETTER G WITH STROKE +01E6; C; 01E7; # LATIN CAPITAL LETTER G WITH CARON +01E8; C; 01E9; # LATIN CAPITAL LETTER K WITH CARON +01EA; C; 01EB; # LATIN CAPITAL LETTER O WITH OGONEK +01EC; C; 01ED; # LATIN CAPITAL LETTER O WITH OGONEK AND MACRON +01EE; C; 01EF; # LATIN CAPITAL LETTER EZH WITH CARON +01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON +01F1; C; 01F3; # LATIN CAPITAL LETTER DZ +01F2; C; 01F3; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z +01F4; C; 01F5; # LATIN CAPITAL LETTER G WITH ACUTE +01F6; C; 0195; # LATIN CAPITAL LETTER HWAIR +01F7; C; 01BF; # LATIN CAPITAL LETTER WYNN +01F8; C; 01F9; # LATIN CAPITAL LETTER N WITH GRAVE +01FA; C; 01FB; # LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE +01FC; C; 01FD; # LATIN CAPITAL LETTER AE WITH ACUTE +01FE; C; 01FF; # LATIN CAPITAL LETTER O 
WITH STROKE AND ACUTE +0200; C; 0201; # LATIN CAPITAL LETTER A WITH DOUBLE GRAVE +0202; C; 0203; # LATIN CAPITAL LETTER A WITH INVERTED BREVE +0204; C; 0205; # LATIN CAPITAL LETTER E WITH DOUBLE GRAVE +0206; C; 0207; # LATIN CAPITAL LETTER E WITH INVERTED BREVE +0208; C; 0209; # LATIN CAPITAL LETTER I WITH DOUBLE GRAVE +020A; C; 020B; # LATIN CAPITAL LETTER I WITH INVERTED BREVE +020C; C; 020D; # LATIN CAPITAL LETTER O WITH DOUBLE GRAVE +020E; C; 020F; # LATIN CAPITAL LETTER O WITH INVERTED BREVE +0210; C; 0211; # LATIN CAPITAL LETTER R WITH DOUBLE GRAVE +0212; C; 0213; # LATIN CAPITAL LETTER R WITH INVERTED BREVE +0214; C; 0215; # LATIN CAPITAL LETTER U WITH DOUBLE GRAVE +0216; C; 0217; # LATIN CAPITAL LETTER U WITH INVERTED BREVE +0218; C; 0219; # LATIN CAPITAL LETTER S WITH COMMA BELOW +021A; C; 021B; # LATIN CAPITAL LETTER T WITH COMMA BELOW +021C; C; 021D; # LATIN CAPITAL LETTER YOGH +021E; C; 021F; # LATIN CAPITAL LETTER H WITH CARON +0220; C; 019E; # LATIN CAPITAL LETTER N WITH LONG RIGHT LEG +0222; C; 0223; # LATIN CAPITAL LETTER OU +0224; C; 0225; # LATIN CAPITAL LETTER Z WITH HOOK +0226; C; 0227; # LATIN CAPITAL LETTER A WITH DOT ABOVE +0228; C; 0229; # LATIN CAPITAL LETTER E WITH CEDILLA +022A; C; 022B; # LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON +022C; C; 022D; # LATIN CAPITAL LETTER O WITH TILDE AND MACRON +022E; C; 022F; # LATIN CAPITAL LETTER O WITH DOT ABOVE +0230; C; 0231; # LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON +0232; C; 0233; # LATIN CAPITAL LETTER Y WITH MACRON +023A; C; 2C65; # LATIN CAPITAL LETTER A WITH STROKE +023B; C; 023C; # LATIN CAPITAL LETTER C WITH STROKE +023D; C; 019A; # LATIN CAPITAL LETTER L WITH BAR +023E; C; 2C66; # LATIN CAPITAL LETTER T WITH DIAGONAL STROKE +0241; C; 0242; # LATIN CAPITAL LETTER GLOTTAL STOP +0243; C; 0180; # LATIN CAPITAL LETTER B WITH STROKE +0244; C; 0289; # LATIN CAPITAL LETTER U BAR +0245; C; 028C; # LATIN CAPITAL LETTER TURNED V +0246; C; 0247; # LATIN CAPITAL LETTER E WITH STROKE +0248; 
C; 0249; # LATIN CAPITAL LETTER J WITH STROKE +024A; C; 024B; # LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL +024C; C; 024D; # LATIN CAPITAL LETTER R WITH STROKE +024E; C; 024F; # LATIN CAPITAL LETTER Y WITH STROKE +0345; C; 03B9; # COMBINING GREEK YPOGEGRAMMENI +0370; C; 0371; # GREEK CAPITAL LETTER HETA +0372; C; 0373; # GREEK CAPITAL LETTER ARCHAIC SAMPI +0376; C; 0377; # GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA +037F; C; 03F3; # GREEK CAPITAL LETTER YOT +0386; C; 03AC; # GREEK CAPITAL LETTER ALPHA WITH TONOS +0388; C; 03AD; # GREEK CAPITAL LETTER EPSILON WITH TONOS +0389; C; 03AE; # GREEK CAPITAL LETTER ETA WITH TONOS +038A; C; 03AF; # GREEK CAPITAL LETTER IOTA WITH TONOS +038C; C; 03CC; # GREEK CAPITAL LETTER OMICRON WITH TONOS +038E; C; 03CD; # GREEK CAPITAL LETTER UPSILON WITH TONOS +038F; C; 03CE; # GREEK CAPITAL LETTER OMEGA WITH TONOS +0390; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS +0391; C; 03B1; # GREEK CAPITAL LETTER ALPHA +0392; C; 03B2; # GREEK CAPITAL LETTER BETA +0393; C; 03B3; # GREEK CAPITAL LETTER GAMMA +0394; C; 03B4; # GREEK CAPITAL LETTER DELTA +0395; C; 03B5; # GREEK CAPITAL LETTER EPSILON +0396; C; 03B6; # GREEK CAPITAL LETTER ZETA +0397; C; 03B7; # GREEK CAPITAL LETTER ETA +0398; C; 03B8; # GREEK CAPITAL LETTER THETA +0399; C; 03B9; # GREEK CAPITAL LETTER IOTA +039A; C; 03BA; # GREEK CAPITAL LETTER KAPPA +039B; C; 03BB; # GREEK CAPITAL LETTER LAMDA +039C; C; 03BC; # GREEK CAPITAL LETTER MU +039D; C; 03BD; # GREEK CAPITAL LETTER NU +039E; C; 03BE; # GREEK CAPITAL LETTER XI +039F; C; 03BF; # GREEK CAPITAL LETTER OMICRON +03A0; C; 03C0; # GREEK CAPITAL LETTER PI +03A1; C; 03C1; # GREEK CAPITAL LETTER RHO +03A3; C; 03C3; # GREEK CAPITAL LETTER SIGMA +03A4; C; 03C4; # GREEK CAPITAL LETTER TAU +03A5; C; 03C5; # GREEK CAPITAL LETTER UPSILON +03A6; C; 03C6; # GREEK CAPITAL LETTER PHI +03A7; C; 03C7; # GREEK CAPITAL LETTER CHI +03A8; C; 03C8; # GREEK CAPITAL LETTER PSI +03A9; C; 03C9; # GREEK CAPITAL LETTER OMEGA +03AA; 
C; 03CA; # GREEK CAPITAL LETTER IOTA WITH DIALYTIKA +03AB; C; 03CB; # GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA +03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS +03C2; C; 03C3; # GREEK SMALL LETTER FINAL SIGMA +03CF; C; 03D7; # GREEK CAPITAL KAI SYMBOL +03D0; C; 03B2; # GREEK BETA SYMBOL +03D1; C; 03B8; # GREEK THETA SYMBOL +03D5; C; 03C6; # GREEK PHI SYMBOL +03D6; C; 03C0; # GREEK PI SYMBOL +03D8; C; 03D9; # GREEK LETTER ARCHAIC KOPPA +03DA; C; 03DB; # GREEK LETTER STIGMA +03DC; C; 03DD; # GREEK LETTER DIGAMMA +03DE; C; 03DF; # GREEK LETTER KOPPA +03E0; C; 03E1; # GREEK LETTER SAMPI +03E2; C; 03E3; # COPTIC CAPITAL LETTER SHEI +03E4; C; 03E5; # COPTIC CAPITAL LETTER FEI +03E6; C; 03E7; # COPTIC CAPITAL LETTER KHEI +03E8; C; 03E9; # COPTIC CAPITAL LETTER HORI +03EA; C; 03EB; # COPTIC CAPITAL LETTER GANGIA +03EC; C; 03ED; # COPTIC CAPITAL LETTER SHIMA +03EE; C; 03EF; # COPTIC CAPITAL LETTER DEI +03F0; C; 03BA; # GREEK KAPPA SYMBOL +03F1; C; 03C1; # GREEK RHO SYMBOL +03F4; C; 03B8; # GREEK CAPITAL THETA SYMBOL +03F5; C; 03B5; # GREEK LUNATE EPSILON SYMBOL +03F7; C; 03F8; # GREEK CAPITAL LETTER SHO +03F9; C; 03F2; # GREEK CAPITAL LUNATE SIGMA SYMBOL +03FA; C; 03FB; # GREEK CAPITAL LETTER SAN +03FD; C; 037B; # GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL +03FE; C; 037C; # GREEK CAPITAL DOTTED LUNATE SIGMA SYMBOL +03FF; C; 037D; # GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL +0400; C; 0450; # CYRILLIC CAPITAL LETTER IE WITH GRAVE +0401; C; 0451; # CYRILLIC CAPITAL LETTER IO +0402; C; 0452; # CYRILLIC CAPITAL LETTER DJE +0403; C; 0453; # CYRILLIC CAPITAL LETTER GJE +0404; C; 0454; # CYRILLIC CAPITAL LETTER UKRAINIAN IE +0405; C; 0455; # CYRILLIC CAPITAL LETTER DZE +0406; C; 0456; # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I +0407; C; 0457; # CYRILLIC CAPITAL LETTER YI +0408; C; 0458; # CYRILLIC CAPITAL LETTER JE +0409; C; 0459; # CYRILLIC CAPITAL LETTER LJE +040A; C; 045A; # CYRILLIC CAPITAL LETTER NJE +040B; C; 045B; # CYRILLIC 
CAPITAL LETTER TSHE +040C; C; 045C; # CYRILLIC CAPITAL LETTER KJE +040D; C; 045D; # CYRILLIC CAPITAL LETTER I WITH GRAVE +040E; C; 045E; # CYRILLIC CAPITAL LETTER SHORT U +040F; C; 045F; # CYRILLIC CAPITAL LETTER DZHE +0410; C; 0430; # CYRILLIC CAPITAL LETTER A +0411; C; 0431; # CYRILLIC CAPITAL LETTER BE +0412; C; 0432; # CYRILLIC CAPITAL LETTER VE +0413; C; 0433; # CYRILLIC CAPITAL LETTER GHE +0414; C; 0434; # CYRILLIC CAPITAL LETTER DE +0415; C; 0435; # CYRILLIC CAPITAL LETTER IE +0416; C; 0436; # CYRILLIC CAPITAL LETTER ZHE +0417; C; 0437; # CYRILLIC CAPITAL LETTER ZE +0418; C; 0438; # CYRILLIC CAPITAL LETTER I +0419; C; 0439; # CYRILLIC CAPITAL LETTER SHORT I +041A; C; 043A; # CYRILLIC CAPITAL LETTER KA +041B; C; 043B; # CYRILLIC CAPITAL LETTER EL +041C; C; 043C; # CYRILLIC CAPITAL LETTER EM +041D; C; 043D; # CYRILLIC CAPITAL LETTER EN +041E; C; 043E; # CYRILLIC CAPITAL LETTER O +041F; C; 043F; # CYRILLIC CAPITAL LETTER PE +0420; C; 0440; # CYRILLIC CAPITAL LETTER ER +0421; C; 0441; # CYRILLIC CAPITAL LETTER ES +0422; C; 0442; # CYRILLIC CAPITAL LETTER TE +0423; C; 0443; # CYRILLIC CAPITAL LETTER U +0424; C; 0444; # CYRILLIC CAPITAL LETTER EF +0425; C; 0445; # CYRILLIC CAPITAL LETTER HA +0426; C; 0446; # CYRILLIC CAPITAL LETTER TSE +0427; C; 0447; # CYRILLIC CAPITAL LETTER CHE +0428; C; 0448; # CYRILLIC CAPITAL LETTER SHA +0429; C; 0449; # CYRILLIC CAPITAL LETTER SHCHA +042A; C; 044A; # CYRILLIC CAPITAL LETTER HARD SIGN +042B; C; 044B; # CYRILLIC CAPITAL LETTER YERU +042C; C; 044C; # CYRILLIC CAPITAL LETTER SOFT SIGN +042D; C; 044D; # CYRILLIC CAPITAL LETTER E +042E; C; 044E; # CYRILLIC CAPITAL LETTER YU +042F; C; 044F; # CYRILLIC CAPITAL LETTER YA +0460; C; 0461; # CYRILLIC CAPITAL LETTER OMEGA +0462; C; 0463; # CYRILLIC CAPITAL LETTER YAT +0464; C; 0465; # CYRILLIC CAPITAL LETTER IOTIFIED E +0466; C; 0467; # CYRILLIC CAPITAL LETTER LITTLE YUS +0468; C; 0469; # CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS +046A; C; 046B; # CYRILLIC CAPITAL LETTER BIG YUS 
+046C; C; 046D; # CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS +046E; C; 046F; # CYRILLIC CAPITAL LETTER KSI +0470; C; 0471; # CYRILLIC CAPITAL LETTER PSI +0472; C; 0473; # CYRILLIC CAPITAL LETTER FITA +0474; C; 0475; # CYRILLIC CAPITAL LETTER IZHITSA +0476; C; 0477; # CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT +0478; C; 0479; # CYRILLIC CAPITAL LETTER UK +047A; C; 047B; # CYRILLIC CAPITAL LETTER ROUND OMEGA +047C; C; 047D; # CYRILLIC CAPITAL LETTER OMEGA WITH TITLO +047E; C; 047F; # CYRILLIC CAPITAL LETTER OT +0480; C; 0481; # CYRILLIC CAPITAL LETTER KOPPA +048A; C; 048B; # CYRILLIC CAPITAL LETTER SHORT I WITH TAIL +048C; C; 048D; # CYRILLIC CAPITAL LETTER SEMISOFT SIGN +048E; C; 048F; # CYRILLIC CAPITAL LETTER ER WITH TICK +0490; C; 0491; # CYRILLIC CAPITAL LETTER GHE WITH UPTURN +0492; C; 0493; # CYRILLIC CAPITAL LETTER GHE WITH STROKE +0494; C; 0495; # CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK +0496; C; 0497; # CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER +0498; C; 0499; # CYRILLIC CAPITAL LETTER ZE WITH DESCENDER +049A; C; 049B; # CYRILLIC CAPITAL LETTER KA WITH DESCENDER +049C; C; 049D; # CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE +049E; C; 049F; # CYRILLIC CAPITAL LETTER KA WITH STROKE +04A0; C; 04A1; # CYRILLIC CAPITAL LETTER BASHKIR KA +04A2; C; 04A3; # CYRILLIC CAPITAL LETTER EN WITH DESCENDER +04A4; C; 04A5; # CYRILLIC CAPITAL LIGATURE EN GHE +04A6; C; 04A7; # CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK +04A8; C; 04A9; # CYRILLIC CAPITAL LETTER ABKHASIAN HA +04AA; C; 04AB; # CYRILLIC CAPITAL LETTER ES WITH DESCENDER +04AC; C; 04AD; # CYRILLIC CAPITAL LETTER TE WITH DESCENDER +04AE; C; 04AF; # CYRILLIC CAPITAL LETTER STRAIGHT U +04B0; C; 04B1; # CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE +04B2; C; 04B3; # CYRILLIC CAPITAL LETTER HA WITH DESCENDER +04B4; C; 04B5; # CYRILLIC CAPITAL LIGATURE TE TSE +04B6; C; 04B7; # CYRILLIC CAPITAL LETTER CHE WITH DESCENDER +04B8; C; 04B9; # CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE +04BA; C; 
04BB; # CYRILLIC CAPITAL LETTER SHHA +04BC; C; 04BD; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE +04BE; C; 04BF; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER +04C0; C; 04CF; # CYRILLIC LETTER PALOCHKA +04C1; C; 04C2; # CYRILLIC CAPITAL LETTER ZHE WITH BREVE +04C3; C; 04C4; # CYRILLIC CAPITAL LETTER KA WITH HOOK +04C5; C; 04C6; # CYRILLIC CAPITAL LETTER EL WITH TAIL +04C7; C; 04C8; # CYRILLIC CAPITAL LETTER EN WITH HOOK +04C9; C; 04CA; # CYRILLIC CAPITAL LETTER EN WITH TAIL +04CB; C; 04CC; # CYRILLIC CAPITAL LETTER KHAKASSIAN CHE +04CD; C; 04CE; # CYRILLIC CAPITAL LETTER EM WITH TAIL +04D0; C; 04D1; # CYRILLIC CAPITAL LETTER A WITH BREVE +04D2; C; 04D3; # CYRILLIC CAPITAL LETTER A WITH DIAERESIS +04D4; C; 04D5; # CYRILLIC CAPITAL LIGATURE A IE +04D6; C; 04D7; # CYRILLIC CAPITAL LETTER IE WITH BREVE +04D8; C; 04D9; # CYRILLIC CAPITAL LETTER SCHWA +04DA; C; 04DB; # CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS +04DC; C; 04DD; # CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS +04DE; C; 04DF; # CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS +04E0; C; 04E1; # CYRILLIC CAPITAL LETTER ABKHASIAN DZE +04E2; C; 04E3; # CYRILLIC CAPITAL LETTER I WITH MACRON +04E4; C; 04E5; # CYRILLIC CAPITAL LETTER I WITH DIAERESIS +04E6; C; 04E7; # CYRILLIC CAPITAL LETTER O WITH DIAERESIS +04E8; C; 04E9; # CYRILLIC CAPITAL LETTER BARRED O +04EA; C; 04EB; # CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS +04EC; C; 04ED; # CYRILLIC CAPITAL LETTER E WITH DIAERESIS +04EE; C; 04EF; # CYRILLIC CAPITAL LETTER U WITH MACRON +04F0; C; 04F1; # CYRILLIC CAPITAL LETTER U WITH DIAERESIS +04F2; C; 04F3; # CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE +04F4; C; 04F5; # CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS +04F6; C; 04F7; # CYRILLIC CAPITAL LETTER GHE WITH DESCENDER +04F8; C; 04F9; # CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS +04FA; C; 04FB; # CYRILLIC CAPITAL LETTER GHE WITH STROKE AND HOOK +04FC; C; 04FD; # CYRILLIC CAPITAL LETTER HA WITH HOOK +04FE; C; 04FF; # CYRILLIC CAPITAL LETTER HA WITH STROKE +0500; 
C; 0501; # CYRILLIC CAPITAL LETTER KOMI DE +0502; C; 0503; # CYRILLIC CAPITAL LETTER KOMI DJE +0504; C; 0505; # CYRILLIC CAPITAL LETTER KOMI ZJE +0506; C; 0507; # CYRILLIC CAPITAL LETTER KOMI DZJE +0508; C; 0509; # CYRILLIC CAPITAL LETTER KOMI LJE +050A; C; 050B; # CYRILLIC CAPITAL LETTER KOMI NJE +050C; C; 050D; # CYRILLIC CAPITAL LETTER KOMI SJE +050E; C; 050F; # CYRILLIC CAPITAL LETTER KOMI TJE +0510; C; 0511; # CYRILLIC CAPITAL LETTER REVERSED ZE +0512; C; 0513; # CYRILLIC CAPITAL LETTER EL WITH HOOK +0514; C; 0515; # CYRILLIC CAPITAL LETTER LHA +0516; C; 0517; # CYRILLIC CAPITAL LETTER RHA +0518; C; 0519; # CYRILLIC CAPITAL LETTER YAE +051A; C; 051B; # CYRILLIC CAPITAL LETTER QA +051C; C; 051D; # CYRILLIC CAPITAL LETTER WE +051E; C; 051F; # CYRILLIC CAPITAL LETTER ALEUT KA +0520; C; 0521; # CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK +0522; C; 0523; # CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK +0524; C; 0525; # CYRILLIC CAPITAL LETTER PE WITH DESCENDER +0526; C; 0527; # CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER +0528; C; 0529; # CYRILLIC CAPITAL LETTER EN WITH LEFT HOOK +052A; C; 052B; # CYRILLIC CAPITAL LETTER DZZHE +052C; C; 052D; # CYRILLIC CAPITAL LETTER DCHE +052E; C; 052F; # CYRILLIC CAPITAL LETTER EL WITH DESCENDER +0531; C; 0561; # ARMENIAN CAPITAL LETTER AYB +0532; C; 0562; # ARMENIAN CAPITAL LETTER BEN +0533; C; 0563; # ARMENIAN CAPITAL LETTER GIM +0534; C; 0564; # ARMENIAN CAPITAL LETTER DA +0535; C; 0565; # ARMENIAN CAPITAL LETTER ECH +0536; C; 0566; # ARMENIAN CAPITAL LETTER ZA +0537; C; 0567; # ARMENIAN CAPITAL LETTER EH +0538; C; 0568; # ARMENIAN CAPITAL LETTER ET +0539; C; 0569; # ARMENIAN CAPITAL LETTER TO +053A; C; 056A; # ARMENIAN CAPITAL LETTER ZHE +053B; C; 056B; # ARMENIAN CAPITAL LETTER INI +053C; C; 056C; # ARMENIAN CAPITAL LETTER LIWN +053D; C; 056D; # ARMENIAN CAPITAL LETTER XEH +053E; C; 056E; # ARMENIAN CAPITAL LETTER CA +053F; C; 056F; # ARMENIAN CAPITAL LETTER KEN +0540; C; 0570; # ARMENIAN CAPITAL LETTER HO +0541; C; 0571; # 
ARMENIAN CAPITAL LETTER JA +0542; C; 0572; # ARMENIAN CAPITAL LETTER GHAD +0543; C; 0573; # ARMENIAN CAPITAL LETTER CHEH +0544; C; 0574; # ARMENIAN CAPITAL LETTER MEN +0545; C; 0575; # ARMENIAN CAPITAL LETTER YI +0546; C; 0576; # ARMENIAN CAPITAL LETTER NOW +0547; C; 0577; # ARMENIAN CAPITAL LETTER SHA +0548; C; 0578; # ARMENIAN CAPITAL LETTER VO +0549; C; 0579; # ARMENIAN CAPITAL LETTER CHA +054A; C; 057A; # ARMENIAN CAPITAL LETTER PEH +054B; C; 057B; # ARMENIAN CAPITAL LETTER JHEH +054C; C; 057C; # ARMENIAN CAPITAL LETTER RA +054D; C; 057D; # ARMENIAN CAPITAL LETTER SEH +054E; C; 057E; # ARMENIAN CAPITAL LETTER VEW +054F; C; 057F; # ARMENIAN CAPITAL LETTER TIWN +0550; C; 0580; # ARMENIAN CAPITAL LETTER REH +0551; C; 0581; # ARMENIAN CAPITAL LETTER CO +0552; C; 0582; # ARMENIAN CAPITAL LETTER YIWN +0553; C; 0583; # ARMENIAN CAPITAL LETTER PIWR +0554; C; 0584; # ARMENIAN CAPITAL LETTER KEH +0555; C; 0585; # ARMENIAN CAPITAL LETTER OH +0556; C; 0586; # ARMENIAN CAPITAL LETTER FEH +0587; F; 0565 0582; # ARMENIAN SMALL LIGATURE ECH YIWN +10A0; C; 2D00; # GEORGIAN CAPITAL LETTER AN +10A1; C; 2D01; # GEORGIAN CAPITAL LETTER BAN +10A2; C; 2D02; # GEORGIAN CAPITAL LETTER GAN +10A3; C; 2D03; # GEORGIAN CAPITAL LETTER DON +10A4; C; 2D04; # GEORGIAN CAPITAL LETTER EN +10A5; C; 2D05; # GEORGIAN CAPITAL LETTER VIN +10A6; C; 2D06; # GEORGIAN CAPITAL LETTER ZEN +10A7; C; 2D07; # GEORGIAN CAPITAL LETTER TAN +10A8; C; 2D08; # GEORGIAN CAPITAL LETTER IN +10A9; C; 2D09; # GEORGIAN CAPITAL LETTER KAN +10AA; C; 2D0A; # GEORGIAN CAPITAL LETTER LAS +10AB; C; 2D0B; # GEORGIAN CAPITAL LETTER MAN +10AC; C; 2D0C; # GEORGIAN CAPITAL LETTER NAR +10AD; C; 2D0D; # GEORGIAN CAPITAL LETTER ON +10AE; C; 2D0E; # GEORGIAN CAPITAL LETTER PAR +10AF; C; 2D0F; # GEORGIAN CAPITAL LETTER ZHAR +10B0; C; 2D10; # GEORGIAN CAPITAL LETTER RAE +10B1; C; 2D11; # GEORGIAN CAPITAL LETTER SAN +10B2; C; 2D12; # GEORGIAN CAPITAL LETTER TAR +10B3; C; 2D13; # GEORGIAN CAPITAL LETTER UN +10B4; C; 2D14; # GEORGIAN 
CAPITAL LETTER PHAR +10B5; C; 2D15; # GEORGIAN CAPITAL LETTER KHAR +10B6; C; 2D16; # GEORGIAN CAPITAL LETTER GHAN +10B7; C; 2D17; # GEORGIAN CAPITAL LETTER QAR +10B8; C; 2D18; # GEORGIAN CAPITAL LETTER SHIN +10B9; C; 2D19; # GEORGIAN CAPITAL LETTER CHIN +10BA; C; 2D1A; # GEORGIAN CAPITAL LETTER CAN +10BB; C; 2D1B; # GEORGIAN CAPITAL LETTER JIL +10BC; C; 2D1C; # GEORGIAN CAPITAL LETTER CIL +10BD; C; 2D1D; # GEORGIAN CAPITAL LETTER CHAR +10BE; C; 2D1E; # GEORGIAN CAPITAL LETTER XAN +10BF; C; 2D1F; # GEORGIAN CAPITAL LETTER JHAN +10C0; C; 2D20; # GEORGIAN CAPITAL LETTER HAE +10C1; C; 2D21; # GEORGIAN CAPITAL LETTER HE +10C2; C; 2D22; # GEORGIAN CAPITAL LETTER HIE +10C3; C; 2D23; # GEORGIAN CAPITAL LETTER WE +10C4; C; 2D24; # GEORGIAN CAPITAL LETTER HAR +10C5; C; 2D25; # GEORGIAN CAPITAL LETTER HOE +10C7; C; 2D27; # GEORGIAN CAPITAL LETTER YN +10CD; C; 2D2D; # GEORGIAN CAPITAL LETTER AEN +13F8; C; 13F0; # CHEROKEE SMALL LETTER YE +13F9; C; 13F1; # CHEROKEE SMALL LETTER YI +13FA; C; 13F2; # CHEROKEE SMALL LETTER YO +13FB; C; 13F3; # CHEROKEE SMALL LETTER YU +13FC; C; 13F4; # CHEROKEE SMALL LETTER YV +13FD; C; 13F5; # CHEROKEE SMALL LETTER MV +1C80; C; 0432; # CYRILLIC SMALL LETTER ROUNDED VE +1C81; C; 0434; # CYRILLIC SMALL LETTER LONG-LEGGED DE +1C82; C; 043E; # CYRILLIC SMALL LETTER NARROW O +1C83; C; 0441; # CYRILLIC SMALL LETTER WIDE ES +1C84; C; 0442; # CYRILLIC SMALL LETTER TALL TE +1C85; C; 0442; # CYRILLIC SMALL LETTER THREE-LEGGED TE +1C86; C; 044A; # CYRILLIC SMALL LETTER TALL HARD SIGN +1C87; C; 0463; # CYRILLIC SMALL LETTER TALL YAT +1C88; C; A64B; # CYRILLIC SMALL LETTER UNBLENDED UK +1C90; C; 10D0; # GEORGIAN MTAVRULI CAPITAL LETTER AN +1C91; C; 10D1; # GEORGIAN MTAVRULI CAPITAL LETTER BAN +1C92; C; 10D2; # GEORGIAN MTAVRULI CAPITAL LETTER GAN +1C93; C; 10D3; # GEORGIAN MTAVRULI CAPITAL LETTER DON +1C94; C; 10D4; # GEORGIAN MTAVRULI CAPITAL LETTER EN +1C95; C; 10D5; # GEORGIAN MTAVRULI CAPITAL LETTER VIN +1C96; C; 10D6; # GEORGIAN MTAVRULI CAPITAL LETTER 
ZEN +1C97; C; 10D7; # GEORGIAN MTAVRULI CAPITAL LETTER TAN +1C98; C; 10D8; # GEORGIAN MTAVRULI CAPITAL LETTER IN +1C99; C; 10D9; # GEORGIAN MTAVRULI CAPITAL LETTER KAN +1C9A; C; 10DA; # GEORGIAN MTAVRULI CAPITAL LETTER LAS +1C9B; C; 10DB; # GEORGIAN MTAVRULI CAPITAL LETTER MAN +1C9C; C; 10DC; # GEORGIAN MTAVRULI CAPITAL LETTER NAR +1C9D; C; 10DD; # GEORGIAN MTAVRULI CAPITAL LETTER ON +1C9E; C; 10DE; # GEORGIAN MTAVRULI CAPITAL LETTER PAR +1C9F; C; 10DF; # GEORGIAN MTAVRULI CAPITAL LETTER ZHAR +1CA0; C; 10E0; # GEORGIAN MTAVRULI CAPITAL LETTER RAE +1CA1; C; 10E1; # GEORGIAN MTAVRULI CAPITAL LETTER SAN +1CA2; C; 10E2; # GEORGIAN MTAVRULI CAPITAL LETTER TAR +1CA3; C; 10E3; # GEORGIAN MTAVRULI CAPITAL LETTER UN +1CA4; C; 10E4; # GEORGIAN MTAVRULI CAPITAL LETTER PHAR +1CA5; C; 10E5; # GEORGIAN MTAVRULI CAPITAL LETTER KHAR +1CA6; C; 10E6; # GEORGIAN MTAVRULI CAPITAL LETTER GHAN +1CA7; C; 10E7; # GEORGIAN MTAVRULI CAPITAL LETTER QAR +1CA8; C; 10E8; # GEORGIAN MTAVRULI CAPITAL LETTER SHIN +1CA9; C; 10E9; # GEORGIAN MTAVRULI CAPITAL LETTER CHIN +1CAA; C; 10EA; # GEORGIAN MTAVRULI CAPITAL LETTER CAN +1CAB; C; 10EB; # GEORGIAN MTAVRULI CAPITAL LETTER JIL +1CAC; C; 10EC; # GEORGIAN MTAVRULI CAPITAL LETTER CIL +1CAD; C; 10ED; # GEORGIAN MTAVRULI CAPITAL LETTER CHAR +1CAE; C; 10EE; # GEORGIAN MTAVRULI CAPITAL LETTER XAN +1CAF; C; 10EF; # GEORGIAN MTAVRULI CAPITAL LETTER JHAN +1CB0; C; 10F0; # GEORGIAN MTAVRULI CAPITAL LETTER HAE +1CB1; C; 10F1; # GEORGIAN MTAVRULI CAPITAL LETTER HE +1CB2; C; 10F2; # GEORGIAN MTAVRULI CAPITAL LETTER HIE +1CB3; C; 10F3; # GEORGIAN MTAVRULI CAPITAL LETTER WE +1CB4; C; 10F4; # GEORGIAN MTAVRULI CAPITAL LETTER HAR +1CB5; C; 10F5; # GEORGIAN MTAVRULI CAPITAL LETTER HOE +1CB6; C; 10F6; # GEORGIAN MTAVRULI CAPITAL LETTER FI +1CB7; C; 10F7; # GEORGIAN MTAVRULI CAPITAL LETTER YN +1CB8; C; 10F8; # GEORGIAN MTAVRULI CAPITAL LETTER ELIFI +1CB9; C; 10F9; # GEORGIAN MTAVRULI CAPITAL LETTER TURNED GAN +1CBA; C; 10FA; # GEORGIAN MTAVRULI CAPITAL LETTER AIN 
+1CBD; C; 10FD; # GEORGIAN MTAVRULI CAPITAL LETTER AEN +1CBE; C; 10FE; # GEORGIAN MTAVRULI CAPITAL LETTER HARD SIGN +1CBF; C; 10FF; # GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN +1E00; C; 1E01; # LATIN CAPITAL LETTER A WITH RING BELOW +1E02; C; 1E03; # LATIN CAPITAL LETTER B WITH DOT ABOVE +1E04; C; 1E05; # LATIN CAPITAL LETTER B WITH DOT BELOW +1E06; C; 1E07; # LATIN CAPITAL LETTER B WITH LINE BELOW +1E08; C; 1E09; # LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE +1E0A; C; 1E0B; # LATIN CAPITAL LETTER D WITH DOT ABOVE +1E0C; C; 1E0D; # LATIN CAPITAL LETTER D WITH DOT BELOW +1E0E; C; 1E0F; # LATIN CAPITAL LETTER D WITH LINE BELOW +1E10; C; 1E11; # LATIN CAPITAL LETTER D WITH CEDILLA +1E12; C; 1E13; # LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW +1E14; C; 1E15; # LATIN CAPITAL LETTER E WITH MACRON AND GRAVE +1E16; C; 1E17; # LATIN CAPITAL LETTER E WITH MACRON AND ACUTE +1E18; C; 1E19; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW +1E1A; C; 1E1B; # LATIN CAPITAL LETTER E WITH TILDE BELOW +1E1C; C; 1E1D; # LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE +1E1E; C; 1E1F; # LATIN CAPITAL LETTER F WITH DOT ABOVE +1E20; C; 1E21; # LATIN CAPITAL LETTER G WITH MACRON +1E22; C; 1E23; # LATIN CAPITAL LETTER H WITH DOT ABOVE +1E24; C; 1E25; # LATIN CAPITAL LETTER H WITH DOT BELOW +1E26; C; 1E27; # LATIN CAPITAL LETTER H WITH DIAERESIS +1E28; C; 1E29; # LATIN CAPITAL LETTER H WITH CEDILLA +1E2A; C; 1E2B; # LATIN CAPITAL LETTER H WITH BREVE BELOW +1E2C; C; 1E2D; # LATIN CAPITAL LETTER I WITH TILDE BELOW +1E2E; C; 1E2F; # LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE +1E30; C; 1E31; # LATIN CAPITAL LETTER K WITH ACUTE +1E32; C; 1E33; # LATIN CAPITAL LETTER K WITH DOT BELOW +1E34; C; 1E35; # LATIN CAPITAL LETTER K WITH LINE BELOW +1E36; C; 1E37; # LATIN CAPITAL LETTER L WITH DOT BELOW +1E38; C; 1E39; # LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON +1E3A; C; 1E3B; # LATIN CAPITAL LETTER L WITH LINE BELOW +1E3C; C; 1E3D; # LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW +1E3E; C; 
1E3F; # LATIN CAPITAL LETTER M WITH ACUTE +1E40; C; 1E41; # LATIN CAPITAL LETTER M WITH DOT ABOVE +1E42; C; 1E43; # LATIN CAPITAL LETTER M WITH DOT BELOW +1E44; C; 1E45; # LATIN CAPITAL LETTER N WITH DOT ABOVE +1E46; C; 1E47; # LATIN CAPITAL LETTER N WITH DOT BELOW +1E48; C; 1E49; # LATIN CAPITAL LETTER N WITH LINE BELOW +1E4A; C; 1E4B; # LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW +1E4C; C; 1E4D; # LATIN CAPITAL LETTER O WITH TILDE AND ACUTE +1E4E; C; 1E4F; # LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS +1E50; C; 1E51; # LATIN CAPITAL LETTER O WITH MACRON AND GRAVE +1E52; C; 1E53; # LATIN CAPITAL LETTER O WITH MACRON AND ACUTE +1E54; C; 1E55; # LATIN CAPITAL LETTER P WITH ACUTE +1E56; C; 1E57; # LATIN CAPITAL LETTER P WITH DOT ABOVE +1E58; C; 1E59; # LATIN CAPITAL LETTER R WITH DOT ABOVE +1E5A; C; 1E5B; # LATIN CAPITAL LETTER R WITH DOT BELOW +1E5C; C; 1E5D; # LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON +1E5E; C; 1E5F; # LATIN CAPITAL LETTER R WITH LINE BELOW +1E60; C; 1E61; # LATIN CAPITAL LETTER S WITH DOT ABOVE +1E62; C; 1E63; # LATIN CAPITAL LETTER S WITH DOT BELOW +1E64; C; 1E65; # LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE +1E66; C; 1E67; # LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE +1E68; C; 1E69; # LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE +1E6A; C; 1E6B; # LATIN CAPITAL LETTER T WITH DOT ABOVE +1E6C; C; 1E6D; # LATIN CAPITAL LETTER T WITH DOT BELOW +1E6E; C; 1E6F; # LATIN CAPITAL LETTER T WITH LINE BELOW +1E70; C; 1E71; # LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW +1E72; C; 1E73; # LATIN CAPITAL LETTER U WITH DIAERESIS BELOW +1E74; C; 1E75; # LATIN CAPITAL LETTER U WITH TILDE BELOW +1E76; C; 1E77; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW +1E78; C; 1E79; # LATIN CAPITAL LETTER U WITH TILDE AND ACUTE +1E7A; C; 1E7B; # LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS +1E7C; C; 1E7D; # LATIN CAPITAL LETTER V WITH TILDE +1E7E; C; 1E7F; # LATIN CAPITAL LETTER V WITH DOT BELOW +1E80; C; 1E81; # LATIN CAPITAL LETTER W WITH 
GRAVE +1E82; C; 1E83; # LATIN CAPITAL LETTER W WITH ACUTE +1E84; C; 1E85; # LATIN CAPITAL LETTER W WITH DIAERESIS +1E86; C; 1E87; # LATIN CAPITAL LETTER W WITH DOT ABOVE +1E88; C; 1E89; # LATIN CAPITAL LETTER W WITH DOT BELOW +1E8A; C; 1E8B; # LATIN CAPITAL LETTER X WITH DOT ABOVE +1E8C; C; 1E8D; # LATIN CAPITAL LETTER X WITH DIAERESIS +1E8E; C; 1E8F; # LATIN CAPITAL LETTER Y WITH DOT ABOVE +1E90; C; 1E91; # LATIN CAPITAL LETTER Z WITH CIRCUMFLEX +1E92; C; 1E93; # LATIN CAPITAL LETTER Z WITH DOT BELOW +1E94; C; 1E95; # LATIN CAPITAL LETTER Z WITH LINE BELOW +1E96; F; 0068 0331; # LATIN SMALL LETTER H WITH LINE BELOW +1E97; F; 0074 0308; # LATIN SMALL LETTER T WITH DIAERESIS +1E98; F; 0077 030A; # LATIN SMALL LETTER W WITH RING ABOVE +1E99; F; 0079 030A; # LATIN SMALL LETTER Y WITH RING ABOVE +1E9A; F; 0061 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING +1E9B; C; 1E61; # LATIN SMALL LETTER LONG S WITH DOT ABOVE +1E9E; F; 0073 0073; # LATIN CAPITAL LETTER SHARP S +1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S +1EA0; C; 1EA1; # LATIN CAPITAL LETTER A WITH DOT BELOW +1EA2; C; 1EA3; # LATIN CAPITAL LETTER A WITH HOOK ABOVE +1EA4; C; 1EA5; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE +1EA6; C; 1EA7; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE +1EA8; C; 1EA9; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE +1EAA; C; 1EAB; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE +1EAC; C; 1EAD; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW +1EAE; C; 1EAF; # LATIN CAPITAL LETTER A WITH BREVE AND ACUTE +1EB0; C; 1EB1; # LATIN CAPITAL LETTER A WITH BREVE AND GRAVE +1EB2; C; 1EB3; # LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE +1EB4; C; 1EB5; # LATIN CAPITAL LETTER A WITH BREVE AND TILDE +1EB6; C; 1EB7; # LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW +1EB8; C; 1EB9; # LATIN CAPITAL LETTER E WITH DOT BELOW +1EBA; C; 1EBB; # LATIN CAPITAL LETTER E WITH HOOK ABOVE +1EBC; C; 1EBD; # LATIN CAPITAL LETTER E WITH TILDE +1EBE; C; 1EBF; # LATIN CAPITAL 
LETTER E WITH CIRCUMFLEX AND ACUTE +1EC0; C; 1EC1; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE +1EC2; C; 1EC3; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE +1EC4; C; 1EC5; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE +1EC6; C; 1EC7; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW +1EC8; C; 1EC9; # LATIN CAPITAL LETTER I WITH HOOK ABOVE +1ECA; C; 1ECB; # LATIN CAPITAL LETTER I WITH DOT BELOW +1ECC; C; 1ECD; # LATIN CAPITAL LETTER O WITH DOT BELOW +1ECE; C; 1ECF; # LATIN CAPITAL LETTER O WITH HOOK ABOVE +1ED0; C; 1ED1; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE +1ED2; C; 1ED3; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE +1ED4; C; 1ED5; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE +1ED6; C; 1ED7; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE +1ED8; C; 1ED9; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW +1EDA; C; 1EDB; # LATIN CAPITAL LETTER O WITH HORN AND ACUTE +1EDC; C; 1EDD; # LATIN CAPITAL LETTER O WITH HORN AND GRAVE +1EDE; C; 1EDF; # LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE +1EE0; C; 1EE1; # LATIN CAPITAL LETTER O WITH HORN AND TILDE +1EE2; C; 1EE3; # LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW +1EE4; C; 1EE5; # LATIN CAPITAL LETTER U WITH DOT BELOW +1EE6; C; 1EE7; # LATIN CAPITAL LETTER U WITH HOOK ABOVE +1EE8; C; 1EE9; # LATIN CAPITAL LETTER U WITH HORN AND ACUTE +1EEA; C; 1EEB; # LATIN CAPITAL LETTER U WITH HORN AND GRAVE +1EEC; C; 1EED; # LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE +1EEE; C; 1EEF; # LATIN CAPITAL LETTER U WITH HORN AND TILDE +1EF0; C; 1EF1; # LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW +1EF2; C; 1EF3; # LATIN CAPITAL LETTER Y WITH GRAVE +1EF4; C; 1EF5; # LATIN CAPITAL LETTER Y WITH DOT BELOW +1EF6; C; 1EF7; # LATIN CAPITAL LETTER Y WITH HOOK ABOVE +1EF8; C; 1EF9; # LATIN CAPITAL LETTER Y WITH TILDE +1EFA; C; 1EFB; # LATIN CAPITAL LETTER MIDDLE-WELSH LL +1EFC; C; 1EFD; # LATIN CAPITAL LETTER MIDDLE-WELSH V +1EFE; C; 1EFF; # LATIN CAPITAL LETTER Y WITH LOOP 
+1F08; C; 1F00; # GREEK CAPITAL LETTER ALPHA WITH PSILI +1F09; C; 1F01; # GREEK CAPITAL LETTER ALPHA WITH DASIA +1F0A; C; 1F02; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA +1F0B; C; 1F03; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA +1F0C; C; 1F04; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA +1F0D; C; 1F05; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA +1F0E; C; 1F06; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI +1F0F; C; 1F07; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI +1F18; C; 1F10; # GREEK CAPITAL LETTER EPSILON WITH PSILI +1F19; C; 1F11; # GREEK CAPITAL LETTER EPSILON WITH DASIA +1F1A; C; 1F12; # GREEK CAPITAL LETTER EPSILON WITH PSILI AND VARIA +1F1B; C; 1F13; # GREEK CAPITAL LETTER EPSILON WITH DASIA AND VARIA +1F1C; C; 1F14; # GREEK CAPITAL LETTER EPSILON WITH PSILI AND OXIA +1F1D; C; 1F15; # GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA +1F28; C; 1F20; # GREEK CAPITAL LETTER ETA WITH PSILI +1F29; C; 1F21; # GREEK CAPITAL LETTER ETA WITH DASIA +1F2A; C; 1F22; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA +1F2B; C; 1F23; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA +1F2C; C; 1F24; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA +1F2D; C; 1F25; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA +1F2E; C; 1F26; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI +1F2F; C; 1F27; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI +1F38; C; 1F30; # GREEK CAPITAL LETTER IOTA WITH PSILI +1F39; C; 1F31; # GREEK CAPITAL LETTER IOTA WITH DASIA +1F3A; C; 1F32; # GREEK CAPITAL LETTER IOTA WITH PSILI AND VARIA +1F3B; C; 1F33; # GREEK CAPITAL LETTER IOTA WITH DASIA AND VARIA +1F3C; C; 1F34; # GREEK CAPITAL LETTER IOTA WITH PSILI AND OXIA +1F3D; C; 1F35; # GREEK CAPITAL LETTER IOTA WITH DASIA AND OXIA +1F3E; C; 1F36; # GREEK CAPITAL LETTER IOTA WITH PSILI AND PERISPOMENI +1F3F; C; 1F37; # GREEK CAPITAL LETTER IOTA WITH DASIA AND PERISPOMENI +1F48; C; 1F40; # GREEK CAPITAL LETTER OMICRON WITH PSILI +1F49; C; 1F41; # GREEK 
CAPITAL LETTER OMICRON WITH DASIA +1F4A; C; 1F42; # GREEK CAPITAL LETTER OMICRON WITH PSILI AND VARIA +1F4B; C; 1F43; # GREEK CAPITAL LETTER OMICRON WITH DASIA AND VARIA +1F4C; C; 1F44; # GREEK CAPITAL LETTER OMICRON WITH PSILI AND OXIA +1F4D; C; 1F45; # GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA +1F50; F; 03C5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI +1F52; F; 03C5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA +1F54; F; 03C5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA +1F56; F; 03C5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI +1F59; C; 1F51; # GREEK CAPITAL LETTER UPSILON WITH DASIA +1F5B; C; 1F53; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA +1F5D; C; 1F55; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA +1F5F; C; 1F57; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI +1F68; C; 1F60; # GREEK CAPITAL LETTER OMEGA WITH PSILI +1F69; C; 1F61; # GREEK CAPITAL LETTER OMEGA WITH DASIA +1F6A; C; 1F62; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA +1F6B; C; 1F63; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA +1F6C; C; 1F64; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA +1F6D; C; 1F65; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA +1F6E; C; 1F66; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI +1F6F; C; 1F67; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI +1F80; F; 1F00 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI +1F81; F; 1F01 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI +1F82; F; 1F02 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1F83; F; 1F03 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1F84; F; 1F04 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1F85; F; 1F05 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1F86; F; 1F06 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1F87; F; 1F07 03B9; # 
GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1F88; F; 1F00 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI +1F88; S; 1F80; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI +1F89; F; 1F01 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI +1F89; S; 1F81; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI +1F8A; F; 1F02 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F8A; S; 1F82; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F8B; F; 1F03 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F8B; S; 1F83; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F8C; F; 1F04 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F8C; S; 1F84; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F8D; F; 1F05 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F8D; S; 1F85; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F8E; F; 1F06 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F8E; S; 1F86; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F8F; F; 1F07 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F90; F; 1F20 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI +1F91; F; 1F21 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI +1F92; F; 1F22 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1F93; F; 1F23 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1F94; F; 1F24 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1F95; F; 1F25 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1F96; F; 1F26 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND 
PERISPOMENI AND YPOGEGRAMMENI +1F97; F; 1F27 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1F98; F; 1F20 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI +1F98; S; 1F90; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI +1F99; F; 1F21 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI +1F99; S; 1F91; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI +1F9A; F; 1F22 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F9A; S; 1F92; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F9B; F; 1F23 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F9B; S; 1F93; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F9C; F; 1F24 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F9C; S; 1F94; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F9D; F; 1F25 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F9D; S; 1F95; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F9E; F; 1F26 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F9E; S; 1F96; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F9F; F; 1F27 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F9F; S; 1F97; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FA0; F; 1F60 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI +1FA1; F; 1F61 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI +1FA2; F; 1F62 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1FA3; F; 1F63 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1FA4; F; 1F64 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1FA5; F; 1F65 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1FA6; F; 1F66 03B9; # GREEK 
SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1FA7; F; 1F67 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1FA8; F; 1F60 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI +1FA8; S; 1FA0; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI +1FA9; F; 1F61 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI +1FA9; S; 1FA1; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI +1FAA; F; 1F62 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1FAA; S; 1FA2; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1FAB; F; 1F63 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1FAB; S; 1FA3; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1FAC; F; 1F64 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1FAC; S; 1FA4; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1FAD; F; 1F65 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1FAD; S; 1FA5; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1FAE; F; 1F66 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1FAE; S; 1FA6; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1FAF; F; 1F67 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FAF; S; 1FA7; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FB2; F; 1F70 03B9; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI +1FB3; F; 03B1 03B9; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI +1FB4; F; 03AC 03B9; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI +1FB6; F; 03B1 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI +1FB7; F; 03B1 0342 03B9; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI +1FB8; C; 1FB0; # GREEK CAPITAL LETTER ALPHA WITH VRACHY +1FB9; C; 1FB1; # GREEK CAPITAL 
LETTER ALPHA WITH MACRON +1FBA; C; 1F70; # GREEK CAPITAL LETTER ALPHA WITH VARIA +1FBB; C; 1F71; # GREEK CAPITAL LETTER ALPHA WITH OXIA +1FBC; F; 03B1 03B9; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI +1FBC; S; 1FB3; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI +1FBE; C; 03B9; # GREEK PROSGEGRAMMENI +1FC2; F; 1F74 03B9; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI +1FC3; F; 03B7 03B9; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI +1FC4; F; 03AE 03B9; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI +1FC6; F; 03B7 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI +1FC7; F; 03B7 0342 03B9; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI +1FC8; C; 1F72; # GREEK CAPITAL LETTER EPSILON WITH VARIA +1FC9; C; 1F73; # GREEK CAPITAL LETTER EPSILON WITH OXIA +1FCA; C; 1F74; # GREEK CAPITAL LETTER ETA WITH VARIA +1FCB; C; 1F75; # GREEK CAPITAL LETTER ETA WITH OXIA +1FCC; F; 03B7 03B9; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI +1FCC; S; 1FC3; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI +1FD2; F; 03B9 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA +1FD3; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA +1FD6; F; 03B9 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI +1FD7; F; 03B9 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI +1FD8; C; 1FD0; # GREEK CAPITAL LETTER IOTA WITH VRACHY +1FD9; C; 1FD1; # GREEK CAPITAL LETTER IOTA WITH MACRON +1FDA; C; 1F76; # GREEK CAPITAL LETTER IOTA WITH VARIA +1FDB; C; 1F77; # GREEK CAPITAL LETTER IOTA WITH OXIA +1FE2; F; 03C5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA +1FE3; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA +1FE4; F; 03C1 0313; # GREEK SMALL LETTER RHO WITH PSILI +1FE6; F; 03C5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI +1FE7; F; 03C5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI +1FE8; C; 1FE0; # GREEK CAPITAL LETTER UPSILON WITH VRACHY +1FE9; C; 1FE1; # GREEK 
CAPITAL LETTER UPSILON WITH MACRON +1FEA; C; 1F7A; # GREEK CAPITAL LETTER UPSILON WITH VARIA +1FEB; C; 1F7B; # GREEK CAPITAL LETTER UPSILON WITH OXIA +1FEC; C; 1FE5; # GREEK CAPITAL LETTER RHO WITH DASIA +1FF2; F; 1F7C 03B9; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI +1FF3; F; 03C9 03B9; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI +1FF4; F; 03CE 03B9; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI +1FF6; F; 03C9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI +1FF7; F; 03C9 0342 03B9; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI +1FF8; C; 1F78; # GREEK CAPITAL LETTER OMICRON WITH VARIA +1FF9; C; 1F79; # GREEK CAPITAL LETTER OMICRON WITH OXIA +1FFA; C; 1F7C; # GREEK CAPITAL LETTER OMEGA WITH VARIA +1FFB; C; 1F7D; # GREEK CAPITAL LETTER OMEGA WITH OXIA +1FFC; F; 03C9 03B9; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI +1FFC; S; 1FF3; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI +2126; C; 03C9; # OHM SIGN +212A; C; 006B; # KELVIN SIGN +212B; C; 00E5; # ANGSTROM SIGN +2132; C; 214E; # TURNED CAPITAL F +2160; C; 2170; # ROMAN NUMERAL ONE +2161; C; 2171; # ROMAN NUMERAL TWO +2162; C; 2172; # ROMAN NUMERAL THREE +2163; C; 2173; # ROMAN NUMERAL FOUR +2164; C; 2174; # ROMAN NUMERAL FIVE +2165; C; 2175; # ROMAN NUMERAL SIX +2166; C; 2176; # ROMAN NUMERAL SEVEN +2167; C; 2177; # ROMAN NUMERAL EIGHT +2168; C; 2178; # ROMAN NUMERAL NINE +2169; C; 2179; # ROMAN NUMERAL TEN +216A; C; 217A; # ROMAN NUMERAL ELEVEN +216B; C; 217B; # ROMAN NUMERAL TWELVE +216C; C; 217C; # ROMAN NUMERAL FIFTY +216D; C; 217D; # ROMAN NUMERAL ONE HUNDRED +216E; C; 217E; # ROMAN NUMERAL FIVE HUNDRED +216F; C; 217F; # ROMAN NUMERAL ONE THOUSAND +2183; C; 2184; # ROMAN NUMERAL REVERSED ONE HUNDRED +24B6; C; 24D0; # CIRCLED LATIN CAPITAL LETTER A +24B7; C; 24D1; # CIRCLED LATIN CAPITAL LETTER B +24B8; C; 24D2; # CIRCLED LATIN CAPITAL LETTER C +24B9; C; 24D3; # CIRCLED LATIN CAPITAL LETTER D +24BA; C; 24D4; # CIRCLED LATIN CAPITAL LETTER E +24BB; C; 24D5; # 
CIRCLED LATIN CAPITAL LETTER F +24BC; C; 24D6; # CIRCLED LATIN CAPITAL LETTER G +24BD; C; 24D7; # CIRCLED LATIN CAPITAL LETTER H +24BE; C; 24D8; # CIRCLED LATIN CAPITAL LETTER I +24BF; C; 24D9; # CIRCLED LATIN CAPITAL LETTER J +24C0; C; 24DA; # CIRCLED LATIN CAPITAL LETTER K +24C1; C; 24DB; # CIRCLED LATIN CAPITAL LETTER L +24C2; C; 24DC; # CIRCLED LATIN CAPITAL LETTER M +24C3; C; 24DD; # CIRCLED LATIN CAPITAL LETTER N +24C4; C; 24DE; # CIRCLED LATIN CAPITAL LETTER O +24C5; C; 24DF; # CIRCLED LATIN CAPITAL LETTER P +24C6; C; 24E0; # CIRCLED LATIN CAPITAL LETTER Q +24C7; C; 24E1; # CIRCLED LATIN CAPITAL LETTER R +24C8; C; 24E2; # CIRCLED LATIN CAPITAL LETTER S +24C9; C; 24E3; # CIRCLED LATIN CAPITAL LETTER T +24CA; C; 24E4; # CIRCLED LATIN CAPITAL LETTER U +24CB; C; 24E5; # CIRCLED LATIN CAPITAL LETTER V +24CC; C; 24E6; # CIRCLED LATIN CAPITAL LETTER W +24CD; C; 24E7; # CIRCLED LATIN CAPITAL LETTER X +24CE; C; 24E8; # CIRCLED LATIN CAPITAL LETTER Y +24CF; C; 24E9; # CIRCLED LATIN CAPITAL LETTER Z +2C00; C; 2C30; # GLAGOLITIC CAPITAL LETTER AZU +2C01; C; 2C31; # GLAGOLITIC CAPITAL LETTER BUKY +2C02; C; 2C32; # GLAGOLITIC CAPITAL LETTER VEDE +2C03; C; 2C33; # GLAGOLITIC CAPITAL LETTER GLAGOLI +2C04; C; 2C34; # GLAGOLITIC CAPITAL LETTER DOBRO +2C05; C; 2C35; # GLAGOLITIC CAPITAL LETTER YESTU +2C06; C; 2C36; # GLAGOLITIC CAPITAL LETTER ZHIVETE +2C07; C; 2C37; # GLAGOLITIC CAPITAL LETTER DZELO +2C08; C; 2C38; # GLAGOLITIC CAPITAL LETTER ZEMLJA +2C09; C; 2C39; # GLAGOLITIC CAPITAL LETTER IZHE +2C0A; C; 2C3A; # GLAGOLITIC CAPITAL LETTER INITIAL IZHE +2C0B; C; 2C3B; # GLAGOLITIC CAPITAL LETTER I +2C0C; C; 2C3C; # GLAGOLITIC CAPITAL LETTER DJERVI +2C0D; C; 2C3D; # GLAGOLITIC CAPITAL LETTER KAKO +2C0E; C; 2C3E; # GLAGOLITIC CAPITAL LETTER LJUDIJE +2C0F; C; 2C3F; # GLAGOLITIC CAPITAL LETTER MYSLITE +2C10; C; 2C40; # GLAGOLITIC CAPITAL LETTER NASHI +2C11; C; 2C41; # GLAGOLITIC CAPITAL LETTER ONU +2C12; C; 2C42; # GLAGOLITIC CAPITAL LETTER POKOJI +2C13; C; 2C43; # GLAGOLITIC 
CAPITAL LETTER RITSI +2C14; C; 2C44; # GLAGOLITIC CAPITAL LETTER SLOVO +2C15; C; 2C45; # GLAGOLITIC CAPITAL LETTER TVRIDO +2C16; C; 2C46; # GLAGOLITIC CAPITAL LETTER UKU +2C17; C; 2C47; # GLAGOLITIC CAPITAL LETTER FRITU +2C18; C; 2C48; # GLAGOLITIC CAPITAL LETTER HERU +2C19; C; 2C49; # GLAGOLITIC CAPITAL LETTER OTU +2C1A; C; 2C4A; # GLAGOLITIC CAPITAL LETTER PE +2C1B; C; 2C4B; # GLAGOLITIC CAPITAL LETTER SHTA +2C1C; C; 2C4C; # GLAGOLITIC CAPITAL LETTER TSI +2C1D; C; 2C4D; # GLAGOLITIC CAPITAL LETTER CHRIVI +2C1E; C; 2C4E; # GLAGOLITIC CAPITAL LETTER SHA +2C1F; C; 2C4F; # GLAGOLITIC CAPITAL LETTER YERU +2C20; C; 2C50; # GLAGOLITIC CAPITAL LETTER YERI +2C21; C; 2C51; # GLAGOLITIC CAPITAL LETTER YATI +2C22; C; 2C52; # GLAGOLITIC CAPITAL LETTER SPIDERY HA +2C23; C; 2C53; # GLAGOLITIC CAPITAL LETTER YU +2C24; C; 2C54; # GLAGOLITIC CAPITAL LETTER SMALL YUS +2C25; C; 2C55; # GLAGOLITIC CAPITAL LETTER SMALL YUS WITH TAIL +2C26; C; 2C56; # GLAGOLITIC CAPITAL LETTER YO +2C27; C; 2C57; # GLAGOLITIC CAPITAL LETTER IOTATED SMALL YUS +2C28; C; 2C58; # GLAGOLITIC CAPITAL LETTER BIG YUS +2C29; C; 2C59; # GLAGOLITIC CAPITAL LETTER IOTATED BIG YUS +2C2A; C; 2C5A; # GLAGOLITIC CAPITAL LETTER FITA +2C2B; C; 2C5B; # GLAGOLITIC CAPITAL LETTER IZHITSA +2C2C; C; 2C5C; # GLAGOLITIC CAPITAL LETTER SHTAPIC +2C2D; C; 2C5D; # GLAGOLITIC CAPITAL LETTER TROKUTASTI A +2C2E; C; 2C5E; # GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE +2C60; C; 2C61; # LATIN CAPITAL LETTER L WITH DOUBLE BAR +2C62; C; 026B; # LATIN CAPITAL LETTER L WITH MIDDLE TILDE +2C63; C; 1D7D; # LATIN CAPITAL LETTER P WITH STROKE +2C64; C; 027D; # LATIN CAPITAL LETTER R WITH TAIL +2C67; C; 2C68; # LATIN CAPITAL LETTER H WITH DESCENDER +2C69; C; 2C6A; # LATIN CAPITAL LETTER K WITH DESCENDER +2C6B; C; 2C6C; # LATIN CAPITAL LETTER Z WITH DESCENDER +2C6D; C; 0251; # LATIN CAPITAL LETTER ALPHA +2C6E; C; 0271; # LATIN CAPITAL LETTER M WITH HOOK +2C6F; C; 0250; # LATIN CAPITAL LETTER TURNED A +2C70; C; 0252; # LATIN CAPITAL LETTER TURNED 
ALPHA +2C72; C; 2C73; # LATIN CAPITAL LETTER W WITH HOOK +2C75; C; 2C76; # LATIN CAPITAL LETTER HALF H +2C7E; C; 023F; # LATIN CAPITAL LETTER S WITH SWASH TAIL +2C7F; C; 0240; # LATIN CAPITAL LETTER Z WITH SWASH TAIL +2C80; C; 2C81; # COPTIC CAPITAL LETTER ALFA +2C82; C; 2C83; # COPTIC CAPITAL LETTER VIDA +2C84; C; 2C85; # COPTIC CAPITAL LETTER GAMMA +2C86; C; 2C87; # COPTIC CAPITAL LETTER DALDA +2C88; C; 2C89; # COPTIC CAPITAL LETTER EIE +2C8A; C; 2C8B; # COPTIC CAPITAL LETTER SOU +2C8C; C; 2C8D; # COPTIC CAPITAL LETTER ZATA +2C8E; C; 2C8F; # COPTIC CAPITAL LETTER HATE +2C90; C; 2C91; # COPTIC CAPITAL LETTER THETHE +2C92; C; 2C93; # COPTIC CAPITAL LETTER IAUDA +2C94; C; 2C95; # COPTIC CAPITAL LETTER KAPA +2C96; C; 2C97; # COPTIC CAPITAL LETTER LAULA +2C98; C; 2C99; # COPTIC CAPITAL LETTER MI +2C9A; C; 2C9B; # COPTIC CAPITAL LETTER NI +2C9C; C; 2C9D; # COPTIC CAPITAL LETTER KSI +2C9E; C; 2C9F; # COPTIC CAPITAL LETTER O +2CA0; C; 2CA1; # COPTIC CAPITAL LETTER PI +2CA2; C; 2CA3; # COPTIC CAPITAL LETTER RO +2CA4; C; 2CA5; # COPTIC CAPITAL LETTER SIMA +2CA6; C; 2CA7; # COPTIC CAPITAL LETTER TAU +2CA8; C; 2CA9; # COPTIC CAPITAL LETTER UA +2CAA; C; 2CAB; # COPTIC CAPITAL LETTER FI +2CAC; C; 2CAD; # COPTIC CAPITAL LETTER KHI +2CAE; C; 2CAF; # COPTIC CAPITAL LETTER PSI +2CB0; C; 2CB1; # COPTIC CAPITAL LETTER OOU +2CB2; C; 2CB3; # COPTIC CAPITAL LETTER DIALECT-P ALEF +2CB4; C; 2CB5; # COPTIC CAPITAL LETTER OLD COPTIC AIN +2CB6; C; 2CB7; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC EIE +2CB8; C; 2CB9; # COPTIC CAPITAL LETTER DIALECT-P KAPA +2CBA; C; 2CBB; # COPTIC CAPITAL LETTER DIALECT-P NI +2CBC; C; 2CBD; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC NI +2CBE; C; 2CBF; # COPTIC CAPITAL LETTER OLD COPTIC OOU +2CC0; C; 2CC1; # COPTIC CAPITAL LETTER SAMPI +2CC2; C; 2CC3; # COPTIC CAPITAL LETTER CROSSED SHEI +2CC4; C; 2CC5; # COPTIC CAPITAL LETTER OLD COPTIC SHEI +2CC6; C; 2CC7; # COPTIC CAPITAL LETTER OLD COPTIC ESH +2CC8; C; 2CC9; # COPTIC CAPITAL LETTER AKHMIMIC KHEI +2CCA; C; 2CCB; # 
COPTIC CAPITAL LETTER DIALECT-P HORI +2CCC; C; 2CCD; # COPTIC CAPITAL LETTER OLD COPTIC HORI +2CCE; C; 2CCF; # COPTIC CAPITAL LETTER OLD COPTIC HA +2CD0; C; 2CD1; # COPTIC CAPITAL LETTER L-SHAPED HA +2CD2; C; 2CD3; # COPTIC CAPITAL LETTER OLD COPTIC HEI +2CD4; C; 2CD5; # COPTIC CAPITAL LETTER OLD COPTIC HAT +2CD6; C; 2CD7; # COPTIC CAPITAL LETTER OLD COPTIC GANGIA +2CD8; C; 2CD9; # COPTIC CAPITAL LETTER OLD COPTIC DJA +2CDA; C; 2CDB; # COPTIC CAPITAL LETTER OLD COPTIC SHIMA +2CDC; C; 2CDD; # COPTIC CAPITAL LETTER OLD NUBIAN SHIMA +2CDE; C; 2CDF; # COPTIC CAPITAL LETTER OLD NUBIAN NGI +2CE0; C; 2CE1; # COPTIC CAPITAL LETTER OLD NUBIAN NYI +2CE2; C; 2CE3; # COPTIC CAPITAL LETTER OLD NUBIAN WAU +2CEB; C; 2CEC; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI +2CED; C; 2CEE; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC GANGIA +2CF2; C; 2CF3; # COPTIC CAPITAL LETTER BOHAIRIC KHEI +A640; C; A641; # CYRILLIC CAPITAL LETTER ZEMLYA +A642; C; A643; # CYRILLIC CAPITAL LETTER DZELO +A644; C; A645; # CYRILLIC CAPITAL LETTER REVERSED DZE +A646; C; A647; # CYRILLIC CAPITAL LETTER IOTA +A648; C; A649; # CYRILLIC CAPITAL LETTER DJERV +A64A; C; A64B; # CYRILLIC CAPITAL LETTER MONOGRAPH UK +A64C; C; A64D; # CYRILLIC CAPITAL LETTER BROAD OMEGA +A64E; C; A64F; # CYRILLIC CAPITAL LETTER NEUTRAL YER +A650; C; A651; # CYRILLIC CAPITAL LETTER YERU WITH BACK YER +A652; C; A653; # CYRILLIC CAPITAL LETTER IOTIFIED YAT +A654; C; A655; # CYRILLIC CAPITAL LETTER REVERSED YU +A656; C; A657; # CYRILLIC CAPITAL LETTER IOTIFIED A +A658; C; A659; # CYRILLIC CAPITAL LETTER CLOSED LITTLE YUS +A65A; C; A65B; # CYRILLIC CAPITAL LETTER BLENDED YUS +A65C; C; A65D; # CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS +A65E; C; A65F; # CYRILLIC CAPITAL LETTER YN +A660; C; A661; # CYRILLIC CAPITAL LETTER REVERSED TSE +A662; C; A663; # CYRILLIC CAPITAL LETTER SOFT DE +A664; C; A665; # CYRILLIC CAPITAL LETTER SOFT EL +A666; C; A667; # CYRILLIC CAPITAL LETTER SOFT EM +A668; C; A669; # CYRILLIC CAPITAL LETTER MONOCULAR O 
+A66A; C; A66B; # CYRILLIC CAPITAL LETTER BINOCULAR O +A66C; C; A66D; # CYRILLIC CAPITAL LETTER DOUBLE MONOCULAR O +A680; C; A681; # CYRILLIC CAPITAL LETTER DWE +A682; C; A683; # CYRILLIC CAPITAL LETTER DZWE +A684; C; A685; # CYRILLIC CAPITAL LETTER ZHWE +A686; C; A687; # CYRILLIC CAPITAL LETTER CCHE +A688; C; A689; # CYRILLIC CAPITAL LETTER DZZE +A68A; C; A68B; # CYRILLIC CAPITAL LETTER TE WITH MIDDLE HOOK +A68C; C; A68D; # CYRILLIC CAPITAL LETTER TWE +A68E; C; A68F; # CYRILLIC CAPITAL LETTER TSWE +A690; C; A691; # CYRILLIC CAPITAL LETTER TSSE +A692; C; A693; # CYRILLIC CAPITAL LETTER TCHE +A694; C; A695; # CYRILLIC CAPITAL LETTER HWE +A696; C; A697; # CYRILLIC CAPITAL LETTER SHWE +A698; C; A699; # CYRILLIC CAPITAL LETTER DOUBLE O +A69A; C; A69B; # CYRILLIC CAPITAL LETTER CROSSED O +A722; C; A723; # LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF +A724; C; A725; # LATIN CAPITAL LETTER EGYPTOLOGICAL AIN +A726; C; A727; # LATIN CAPITAL LETTER HENG +A728; C; A729; # LATIN CAPITAL LETTER TZ +A72A; C; A72B; # LATIN CAPITAL LETTER TRESILLO +A72C; C; A72D; # LATIN CAPITAL LETTER CUATRILLO +A72E; C; A72F; # LATIN CAPITAL LETTER CUATRILLO WITH COMMA +A732; C; A733; # LATIN CAPITAL LETTER AA +A734; C; A735; # LATIN CAPITAL LETTER AO +A736; C; A737; # LATIN CAPITAL LETTER AU +A738; C; A739; # LATIN CAPITAL LETTER AV +A73A; C; A73B; # LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR +A73C; C; A73D; # LATIN CAPITAL LETTER AY +A73E; C; A73F; # LATIN CAPITAL LETTER REVERSED C WITH DOT +A740; C; A741; # LATIN CAPITAL LETTER K WITH STROKE +A742; C; A743; # LATIN CAPITAL LETTER K WITH DIAGONAL STROKE +A744; C; A745; # LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE +A746; C; A747; # LATIN CAPITAL LETTER BROKEN L +A748; C; A749; # LATIN CAPITAL LETTER L WITH HIGH STROKE +A74A; C; A74B; # LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY +A74C; C; A74D; # LATIN CAPITAL LETTER O WITH LOOP +A74E; C; A74F; # LATIN CAPITAL LETTER OO +A750; C; A751; # LATIN CAPITAL LETTER P WITH STROKE THROUGH 
DESCENDER +A752; C; A753; # LATIN CAPITAL LETTER P WITH FLOURISH +A754; C; A755; # LATIN CAPITAL LETTER P WITH SQUIRREL TAIL +A756; C; A757; # LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER +A758; C; A759; # LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE +A75A; C; A75B; # LATIN CAPITAL LETTER R ROTUNDA +A75C; C; A75D; # LATIN CAPITAL LETTER RUM ROTUNDA +A75E; C; A75F; # LATIN CAPITAL LETTER V WITH DIAGONAL STROKE +A760; C; A761; # LATIN CAPITAL LETTER VY +A762; C; A763; # LATIN CAPITAL LETTER VISIGOTHIC Z +A764; C; A765; # LATIN CAPITAL LETTER THORN WITH STROKE +A766; C; A767; # LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER +A768; C; A769; # LATIN CAPITAL LETTER VEND +A76A; C; A76B; # LATIN CAPITAL LETTER ET +A76C; C; A76D; # LATIN CAPITAL LETTER IS +A76E; C; A76F; # LATIN CAPITAL LETTER CON +A779; C; A77A; # LATIN CAPITAL LETTER INSULAR D +A77B; C; A77C; # LATIN CAPITAL LETTER INSULAR F +A77D; C; 1D79; # LATIN CAPITAL LETTER INSULAR G +A77E; C; A77F; # LATIN CAPITAL LETTER TURNED INSULAR G +A780; C; A781; # LATIN CAPITAL LETTER TURNED L +A782; C; A783; # LATIN CAPITAL LETTER INSULAR R +A784; C; A785; # LATIN CAPITAL LETTER INSULAR S +A786; C; A787; # LATIN CAPITAL LETTER INSULAR T +A78B; C; A78C; # LATIN CAPITAL LETTER SALTILLO +A78D; C; 0265; # LATIN CAPITAL LETTER TURNED H +A790; C; A791; # LATIN CAPITAL LETTER N WITH DESCENDER +A792; C; A793; # LATIN CAPITAL LETTER C WITH BAR +A796; C; A797; # LATIN CAPITAL LETTER B WITH FLOURISH +A798; C; A799; # LATIN CAPITAL LETTER F WITH STROKE +A79A; C; A79B; # LATIN CAPITAL LETTER VOLAPUK AE +A79C; C; A79D; # LATIN CAPITAL LETTER VOLAPUK OE +A79E; C; A79F; # LATIN CAPITAL LETTER VOLAPUK UE +A7A0; C; A7A1; # LATIN CAPITAL LETTER G WITH OBLIQUE STROKE +A7A2; C; A7A3; # LATIN CAPITAL LETTER K WITH OBLIQUE STROKE +A7A4; C; A7A5; # LATIN CAPITAL LETTER N WITH OBLIQUE STROKE +A7A6; C; A7A7; # LATIN CAPITAL LETTER R WITH OBLIQUE STROKE +A7A8; C; A7A9; # LATIN CAPITAL LETTER S WITH OBLIQUE STROKE +A7AA; C; 0266; # 
LATIN CAPITAL LETTER H WITH HOOK +A7AB; C; 025C; # LATIN CAPITAL LETTER REVERSED OPEN E +A7AC; C; 0261; # LATIN CAPITAL LETTER SCRIPT G +A7AD; C; 026C; # LATIN CAPITAL LETTER L WITH BELT +A7AE; C; 026A; # LATIN CAPITAL LETTER SMALL CAPITAL I +A7B0; C; 029E; # LATIN CAPITAL LETTER TURNED K +A7B1; C; 0287; # LATIN CAPITAL LETTER TURNED T +A7B2; C; 029D; # LATIN CAPITAL LETTER J WITH CROSSED-TAIL +A7B3; C; AB53; # LATIN CAPITAL LETTER CHI +A7B4; C; A7B5; # LATIN CAPITAL LETTER BETA +A7B6; C; A7B7; # LATIN CAPITAL LETTER OMEGA +A7B8; C; A7B9; # LATIN CAPITAL LETTER U WITH STROKE +A7BA; C; A7BB; # LATIN CAPITAL LETTER GLOTTAL A +A7BC; C; A7BD; # LATIN CAPITAL LETTER GLOTTAL I +A7BE; C; A7BF; # LATIN CAPITAL LETTER GLOTTAL U +A7C2; C; A7C3; # LATIN CAPITAL LETTER ANGLICANA W +A7C4; C; A794; # LATIN CAPITAL LETTER C WITH PALATAL HOOK +A7C5; C; 0282; # LATIN CAPITAL LETTER S WITH HOOK +A7C6; C; 1D8E; # LATIN CAPITAL LETTER Z WITH PALATAL HOOK +AB70; C; 13A0; # CHEROKEE SMALL LETTER A +AB71; C; 13A1; # CHEROKEE SMALL LETTER E +AB72; C; 13A2; # CHEROKEE SMALL LETTER I +AB73; C; 13A3; # CHEROKEE SMALL LETTER O +AB74; C; 13A4; # CHEROKEE SMALL LETTER U +AB75; C; 13A5; # CHEROKEE SMALL LETTER V +AB76; C; 13A6; # CHEROKEE SMALL LETTER GA +AB77; C; 13A7; # CHEROKEE SMALL LETTER KA +AB78; C; 13A8; # CHEROKEE SMALL LETTER GE +AB79; C; 13A9; # CHEROKEE SMALL LETTER GI +AB7A; C; 13AA; # CHEROKEE SMALL LETTER GO +AB7B; C; 13AB; # CHEROKEE SMALL LETTER GU +AB7C; C; 13AC; # CHEROKEE SMALL LETTER GV +AB7D; C; 13AD; # CHEROKEE SMALL LETTER HA +AB7E; C; 13AE; # CHEROKEE SMALL LETTER HE +AB7F; C; 13AF; # CHEROKEE SMALL LETTER HI +AB80; C; 13B0; # CHEROKEE SMALL LETTER HO +AB81; C; 13B1; # CHEROKEE SMALL LETTER HU +AB82; C; 13B2; # CHEROKEE SMALL LETTER HV +AB83; C; 13B3; # CHEROKEE SMALL LETTER LA +AB84; C; 13B4; # CHEROKEE SMALL LETTER LE +AB85; C; 13B5; # CHEROKEE SMALL LETTER LI +AB86; C; 13B6; # CHEROKEE SMALL LETTER LO +AB87; C; 13B7; # CHEROKEE SMALL LETTER LU +AB88; C; 13B8; # 
CHEROKEE SMALL LETTER LV +AB89; C; 13B9; # CHEROKEE SMALL LETTER MA +AB8A; C; 13BA; # CHEROKEE SMALL LETTER ME +AB8B; C; 13BB; # CHEROKEE SMALL LETTER MI +AB8C; C; 13BC; # CHEROKEE SMALL LETTER MO +AB8D; C; 13BD; # CHEROKEE SMALL LETTER MU +AB8E; C; 13BE; # CHEROKEE SMALL LETTER NA +AB8F; C; 13BF; # CHEROKEE SMALL LETTER HNA +AB90; C; 13C0; # CHEROKEE SMALL LETTER NAH +AB91; C; 13C1; # CHEROKEE SMALL LETTER NE +AB92; C; 13C2; # CHEROKEE SMALL LETTER NI +AB93; C; 13C3; # CHEROKEE SMALL LETTER NO +AB94; C; 13C4; # CHEROKEE SMALL LETTER NU +AB95; C; 13C5; # CHEROKEE SMALL LETTER NV +AB96; C; 13C6; # CHEROKEE SMALL LETTER QUA +AB97; C; 13C7; # CHEROKEE SMALL LETTER QUE +AB98; C; 13C8; # CHEROKEE SMALL LETTER QUI +AB99; C; 13C9; # CHEROKEE SMALL LETTER QUO +AB9A; C; 13CA; # CHEROKEE SMALL LETTER QUU +AB9B; C; 13CB; # CHEROKEE SMALL LETTER QUV +AB9C; C; 13CC; # CHEROKEE SMALL LETTER SA +AB9D; C; 13CD; # CHEROKEE SMALL LETTER S +AB9E; C; 13CE; # CHEROKEE SMALL LETTER SE +AB9F; C; 13CF; # CHEROKEE SMALL LETTER SI +ABA0; C; 13D0; # CHEROKEE SMALL LETTER SO +ABA1; C; 13D1; # CHEROKEE SMALL LETTER SU +ABA2; C; 13D2; # CHEROKEE SMALL LETTER SV +ABA3; C; 13D3; # CHEROKEE SMALL LETTER DA +ABA4; C; 13D4; # CHEROKEE SMALL LETTER TA +ABA5; C; 13D5; # CHEROKEE SMALL LETTER DE +ABA6; C; 13D6; # CHEROKEE SMALL LETTER TE +ABA7; C; 13D7; # CHEROKEE SMALL LETTER DI +ABA8; C; 13D8; # CHEROKEE SMALL LETTER TI +ABA9; C; 13D9; # CHEROKEE SMALL LETTER DO +ABAA; C; 13DA; # CHEROKEE SMALL LETTER DU +ABAB; C; 13DB; # CHEROKEE SMALL LETTER DV +ABAC; C; 13DC; # CHEROKEE SMALL LETTER DLA +ABAD; C; 13DD; # CHEROKEE SMALL LETTER TLA +ABAE; C; 13DE; # CHEROKEE SMALL LETTER TLE +ABAF; C; 13DF; # CHEROKEE SMALL LETTER TLI +ABB0; C; 13E0; # CHEROKEE SMALL LETTER TLO +ABB1; C; 13E1; # CHEROKEE SMALL LETTER TLU +ABB2; C; 13E2; # CHEROKEE SMALL LETTER TLV +ABB3; C; 13E3; # CHEROKEE SMALL LETTER TSA +ABB4; C; 13E4; # CHEROKEE SMALL LETTER TSE +ABB5; C; 13E5; # CHEROKEE SMALL LETTER TSI +ABB6; C; 13E6; # 
CHEROKEE SMALL LETTER TSO +ABB7; C; 13E7; # CHEROKEE SMALL LETTER TSU +ABB8; C; 13E8; # CHEROKEE SMALL LETTER TSV +ABB9; C; 13E9; # CHEROKEE SMALL LETTER WA +ABBA; C; 13EA; # CHEROKEE SMALL LETTER WE +ABBB; C; 13EB; # CHEROKEE SMALL LETTER WI +ABBC; C; 13EC; # CHEROKEE SMALL LETTER WO +ABBD; C; 13ED; # CHEROKEE SMALL LETTER WU +ABBE; C; 13EE; # CHEROKEE SMALL LETTER WV +ABBF; C; 13EF; # CHEROKEE SMALL LETTER YA +FB00; F; 0066 0066; # LATIN SMALL LIGATURE FF +FB01; F; 0066 0069; # LATIN SMALL LIGATURE FI +FB02; F; 0066 006C; # LATIN SMALL LIGATURE FL +FB03; F; 0066 0066 0069; # LATIN SMALL LIGATURE FFI +FB04; F; 0066 0066 006C; # LATIN SMALL LIGATURE FFL +FB05; F; 0073 0074; # LATIN SMALL LIGATURE LONG S T +FB06; F; 0073 0074; # LATIN SMALL LIGATURE ST +FB13; F; 0574 0576; # ARMENIAN SMALL LIGATURE MEN NOW +FB14; F; 0574 0565; # ARMENIAN SMALL LIGATURE MEN ECH +FB15; F; 0574 056B; # ARMENIAN SMALL LIGATURE MEN INI +FB16; F; 057E 0576; # ARMENIAN SMALL LIGATURE VEW NOW +FB17; F; 0574 056D; # ARMENIAN SMALL LIGATURE MEN XEH +FF21; C; FF41; # FULLWIDTH LATIN CAPITAL LETTER A +FF22; C; FF42; # FULLWIDTH LATIN CAPITAL LETTER B +FF23; C; FF43; # FULLWIDTH LATIN CAPITAL LETTER C +FF24; C; FF44; # FULLWIDTH LATIN CAPITAL LETTER D +FF25; C; FF45; # FULLWIDTH LATIN CAPITAL LETTER E +FF26; C; FF46; # FULLWIDTH LATIN CAPITAL LETTER F +FF27; C; FF47; # FULLWIDTH LATIN CAPITAL LETTER G +FF28; C; FF48; # FULLWIDTH LATIN CAPITAL LETTER H +FF29; C; FF49; # FULLWIDTH LATIN CAPITAL LETTER I +FF2A; C; FF4A; # FULLWIDTH LATIN CAPITAL LETTER J +FF2B; C; FF4B; # FULLWIDTH LATIN CAPITAL LETTER K +FF2C; C; FF4C; # FULLWIDTH LATIN CAPITAL LETTER L +FF2D; C; FF4D; # FULLWIDTH LATIN CAPITAL LETTER M +FF2E; C; FF4E; # FULLWIDTH LATIN CAPITAL LETTER N +FF2F; C; FF4F; # FULLWIDTH LATIN CAPITAL LETTER O +FF30; C; FF50; # FULLWIDTH LATIN CAPITAL LETTER P +FF31; C; FF51; # FULLWIDTH LATIN CAPITAL LETTER Q +FF32; C; FF52; # FULLWIDTH LATIN CAPITAL LETTER R +FF33; C; FF53; # FULLWIDTH LATIN CAPITAL 
LETTER S +FF34; C; FF54; # FULLWIDTH LATIN CAPITAL LETTER T +FF35; C; FF55; # FULLWIDTH LATIN CAPITAL LETTER U +FF36; C; FF56; # FULLWIDTH LATIN CAPITAL LETTER V +FF37; C; FF57; # FULLWIDTH LATIN CAPITAL LETTER W +FF38; C; FF58; # FULLWIDTH LATIN CAPITAL LETTER X +FF39; C; FF59; # FULLWIDTH LATIN CAPITAL LETTER Y +FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z +10400; C; 10428; # DESERET CAPITAL LETTER LONG I +10401; C; 10429; # DESERET CAPITAL LETTER LONG E +10402; C; 1042A; # DESERET CAPITAL LETTER LONG A +10403; C; 1042B; # DESERET CAPITAL LETTER LONG AH +10404; C; 1042C; # DESERET CAPITAL LETTER LONG O +10405; C; 1042D; # DESERET CAPITAL LETTER LONG OO +10406; C; 1042E; # DESERET CAPITAL LETTER SHORT I +10407; C; 1042F; # DESERET CAPITAL LETTER SHORT E +10408; C; 10430; # DESERET CAPITAL LETTER SHORT A +10409; C; 10431; # DESERET CAPITAL LETTER SHORT AH +1040A; C; 10432; # DESERET CAPITAL LETTER SHORT O +1040B; C; 10433; # DESERET CAPITAL LETTER SHORT OO +1040C; C; 10434; # DESERET CAPITAL LETTER AY +1040D; C; 10435; # DESERET CAPITAL LETTER OW +1040E; C; 10436; # DESERET CAPITAL LETTER WU +1040F; C; 10437; # DESERET CAPITAL LETTER YEE +10410; C; 10438; # DESERET CAPITAL LETTER H +10411; C; 10439; # DESERET CAPITAL LETTER PEE +10412; C; 1043A; # DESERET CAPITAL LETTER BEE +10413; C; 1043B; # DESERET CAPITAL LETTER TEE +10414; C; 1043C; # DESERET CAPITAL LETTER DEE +10415; C; 1043D; # DESERET CAPITAL LETTER CHEE +10416; C; 1043E; # DESERET CAPITAL LETTER JEE +10417; C; 1043F; # DESERET CAPITAL LETTER KAY +10418; C; 10440; # DESERET CAPITAL LETTER GAY +10419; C; 10441; # DESERET CAPITAL LETTER EF +1041A; C; 10442; # DESERET CAPITAL LETTER VEE +1041B; C; 10443; # DESERET CAPITAL LETTER ETH +1041C; C; 10444; # DESERET CAPITAL LETTER THEE +1041D; C; 10445; # DESERET CAPITAL LETTER ES +1041E; C; 10446; # DESERET CAPITAL LETTER ZEE +1041F; C; 10447; # DESERET CAPITAL LETTER ESH +10420; C; 10448; # DESERET CAPITAL LETTER ZHEE +10421; C; 10449; # DESERET CAPITAL 
LETTER ER +10422; C; 1044A; # DESERET CAPITAL LETTER EL +10423; C; 1044B; # DESERET CAPITAL LETTER EM +10424; C; 1044C; # DESERET CAPITAL LETTER EN +10425; C; 1044D; # DESERET CAPITAL LETTER ENG +10426; C; 1044E; # DESERET CAPITAL LETTER OI +10427; C; 1044F; # DESERET CAPITAL LETTER EW +104B0; C; 104D8; # OSAGE CAPITAL LETTER A +104B1; C; 104D9; # OSAGE CAPITAL LETTER AI +104B2; C; 104DA; # OSAGE CAPITAL LETTER AIN +104B3; C; 104DB; # OSAGE CAPITAL LETTER AH +104B4; C; 104DC; # OSAGE CAPITAL LETTER BRA +104B5; C; 104DD; # OSAGE CAPITAL LETTER CHA +104B6; C; 104DE; # OSAGE CAPITAL LETTER EHCHA +104B7; C; 104DF; # OSAGE CAPITAL LETTER E +104B8; C; 104E0; # OSAGE CAPITAL LETTER EIN +104B9; C; 104E1; # OSAGE CAPITAL LETTER HA +104BA; C; 104E2; # OSAGE CAPITAL LETTER HYA +104BB; C; 104E3; # OSAGE CAPITAL LETTER I +104BC; C; 104E4; # OSAGE CAPITAL LETTER KA +104BD; C; 104E5; # OSAGE CAPITAL LETTER EHKA +104BE; C; 104E6; # OSAGE CAPITAL LETTER KYA +104BF; C; 104E7; # OSAGE CAPITAL LETTER LA +104C0; C; 104E8; # OSAGE CAPITAL LETTER MA +104C1; C; 104E9; # OSAGE CAPITAL LETTER NA +104C2; C; 104EA; # OSAGE CAPITAL LETTER O +104C3; C; 104EB; # OSAGE CAPITAL LETTER OIN +104C4; C; 104EC; # OSAGE CAPITAL LETTER PA +104C5; C; 104ED; # OSAGE CAPITAL LETTER EHPA +104C6; C; 104EE; # OSAGE CAPITAL LETTER SA +104C7; C; 104EF; # OSAGE CAPITAL LETTER SHA +104C8; C; 104F0; # OSAGE CAPITAL LETTER TA +104C9; C; 104F1; # OSAGE CAPITAL LETTER EHTA +104CA; C; 104F2; # OSAGE CAPITAL LETTER TSA +104CB; C; 104F3; # OSAGE CAPITAL LETTER EHTSA +104CC; C; 104F4; # OSAGE CAPITAL LETTER TSHA +104CD; C; 104F5; # OSAGE CAPITAL LETTER DHA +104CE; C; 104F6; # OSAGE CAPITAL LETTER U +104CF; C; 104F7; # OSAGE CAPITAL LETTER WA +104D0; C; 104F8; # OSAGE CAPITAL LETTER KHA +104D1; C; 104F9; # OSAGE CAPITAL LETTER GHA +104D2; C; 104FA; # OSAGE CAPITAL LETTER ZA +104D3; C; 104FB; # OSAGE CAPITAL LETTER ZHA +10C80; C; 10CC0; # OLD HUNGARIAN CAPITAL LETTER A +10C81; C; 10CC1; # OLD HUNGARIAN CAPITAL LETTER AA 
+10C82; C; 10CC2; # OLD HUNGARIAN CAPITAL LETTER EB +10C83; C; 10CC3; # OLD HUNGARIAN CAPITAL LETTER AMB +10C84; C; 10CC4; # OLD HUNGARIAN CAPITAL LETTER EC +10C85; C; 10CC5; # OLD HUNGARIAN CAPITAL LETTER ENC +10C86; C; 10CC6; # OLD HUNGARIAN CAPITAL LETTER ECS +10C87; C; 10CC7; # OLD HUNGARIAN CAPITAL LETTER ED +10C88; C; 10CC8; # OLD HUNGARIAN CAPITAL LETTER AND +10C89; C; 10CC9; # OLD HUNGARIAN CAPITAL LETTER E +10C8A; C; 10CCA; # OLD HUNGARIAN CAPITAL LETTER CLOSE E +10C8B; C; 10CCB; # OLD HUNGARIAN CAPITAL LETTER EE +10C8C; C; 10CCC; # OLD HUNGARIAN CAPITAL LETTER EF +10C8D; C; 10CCD; # OLD HUNGARIAN CAPITAL LETTER EG +10C8E; C; 10CCE; # OLD HUNGARIAN CAPITAL LETTER EGY +10C8F; C; 10CCF; # OLD HUNGARIAN CAPITAL LETTER EH +10C90; C; 10CD0; # OLD HUNGARIAN CAPITAL LETTER I +10C91; C; 10CD1; # OLD HUNGARIAN CAPITAL LETTER II +10C92; C; 10CD2; # OLD HUNGARIAN CAPITAL LETTER EJ +10C93; C; 10CD3; # OLD HUNGARIAN CAPITAL LETTER EK +10C94; C; 10CD4; # OLD HUNGARIAN CAPITAL LETTER AK +10C95; C; 10CD5; # OLD HUNGARIAN CAPITAL LETTER UNK +10C96; C; 10CD6; # OLD HUNGARIAN CAPITAL LETTER EL +10C97; C; 10CD7; # OLD HUNGARIAN CAPITAL LETTER ELY +10C98; C; 10CD8; # OLD HUNGARIAN CAPITAL LETTER EM +10C99; C; 10CD9; # OLD HUNGARIAN CAPITAL LETTER EN +10C9A; C; 10CDA; # OLD HUNGARIAN CAPITAL LETTER ENY +10C9B; C; 10CDB; # OLD HUNGARIAN CAPITAL LETTER O +10C9C; C; 10CDC; # OLD HUNGARIAN CAPITAL LETTER OO +10C9D; C; 10CDD; # OLD HUNGARIAN CAPITAL LETTER NIKOLSBURG OE +10C9E; C; 10CDE; # OLD HUNGARIAN CAPITAL LETTER RUDIMENTA OE +10C9F; C; 10CDF; # OLD HUNGARIAN CAPITAL LETTER OEE +10CA0; C; 10CE0; # OLD HUNGARIAN CAPITAL LETTER EP +10CA1; C; 10CE1; # OLD HUNGARIAN CAPITAL LETTER EMP +10CA2; C; 10CE2; # OLD HUNGARIAN CAPITAL LETTER ER +10CA3; C; 10CE3; # OLD HUNGARIAN CAPITAL LETTER SHORT ER +10CA4; C; 10CE4; # OLD HUNGARIAN CAPITAL LETTER ES +10CA5; C; 10CE5; # OLD HUNGARIAN CAPITAL LETTER ESZ +10CA6; C; 10CE6; # OLD HUNGARIAN CAPITAL LETTER ET +10CA7; C; 10CE7; # OLD HUNGARIAN 
CAPITAL LETTER ENT +10CA8; C; 10CE8; # OLD HUNGARIAN CAPITAL LETTER ETY +10CA9; C; 10CE9; # OLD HUNGARIAN CAPITAL LETTER ECH +10CAA; C; 10CEA; # OLD HUNGARIAN CAPITAL LETTER U +10CAB; C; 10CEB; # OLD HUNGARIAN CAPITAL LETTER UU +10CAC; C; 10CEC; # OLD HUNGARIAN CAPITAL LETTER NIKOLSBURG UE +10CAD; C; 10CED; # OLD HUNGARIAN CAPITAL LETTER RUDIMENTA UE +10CAE; C; 10CEE; # OLD HUNGARIAN CAPITAL LETTER EV +10CAF; C; 10CEF; # OLD HUNGARIAN CAPITAL LETTER EZ +10CB0; C; 10CF0; # OLD HUNGARIAN CAPITAL LETTER EZS +10CB1; C; 10CF1; # OLD HUNGARIAN CAPITAL LETTER ENT-SHAPED SIGN +10CB2; C; 10CF2; # OLD HUNGARIAN CAPITAL LETTER US +118A0; C; 118C0; # WARANG CITI CAPITAL LETTER NGAA +118A1; C; 118C1; # WARANG CITI CAPITAL LETTER A +118A2; C; 118C2; # WARANG CITI CAPITAL LETTER WI +118A3; C; 118C3; # WARANG CITI CAPITAL LETTER YU +118A4; C; 118C4; # WARANG CITI CAPITAL LETTER YA +118A5; C; 118C5; # WARANG CITI CAPITAL LETTER YO +118A6; C; 118C6; # WARANG CITI CAPITAL LETTER II +118A7; C; 118C7; # WARANG CITI CAPITAL LETTER UU +118A8; C; 118C8; # WARANG CITI CAPITAL LETTER E +118A9; C; 118C9; # WARANG CITI CAPITAL LETTER O +118AA; C; 118CA; # WARANG CITI CAPITAL LETTER ANG +118AB; C; 118CB; # WARANG CITI CAPITAL LETTER GA +118AC; C; 118CC; # WARANG CITI CAPITAL LETTER KO +118AD; C; 118CD; # WARANG CITI CAPITAL LETTER ENY +118AE; C; 118CE; # WARANG CITI CAPITAL LETTER YUJ +118AF; C; 118CF; # WARANG CITI CAPITAL LETTER UC +118B0; C; 118D0; # WARANG CITI CAPITAL LETTER ENN +118B1; C; 118D1; # WARANG CITI CAPITAL LETTER ODD +118B2; C; 118D2; # WARANG CITI CAPITAL LETTER TTE +118B3; C; 118D3; # WARANG CITI CAPITAL LETTER NUNG +118B4; C; 118D4; # WARANG CITI CAPITAL LETTER DA +118B5; C; 118D5; # WARANG CITI CAPITAL LETTER AT +118B6; C; 118D6; # WARANG CITI CAPITAL LETTER AM +118B7; C; 118D7; # WARANG CITI CAPITAL LETTER BU +118B8; C; 118D8; # WARANG CITI CAPITAL LETTER PU +118B9; C; 118D9; # WARANG CITI CAPITAL LETTER HIYO +118BA; C; 118DA; # WARANG CITI CAPITAL LETTER HOLO +118BB; C; 
118DB; # WARANG CITI CAPITAL LETTER HORR +118BC; C; 118DC; # WARANG CITI CAPITAL LETTER HAR +118BD; C; 118DD; # WARANG CITI CAPITAL LETTER SSUU +118BE; C; 118DE; # WARANG CITI CAPITAL LETTER SII +118BF; C; 118DF; # WARANG CITI CAPITAL LETTER VIYO +16E40; C; 16E60; # MEDEFAIDRIN CAPITAL LETTER M +16E41; C; 16E61; # MEDEFAIDRIN CAPITAL LETTER S +16E42; C; 16E62; # MEDEFAIDRIN CAPITAL LETTER V +16E43; C; 16E63; # MEDEFAIDRIN CAPITAL LETTER W +16E44; C; 16E64; # MEDEFAIDRIN CAPITAL LETTER ATIU +16E45; C; 16E65; # MEDEFAIDRIN CAPITAL LETTER Z +16E46; C; 16E66; # MEDEFAIDRIN CAPITAL LETTER KP +16E47; C; 16E67; # MEDEFAIDRIN CAPITAL LETTER P +16E48; C; 16E68; # MEDEFAIDRIN CAPITAL LETTER T +16E49; C; 16E69; # MEDEFAIDRIN CAPITAL LETTER G +16E4A; C; 16E6A; # MEDEFAIDRIN CAPITAL LETTER F +16E4B; C; 16E6B; # MEDEFAIDRIN CAPITAL LETTER I +16E4C; C; 16E6C; # MEDEFAIDRIN CAPITAL LETTER K +16E4D; C; 16E6D; # MEDEFAIDRIN CAPITAL LETTER A +16E4E; C; 16E6E; # MEDEFAIDRIN CAPITAL LETTER J +16E4F; C; 16E6F; # MEDEFAIDRIN CAPITAL LETTER E +16E50; C; 16E70; # MEDEFAIDRIN CAPITAL LETTER B +16E51; C; 16E71; # MEDEFAIDRIN CAPITAL LETTER C +16E52; C; 16E72; # MEDEFAIDRIN CAPITAL LETTER U +16E53; C; 16E73; # MEDEFAIDRIN CAPITAL LETTER YU +16E54; C; 16E74; # MEDEFAIDRIN CAPITAL LETTER L +16E55; C; 16E75; # MEDEFAIDRIN CAPITAL LETTER Q +16E56; C; 16E76; # MEDEFAIDRIN CAPITAL LETTER HP +16E57; C; 16E77; # MEDEFAIDRIN CAPITAL LETTER NY +16E58; C; 16E78; # MEDEFAIDRIN CAPITAL LETTER X +16E59; C; 16E79; # MEDEFAIDRIN CAPITAL LETTER D +16E5A; C; 16E7A; # MEDEFAIDRIN CAPITAL LETTER OE +16E5B; C; 16E7B; # MEDEFAIDRIN CAPITAL LETTER N +16E5C; C; 16E7C; # MEDEFAIDRIN CAPITAL LETTER R +16E5D; C; 16E7D; # MEDEFAIDRIN CAPITAL LETTER O +16E5E; C; 16E7E; # MEDEFAIDRIN CAPITAL LETTER AI +16E5F; C; 16E7F; # MEDEFAIDRIN CAPITAL LETTER Y +1E900; C; 1E922; # ADLAM CAPITAL LETTER ALIF +1E901; C; 1E923; # ADLAM CAPITAL LETTER DAALI +1E902; C; 1E924; # ADLAM CAPITAL LETTER LAAM +1E903; C; 1E925; # ADLAM CAPITAL 
LETTER MIIM +1E904; C; 1E926; # ADLAM CAPITAL LETTER BA +1E905; C; 1E927; # ADLAM CAPITAL LETTER SINNYIIYHE +1E906; C; 1E928; # ADLAM CAPITAL LETTER PE +1E907; C; 1E929; # ADLAM CAPITAL LETTER BHE +1E908; C; 1E92A; # ADLAM CAPITAL LETTER RA +1E909; C; 1E92B; # ADLAM CAPITAL LETTER E +1E90A; C; 1E92C; # ADLAM CAPITAL LETTER FA +1E90B; C; 1E92D; # ADLAM CAPITAL LETTER I +1E90C; C; 1E92E; # ADLAM CAPITAL LETTER O +1E90D; C; 1E92F; # ADLAM CAPITAL LETTER DHA +1E90E; C; 1E930; # ADLAM CAPITAL LETTER YHE +1E90F; C; 1E931; # ADLAM CAPITAL LETTER WAW +1E910; C; 1E932; # ADLAM CAPITAL LETTER NUN +1E911; C; 1E933; # ADLAM CAPITAL LETTER KAF +1E912; C; 1E934; # ADLAM CAPITAL LETTER YA +1E913; C; 1E935; # ADLAM CAPITAL LETTER U +1E914; C; 1E936; # ADLAM CAPITAL LETTER JIIM +1E915; C; 1E937; # ADLAM CAPITAL LETTER CHI +1E916; C; 1E938; # ADLAM CAPITAL LETTER HA +1E917; C; 1E939; # ADLAM CAPITAL LETTER QAAF +1E918; C; 1E93A; # ADLAM CAPITAL LETTER GA +1E919; C; 1E93B; # ADLAM CAPITAL LETTER NYA +1E91A; C; 1E93C; # ADLAM CAPITAL LETTER TU +1E91B; C; 1E93D; # ADLAM CAPITAL LETTER NHA +1E91C; C; 1E93E; # ADLAM CAPITAL LETTER VA +1E91D; C; 1E93F; # ADLAM CAPITAL LETTER KHA +1E91E; C; 1E940; # ADLAM CAPITAL LETTER GBE +1E91F; C; 1E941; # ADLAM CAPITAL LETTER ZAL +1E920; C; 1E942; # ADLAM CAPITAL LETTER KPO +1E921; C; 1E943; # ADLAM CAPITAL LETTER SHA +# +# EOF diff --git a/scripts/unicode/DerivedGeneralCategory.txt b/scripts/unicode/DerivedGeneralCategory.txt new file mode 100644 index 0000000..21a66ee --- /dev/null +++ b/scripts/unicode/DerivedGeneralCategory.txt @@ -0,0 +1,4045 @@ +# DerivedGeneralCategory-12.1.0.txt +# Date: 2019-03-10, 10:53:08 GMT +# © 2019 Unicode®, Inc. +# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. 
+# For terms of use, see http://www.unicode.org/terms_of_use.html +# +# Unicode Character Database +# For documentation, see http://www.unicode.org/reports/tr44/ + +# ================================================ + +# Property: General_Category + +# ================================================ + +# General_Category=Unassigned + +0378..0379 ; Cn # [2] .. +0380..0383 ; Cn # [4] .. +038B ; Cn # +038D ; Cn # +03A2 ; Cn # +0530 ; Cn # +0557..0558 ; Cn # [2] .. +058B..058C ; Cn # [2] .. +0590 ; Cn # +05C8..05CF ; Cn # [8] .. +05EB..05EE ; Cn # [4] .. +05F5..05FF ; Cn # [11] .. +061D ; Cn # +070E ; Cn # +074B..074C ; Cn # [2] .. +07B2..07BF ; Cn # [14] .. +07FB..07FC ; Cn # [2] .. +082E..082F ; Cn # [2] .. +083F ; Cn # +085C..085D ; Cn # [2] .. +085F ; Cn # +086B..089F ; Cn # [53] .. +08B5 ; Cn # +08BE..08D2 ; Cn # [21] .. +0984 ; Cn # +098D..098E ; Cn # [2] .. +0991..0992 ; Cn # [2] .. +09A9 ; Cn # +09B1 ; Cn # +09B3..09B5 ; Cn # [3] .. +09BA..09BB ; Cn # [2] .. +09C5..09C6 ; Cn # [2] .. +09C9..09CA ; Cn # [2] .. +09CF..09D6 ; Cn # [8] .. +09D8..09DB ; Cn # [4] .. +09DE ; Cn # +09E4..09E5 ; Cn # [2] .. +09FF..0A00 ; Cn # [2] .. +0A04 ; Cn # +0A0B..0A0E ; Cn # [4] .. +0A11..0A12 ; Cn # [2] .. +0A29 ; Cn # +0A31 ; Cn # +0A34 ; Cn # +0A37 ; Cn # +0A3A..0A3B ; Cn # [2] .. +0A3D ; Cn # +0A43..0A46 ; Cn # [4] .. +0A49..0A4A ; Cn # [2] .. +0A4E..0A50 ; Cn # [3] .. +0A52..0A58 ; Cn # [7] .. +0A5D ; Cn # +0A5F..0A65 ; Cn # [7] .. +0A77..0A80 ; Cn # [10] .. +0A84 ; Cn # +0A8E ; Cn # +0A92 ; Cn # +0AA9 ; Cn # +0AB1 ; Cn # +0AB4 ; Cn # +0ABA..0ABB ; Cn # [2] .. +0AC6 ; Cn # +0ACA ; Cn # +0ACE..0ACF ; Cn # [2] .. +0AD1..0ADF ; Cn # [15] .. +0AE4..0AE5 ; Cn # [2] .. +0AF2..0AF8 ; Cn # [7] .. +0B00 ; Cn # +0B04 ; Cn # +0B0D..0B0E ; Cn # [2] .. +0B11..0B12 ; Cn # [2] .. +0B29 ; Cn # +0B31 ; Cn # +0B34 ; Cn # +0B3A..0B3B ; Cn # [2] .. +0B45..0B46 ; Cn # [2] .. +0B49..0B4A ; Cn # [2] .. +0B4E..0B55 ; Cn # [8] .. +0B58..0B5B ; Cn # [4] .. +0B5E ; Cn # +0B64..0B65 ; Cn # [2] .. 
+0B78..0B81 ; Cn # [10] .. +0B84 ; Cn # +0B8B..0B8D ; Cn # [3] .. +0B91 ; Cn # +0B96..0B98 ; Cn # [3] .. +0B9B ; Cn # +0B9D ; Cn # +0BA0..0BA2 ; Cn # [3] .. +0BA5..0BA7 ; Cn # [3] .. +0BAB..0BAD ; Cn # [3] .. +0BBA..0BBD ; Cn # [4] .. +0BC3..0BC5 ; Cn # [3] .. +0BC9 ; Cn # +0BCE..0BCF ; Cn # [2] .. +0BD1..0BD6 ; Cn # [6] .. +0BD8..0BE5 ; Cn # [14] .. +0BFB..0BFF ; Cn # [5] .. +0C0D ; Cn # +0C11 ; Cn # +0C29 ; Cn # +0C3A..0C3C ; Cn # [3] .. +0C45 ; Cn # +0C49 ; Cn # +0C4E..0C54 ; Cn # [7] .. +0C57 ; Cn # +0C5B..0C5F ; Cn # [5] .. +0C64..0C65 ; Cn # [2] .. +0C70..0C76 ; Cn # [7] .. +0C8D ; Cn # +0C91 ; Cn # +0CA9 ; Cn # +0CB4 ; Cn # +0CBA..0CBB ; Cn # [2] .. +0CC5 ; Cn # +0CC9 ; Cn # +0CCE..0CD4 ; Cn # [7] .. +0CD7..0CDD ; Cn # [7] .. +0CDF ; Cn # +0CE4..0CE5 ; Cn # [2] .. +0CF0 ; Cn # +0CF3..0CFF ; Cn # [13] .. +0D04 ; Cn # +0D0D ; Cn # +0D11 ; Cn # +0D45 ; Cn # +0D49 ; Cn # +0D50..0D53 ; Cn # [4] .. +0D64..0D65 ; Cn # [2] .. +0D80..0D81 ; Cn # [2] .. +0D84 ; Cn # +0D97..0D99 ; Cn # [3] .. +0DB2 ; Cn # +0DBC ; Cn # +0DBE..0DBF ; Cn # [2] .. +0DC7..0DC9 ; Cn # [3] .. +0DCB..0DCE ; Cn # [4] .. +0DD5 ; Cn # +0DD7 ; Cn # +0DE0..0DE5 ; Cn # [6] .. +0DF0..0DF1 ; Cn # [2] .. +0DF5..0E00 ; Cn # [12] .. +0E3B..0E3E ; Cn # [4] .. +0E5C..0E80 ; Cn # [37] .. +0E83 ; Cn # +0E85 ; Cn # +0E8B ; Cn # +0EA4 ; Cn # +0EA6 ; Cn # +0EBE..0EBF ; Cn # [2] .. +0EC5 ; Cn # +0EC7 ; Cn # +0ECE..0ECF ; Cn # [2] .. +0EDA..0EDB ; Cn # [2] .. +0EE0..0EFF ; Cn # [32] .. +0F48 ; Cn # +0F6D..0F70 ; Cn # [4] .. +0F98 ; Cn # +0FBD ; Cn # +0FCD ; Cn # +0FDB..0FFF ; Cn # [37] .. +10C6 ; Cn # +10C8..10CC ; Cn # [5] .. +10CE..10CF ; Cn # [2] .. +1249 ; Cn # +124E..124F ; Cn # [2] .. +1257 ; Cn # +1259 ; Cn # +125E..125F ; Cn # [2] .. +1289 ; Cn # +128E..128F ; Cn # [2] .. +12B1 ; Cn # +12B6..12B7 ; Cn # [2] .. +12BF ; Cn # +12C1 ; Cn # +12C6..12C7 ; Cn # [2] .. +12D7 ; Cn # +1311 ; Cn # +1316..1317 ; Cn # [2] .. +135B..135C ; Cn # [2] .. +137D..137F ; Cn # [3] .. +139A..139F ; Cn # [6] .. 
+13F6..13F7 ; Cn # [2] .. +13FE..13FF ; Cn # [2] .. +169D..169F ; Cn # [3] .. +16F9..16FF ; Cn # [7] .. +170D ; Cn # +1715..171F ; Cn # [11] .. +1737..173F ; Cn # [9] .. +1754..175F ; Cn # [12] .. +176D ; Cn # +1771 ; Cn # +1774..177F ; Cn # [12] .. +17DE..17DF ; Cn # [2] .. +17EA..17EF ; Cn # [6] .. +17FA..17FF ; Cn # [6] .. +180F ; Cn # +181A..181F ; Cn # [6] .. +1879..187F ; Cn # [7] .. +18AB..18AF ; Cn # [5] .. +18F6..18FF ; Cn # [10] .. +191F ; Cn # +192C..192F ; Cn # [4] .. +193C..193F ; Cn # [4] .. +1941..1943 ; Cn # [3] .. +196E..196F ; Cn # [2] .. +1975..197F ; Cn # [11] .. +19AC..19AF ; Cn # [4] .. +19CA..19CF ; Cn # [6] .. +19DB..19DD ; Cn # [3] .. +1A1C..1A1D ; Cn # [2] .. +1A5F ; Cn # +1A7D..1A7E ; Cn # [2] .. +1A8A..1A8F ; Cn # [6] .. +1A9A..1A9F ; Cn # [6] .. +1AAE..1AAF ; Cn # [2] .. +1ABF..1AFF ; Cn # [65] .. +1B4C..1B4F ; Cn # [4] .. +1B7D..1B7F ; Cn # [3] .. +1BF4..1BFB ; Cn # [8] .. +1C38..1C3A ; Cn # [3] .. +1C4A..1C4C ; Cn # [3] .. +1C89..1C8F ; Cn # [7] .. +1CBB..1CBC ; Cn # [2] .. +1CC8..1CCF ; Cn # [8] .. +1CFB..1CFF ; Cn # [5] .. +1DFA ; Cn # +1F16..1F17 ; Cn # [2] .. +1F1E..1F1F ; Cn # [2] .. +1F46..1F47 ; Cn # [2] .. +1F4E..1F4F ; Cn # [2] .. +1F58 ; Cn # +1F5A ; Cn # +1F5C ; Cn # +1F5E ; Cn # +1F7E..1F7F ; Cn # [2] .. +1FB5 ; Cn # +1FC5 ; Cn # +1FD4..1FD5 ; Cn # [2] .. +1FDC ; Cn # +1FF0..1FF1 ; Cn # [2] .. +1FF5 ; Cn # +1FFF ; Cn # +2065 ; Cn # +2072..2073 ; Cn # [2] .. +208F ; Cn # +209D..209F ; Cn # [3] .. +20C0..20CF ; Cn # [16] .. +20F1..20FF ; Cn # [15] .. +218C..218F ; Cn # [4] .. +2427..243F ; Cn # [25] .. +244B..245F ; Cn # [21] .. +2B74..2B75 ; Cn # [2] .. +2B96..2B97 ; Cn # [2] .. +2C2F ; Cn # +2C5F ; Cn # +2CF4..2CF8 ; Cn # [5] .. +2D26 ; Cn # +2D28..2D2C ; Cn # [5] .. +2D2E..2D2F ; Cn # [2] .. +2D68..2D6E ; Cn # [7] .. +2D71..2D7E ; Cn # [14] .. +2D97..2D9F ; Cn # [9] .. +2DA7 ; Cn # +2DAF ; Cn # +2DB7 ; Cn # +2DBF ; Cn # +2DC7 ; Cn # +2DCF ; Cn # +2DD7 ; Cn # +2DDF ; Cn # +2E50..2E7F ; Cn # [48] .. 
+2E9A ; Cn # +2EF4..2EFF ; Cn # [12] .. +2FD6..2FEF ; Cn # [26] .. +2FFC..2FFF ; Cn # [4] .. +3040 ; Cn # +3097..3098 ; Cn # [2] .. +3100..3104 ; Cn # [5] .. +3130 ; Cn # +318F ; Cn # +31BB..31BF ; Cn # [5] .. +31E4..31EF ; Cn # [12] .. +321F ; Cn # +4DB6..4DBF ; Cn # [10] .. +9FF0..9FFF ; Cn # [16] .. +A48D..A48F ; Cn # [3] .. +A4C7..A4CF ; Cn # [9] .. +A62C..A63F ; Cn # [20] .. +A6F8..A6FF ; Cn # [8] .. +A7C0..A7C1 ; Cn # [2] .. +A7C7..A7F6 ; Cn # [48] .. +A82C..A82F ; Cn # [4] .. +A83A..A83F ; Cn # [6] .. +A878..A87F ; Cn # [8] .. +A8C6..A8CD ; Cn # [8] .. +A8DA..A8DF ; Cn # [6] .. +A954..A95E ; Cn # [11] .. +A97D..A97F ; Cn # [3] .. +A9CE ; Cn # +A9DA..A9DD ; Cn # [4] .. +A9FF ; Cn # +AA37..AA3F ; Cn # [9] .. +AA4E..AA4F ; Cn # [2] .. +AA5A..AA5B ; Cn # [2] .. +AAC3..AADA ; Cn # [24] .. +AAF7..AB00 ; Cn # [10] .. +AB07..AB08 ; Cn # [2] .. +AB0F..AB10 ; Cn # [2] .. +AB17..AB1F ; Cn # [9] .. +AB27 ; Cn # +AB2F ; Cn # +AB68..AB6F ; Cn # [8] .. +ABEE..ABEF ; Cn # [2] .. +ABFA..ABFF ; Cn # [6] .. +D7A4..D7AF ; Cn # [12] .. +D7C7..D7CA ; Cn # [4] .. +D7FC..D7FF ; Cn # [4] .. +FA6E..FA6F ; Cn # [2] .. +FADA..FAFF ; Cn # [38] .. +FB07..FB12 ; Cn # [12] .. +FB18..FB1C ; Cn # [5] .. +FB37 ; Cn # +FB3D ; Cn # +FB3F ; Cn # +FB42 ; Cn # +FB45 ; Cn # +FBC2..FBD2 ; Cn # [17] .. +FD40..FD4F ; Cn # [16] .. +FD90..FD91 ; Cn # [2] .. +FDC8..FDEF ; Cn # [40] .. +FDFE..FDFF ; Cn # [2] .. +FE1A..FE1F ; Cn # [6] .. +FE53 ; Cn # +FE67 ; Cn # +FE6C..FE6F ; Cn # [4] .. +FE75 ; Cn # +FEFD..FEFE ; Cn # [2] .. +FF00 ; Cn # +FFBF..FFC1 ; Cn # [3] .. +FFC8..FFC9 ; Cn # [2] .. +FFD0..FFD1 ; Cn # [2] .. +FFD8..FFD9 ; Cn # [2] .. +FFDD..FFDF ; Cn # [3] .. +FFE7 ; Cn # +FFEF..FFF8 ; Cn # [10] .. +FFFE..FFFF ; Cn # [2] .. +1000C ; Cn # +10027 ; Cn # +1003B ; Cn # +1003E ; Cn # +1004E..1004F ; Cn # [2] .. +1005E..1007F ; Cn # [34] .. +100FB..100FF ; Cn # [5] .. +10103..10106 ; Cn # [4] .. +10134..10136 ; Cn # [3] .. +1018F ; Cn # +1019C..1019F ; Cn # [4] .. +101A1..101CF ; Cn # [47] .. 
+101FE..1027F ; Cn # [130] .. +1029D..1029F ; Cn # [3] .. +102D1..102DF ; Cn # [15] .. +102FC..102FF ; Cn # [4] .. +10324..1032C ; Cn # [9] .. +1034B..1034F ; Cn # [5] .. +1037B..1037F ; Cn # [5] .. +1039E ; Cn # +103C4..103C7 ; Cn # [4] .. +103D6..103FF ; Cn # [42] .. +1049E..1049F ; Cn # [2] .. +104AA..104AF ; Cn # [6] .. +104D4..104D7 ; Cn # [4] .. +104FC..104FF ; Cn # [4] .. +10528..1052F ; Cn # [8] .. +10564..1056E ; Cn # [11] .. +10570..105FF ; Cn # [144] .. +10737..1073F ; Cn # [9] .. +10756..1075F ; Cn # [10] .. +10768..107FF ; Cn # [152] .. +10806..10807 ; Cn # [2] .. +10809 ; Cn # +10836 ; Cn # +10839..1083B ; Cn # [3] .. +1083D..1083E ; Cn # [2] .. +10856 ; Cn # +1089F..108A6 ; Cn # [8] .. +108B0..108DF ; Cn # [48] .. +108F3 ; Cn # +108F6..108FA ; Cn # [5] .. +1091C..1091E ; Cn # [3] .. +1093A..1093E ; Cn # [5] .. +10940..1097F ; Cn # [64] .. +109B8..109BB ; Cn # [4] .. +109D0..109D1 ; Cn # [2] .. +10A04 ; Cn # +10A07..10A0B ; Cn # [5] .. +10A14 ; Cn # +10A18 ; Cn # +10A36..10A37 ; Cn # [2] .. +10A3B..10A3E ; Cn # [4] .. +10A49..10A4F ; Cn # [7] .. +10A59..10A5F ; Cn # [7] .. +10AA0..10ABF ; Cn # [32] .. +10AE7..10AEA ; Cn # [4] .. +10AF7..10AFF ; Cn # [9] .. +10B36..10B38 ; Cn # [3] .. +10B56..10B57 ; Cn # [2] .. +10B73..10B77 ; Cn # [5] .. +10B92..10B98 ; Cn # [7] .. +10B9D..10BA8 ; Cn # [12] .. +10BB0..10BFF ; Cn # [80] .. +10C49..10C7F ; Cn # [55] .. +10CB3..10CBF ; Cn # [13] .. +10CF3..10CF9 ; Cn # [7] .. +10D28..10D2F ; Cn # [8] .. +10D3A..10E5F ; Cn # [294] .. +10E7F..10EFF ; Cn # [129] .. +10F28..10F2F ; Cn # [8] .. +10F5A..10FDF ; Cn # [134] .. +10FF7..10FFF ; Cn # [9] .. +1104E..11051 ; Cn # [4] .. +11070..1107E ; Cn # [15] .. +110C2..110CC ; Cn # [11] .. +110CE..110CF ; Cn # [2] .. +110E9..110EF ; Cn # [7] .. +110FA..110FF ; Cn # [6] .. +11135 ; Cn # +11147..1114F ; Cn # [9] .. +11177..1117F ; Cn # [9] .. +111CE..111CF ; Cn # [2] .. +111E0 ; Cn # +111F5..111FF ; Cn # [11] .. +11212 ; Cn # +1123F..1127F ; Cn # [65] .. 
+11287 ; Cn # +11289 ; Cn # +1128E ; Cn # +1129E ; Cn # +112AA..112AF ; Cn # [6] .. +112EB..112EF ; Cn # [5] .. +112FA..112FF ; Cn # [6] .. +11304 ; Cn # +1130D..1130E ; Cn # [2] .. +11311..11312 ; Cn # [2] .. +11329 ; Cn # +11331 ; Cn # +11334 ; Cn # +1133A ; Cn # +11345..11346 ; Cn # [2] .. +11349..1134A ; Cn # [2] .. +1134E..1134F ; Cn # [2] .. +11351..11356 ; Cn # [6] .. +11358..1135C ; Cn # [5] .. +11364..11365 ; Cn # [2] .. +1136D..1136F ; Cn # [3] .. +11375..113FF ; Cn # [139] .. +1145A ; Cn # +1145C ; Cn # +11460..1147F ; Cn # [32] .. +114C8..114CF ; Cn # [8] .. +114DA..1157F ; Cn # [166] .. +115B6..115B7 ; Cn # [2] .. +115DE..115FF ; Cn # [34] .. +11645..1164F ; Cn # [11] .. +1165A..1165F ; Cn # [6] .. +1166D..1167F ; Cn # [19] .. +116B9..116BF ; Cn # [7] .. +116CA..116FF ; Cn # [54] .. +1171B..1171C ; Cn # [2] .. +1172C..1172F ; Cn # [4] .. +11740..117FF ; Cn # [192] .. +1183C..1189F ; Cn # [100] .. +118F3..118FE ; Cn # [12] .. +11900..1199F ; Cn # [160] .. +119A8..119A9 ; Cn # [2] .. +119D8..119D9 ; Cn # [2] .. +119E5..119FF ; Cn # [27] .. +11A48..11A4F ; Cn # [8] .. +11AA3..11ABF ; Cn # [29] .. +11AF9..11BFF ; Cn # [263] .. +11C09 ; Cn # +11C37 ; Cn # +11C46..11C4F ; Cn # [10] .. +11C6D..11C6F ; Cn # [3] .. +11C90..11C91 ; Cn # [2] .. +11CA8 ; Cn # +11CB7..11CFF ; Cn # [73] .. +11D07 ; Cn # +11D0A ; Cn # +11D37..11D39 ; Cn # [3] .. +11D3B ; Cn # +11D3E ; Cn # +11D48..11D4F ; Cn # [8] .. +11D5A..11D5F ; Cn # [6] .. +11D66 ; Cn # +11D69 ; Cn # +11D8F ; Cn # +11D92 ; Cn # +11D99..11D9F ; Cn # [7] .. +11DAA..11EDF ; Cn # [310] .. +11EF9..11FBF ; Cn # [199] .. +11FF2..11FFE ; Cn # [13] .. +1239A..123FF ; Cn # [102] .. +1246F ; Cn # +12475..1247F ; Cn # [11] .. +12544..12FFF ; Cn # [2748] .. +1342F ; Cn # +13439..143FF ; Cn # [4039] .. +14647..167FF ; Cn # [8633] .. +16A39..16A3F ; Cn # [7] .. +16A5F ; Cn # +16A6A..16A6D ; Cn # [4] .. +16A70..16ACF ; Cn # [96] .. +16AEE..16AEF ; Cn # [2] .. +16AF6..16AFF ; Cn # [10] .. +16B46..16B4F ; Cn # [10] .. 
+16B5A ; Cn # +16B62 ; Cn # +16B78..16B7C ; Cn # [5] .. +16B90..16E3F ; Cn # [688] .. +16E9B..16EFF ; Cn # [101] .. +16F4B..16F4E ; Cn # [4] .. +16F88..16F8E ; Cn # [7] .. +16FA0..16FDF ; Cn # [64] .. +16FE4..16FFF ; Cn # [28] .. +187F8..187FF ; Cn # [8] .. +18AF3..1AFFF ; Cn # [9485] .. +1B11F..1B14F ; Cn # [49] .. +1B153..1B163 ; Cn # [17] .. +1B168..1B16F ; Cn # [8] .. +1B2FC..1BBFF ; Cn # [2308] .. +1BC6B..1BC6F ; Cn # [5] .. +1BC7D..1BC7F ; Cn # [3] .. +1BC89..1BC8F ; Cn # [7] .. +1BC9A..1BC9B ; Cn # [2] .. +1BCA4..1CFFF ; Cn # [4956] .. +1D0F6..1D0FF ; Cn # [10] .. +1D127..1D128 ; Cn # [2] .. +1D1E9..1D1FF ; Cn # [23] .. +1D246..1D2DF ; Cn # [154] .. +1D2F4..1D2FF ; Cn # [12] .. +1D357..1D35F ; Cn # [9] .. +1D379..1D3FF ; Cn # [135] .. +1D455 ; Cn # +1D49D ; Cn # +1D4A0..1D4A1 ; Cn # [2] .. +1D4A3..1D4A4 ; Cn # [2] .. +1D4A7..1D4A8 ; Cn # [2] .. +1D4AD ; Cn # +1D4BA ; Cn # +1D4BC ; Cn # +1D4C4 ; Cn # +1D506 ; Cn # +1D50B..1D50C ; Cn # [2] .. +1D515 ; Cn # +1D51D ; Cn # +1D53A ; Cn # +1D53F ; Cn # +1D545 ; Cn # +1D547..1D549 ; Cn # [3] .. +1D551 ; Cn # +1D6A6..1D6A7 ; Cn # [2] .. +1D7CC..1D7CD ; Cn # [2] .. +1DA8C..1DA9A ; Cn # [15] .. +1DAA0 ; Cn # +1DAB0..1DFFF ; Cn # [1360] .. +1E007 ; Cn # +1E019..1E01A ; Cn # [2] .. +1E022 ; Cn # +1E025 ; Cn # +1E02B..1E0FF ; Cn # [213] .. +1E12D..1E12F ; Cn # [3] .. +1E13E..1E13F ; Cn # [2] .. +1E14A..1E14D ; Cn # [4] .. +1E150..1E2BF ; Cn # [368] .. +1E2FA..1E2FE ; Cn # [5] .. +1E300..1E7FF ; Cn # [1280] .. +1E8C5..1E8C6 ; Cn # [2] .. +1E8D7..1E8FF ; Cn # [41] .. +1E94C..1E94F ; Cn # [4] .. +1E95A..1E95D ; Cn # [4] .. +1E960..1EC70 ; Cn # [785] .. +1ECB5..1ED00 ; Cn # [76] .. +1ED3E..1EDFF ; Cn # [194] .. +1EE04 ; Cn # +1EE20 ; Cn # +1EE23 ; Cn # +1EE25..1EE26 ; Cn # [2] .. +1EE28 ; Cn # +1EE33 ; Cn # +1EE38 ; Cn # +1EE3A ; Cn # +1EE3C..1EE41 ; Cn # [6] .. +1EE43..1EE46 ; Cn # [4] .. +1EE48 ; Cn # +1EE4A ; Cn # +1EE4C ; Cn # +1EE50 ; Cn # +1EE53 ; Cn # +1EE55..1EE56 ; Cn # [2] .. 
+1EE58 ; Cn # +1EE5A ; Cn # +1EE5C ; Cn # +1EE5E ; Cn # +1EE60 ; Cn # +1EE63 ; Cn # +1EE65..1EE66 ; Cn # [2] .. +1EE6B ; Cn # +1EE73 ; Cn # +1EE78 ; Cn # +1EE7D ; Cn # +1EE7F ; Cn # +1EE8A ; Cn # +1EE9C..1EEA0 ; Cn # [5] .. +1EEA4 ; Cn # +1EEAA ; Cn # +1EEBC..1EEEF ; Cn # [52] .. +1EEF2..1EFFF ; Cn # [270] .. +1F02C..1F02F ; Cn # [4] .. +1F094..1F09F ; Cn # [12] .. +1F0AF..1F0B0 ; Cn # [2] .. +1F0C0 ; Cn # +1F0D0 ; Cn # +1F0F6..1F0FF ; Cn # [10] .. +1F10D..1F10F ; Cn # [3] .. +1F16D..1F16F ; Cn # [3] .. +1F1AD..1F1E5 ; Cn # [57] .. +1F203..1F20F ; Cn # [13] .. +1F23C..1F23F ; Cn # [4] .. +1F249..1F24F ; Cn # [7] .. +1F252..1F25F ; Cn # [14] .. +1F266..1F2FF ; Cn # [154] .. +1F6D6..1F6DF ; Cn # [10] .. +1F6ED..1F6EF ; Cn # [3] .. +1F6FB..1F6FF ; Cn # [5] .. +1F774..1F77F ; Cn # [12] .. +1F7D9..1F7DF ; Cn # [7] .. +1F7EC..1F7FF ; Cn # [20] .. +1F80C..1F80F ; Cn # [4] .. +1F848..1F84F ; Cn # [8] .. +1F85A..1F85F ; Cn # [6] .. +1F888..1F88F ; Cn # [8] .. +1F8AE..1F8FF ; Cn # [82] .. +1F90C ; Cn # +1F972 ; Cn # +1F977..1F979 ; Cn # [3] .. +1F9A3..1F9A4 ; Cn # [2] .. +1F9AB..1F9AD ; Cn # [3] .. +1F9CB..1F9CC ; Cn # [2] .. +1FA54..1FA5F ; Cn # [12] .. +1FA6E..1FA6F ; Cn # [2] .. +1FA74..1FA77 ; Cn # [4] .. +1FA7B..1FA7F ; Cn # [5] .. +1FA83..1FA8F ; Cn # [13] .. +1FA96..1FFFF ; Cn # [1386] .. +2A6D7..2A6FF ; Cn # [41] .. +2B735..2B73F ; Cn # [11] .. +2B81E..2B81F ; Cn # [2] .. +2CEA2..2CEAF ; Cn # [14] .. +2EBE1..2F7FF ; Cn # [3103] .. +2FA1E..E0000 ; Cn # [722403] .. +E0002..E001F ; Cn # [30] .. +E0080..E00FF ; Cn # [128] .. +E01F0..EFFFF ; Cn # [65040] .. +FFFFE..FFFFF ; Cn # [2] .. +10FFFE..10FFFF; Cn # [2] .. 
+ +# Total code points: 836602 + +# ================================================ + +# General_Category=Uppercase_Letter + +0041..005A ; Lu # [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z +00C0..00D6 ; Lu # [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS +00D8..00DE ; Lu # [7] LATIN CAPITAL LETTER O WITH STROKE..LATIN CAPITAL LETTER THORN +0100 ; Lu # LATIN CAPITAL LETTER A WITH MACRON +0102 ; Lu # LATIN CAPITAL LETTER A WITH BREVE +0104 ; Lu # LATIN CAPITAL LETTER A WITH OGONEK +0106 ; Lu # LATIN CAPITAL LETTER C WITH ACUTE +0108 ; Lu # LATIN CAPITAL LETTER C WITH CIRCUMFLEX +010A ; Lu # LATIN CAPITAL LETTER C WITH DOT ABOVE +010C ; Lu # LATIN CAPITAL LETTER C WITH CARON +010E ; Lu # LATIN CAPITAL LETTER D WITH CARON +0110 ; Lu # LATIN CAPITAL LETTER D WITH STROKE +0112 ; Lu # LATIN CAPITAL LETTER E WITH MACRON +0114 ; Lu # LATIN CAPITAL LETTER E WITH BREVE +0116 ; Lu # LATIN CAPITAL LETTER E WITH DOT ABOVE +0118 ; Lu # LATIN CAPITAL LETTER E WITH OGONEK +011A ; Lu # LATIN CAPITAL LETTER E WITH CARON +011C ; Lu # LATIN CAPITAL LETTER G WITH CIRCUMFLEX +011E ; Lu # LATIN CAPITAL LETTER G WITH BREVE +0120 ; Lu # LATIN CAPITAL LETTER G WITH DOT ABOVE +0122 ; Lu # LATIN CAPITAL LETTER G WITH CEDILLA +0124 ; Lu # LATIN CAPITAL LETTER H WITH CIRCUMFLEX +0126 ; Lu # LATIN CAPITAL LETTER H WITH STROKE +0128 ; Lu # LATIN CAPITAL LETTER I WITH TILDE +012A ; Lu # LATIN CAPITAL LETTER I WITH MACRON +012C ; Lu # LATIN CAPITAL LETTER I WITH BREVE +012E ; Lu # LATIN CAPITAL LETTER I WITH OGONEK +0130 ; Lu # LATIN CAPITAL LETTER I WITH DOT ABOVE +0132 ; Lu # LATIN CAPITAL LIGATURE IJ +0134 ; Lu # LATIN CAPITAL LETTER J WITH CIRCUMFLEX +0136 ; Lu # LATIN CAPITAL LETTER K WITH CEDILLA +0139 ; Lu # LATIN CAPITAL LETTER L WITH ACUTE +013B ; Lu # LATIN CAPITAL LETTER L WITH CEDILLA +013D ; Lu # LATIN CAPITAL LETTER L WITH CARON +013F ; Lu # LATIN CAPITAL LETTER L WITH MIDDLE DOT +0141 ; Lu # LATIN CAPITAL LETTER L WITH STROKE +0143 ; Lu # LATIN 
CAPITAL LETTER N WITH ACUTE +0145 ; Lu # LATIN CAPITAL LETTER N WITH CEDILLA +0147 ; Lu # LATIN CAPITAL LETTER N WITH CARON +014A ; Lu # LATIN CAPITAL LETTER ENG +014C ; Lu # LATIN CAPITAL LETTER O WITH MACRON +014E ; Lu # LATIN CAPITAL LETTER O WITH BREVE +0150 ; Lu # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE +0152 ; Lu # LATIN CAPITAL LIGATURE OE +0154 ; Lu # LATIN CAPITAL LETTER R WITH ACUTE +0156 ; Lu # LATIN CAPITAL LETTER R WITH CEDILLA +0158 ; Lu # LATIN CAPITAL LETTER R WITH CARON +015A ; Lu # LATIN CAPITAL LETTER S WITH ACUTE +015C ; Lu # LATIN CAPITAL LETTER S WITH CIRCUMFLEX +015E ; Lu # LATIN CAPITAL LETTER S WITH CEDILLA +0160 ; Lu # LATIN CAPITAL LETTER S WITH CARON +0162 ; Lu # LATIN CAPITAL LETTER T WITH CEDILLA +0164 ; Lu # LATIN CAPITAL LETTER T WITH CARON +0166 ; Lu # LATIN CAPITAL LETTER T WITH STROKE +0168 ; Lu # LATIN CAPITAL LETTER U WITH TILDE +016A ; Lu # LATIN CAPITAL LETTER U WITH MACRON +016C ; Lu # LATIN CAPITAL LETTER U WITH BREVE +016E ; Lu # LATIN CAPITAL LETTER U WITH RING ABOVE +0170 ; Lu # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE +0172 ; Lu # LATIN CAPITAL LETTER U WITH OGONEK +0174 ; Lu # LATIN CAPITAL LETTER W WITH CIRCUMFLEX +0176 ; Lu # LATIN CAPITAL LETTER Y WITH CIRCUMFLEX +0178..0179 ; Lu # [2] LATIN CAPITAL LETTER Y WITH DIAERESIS..LATIN CAPITAL LETTER Z WITH ACUTE +017B ; Lu # LATIN CAPITAL LETTER Z WITH DOT ABOVE +017D ; Lu # LATIN CAPITAL LETTER Z WITH CARON +0181..0182 ; Lu # [2] LATIN CAPITAL LETTER B WITH HOOK..LATIN CAPITAL LETTER B WITH TOPBAR +0184 ; Lu # LATIN CAPITAL LETTER TONE SIX +0186..0187 ; Lu # [2] LATIN CAPITAL LETTER OPEN O..LATIN CAPITAL LETTER C WITH HOOK +0189..018B ; Lu # [3] LATIN CAPITAL LETTER AFRICAN D..LATIN CAPITAL LETTER D WITH TOPBAR +018E..0191 ; Lu # [4] LATIN CAPITAL LETTER REVERSED E..LATIN CAPITAL LETTER F WITH HOOK +0193..0194 ; Lu # [2] LATIN CAPITAL LETTER G WITH HOOK..LATIN CAPITAL LETTER GAMMA +0196..0198 ; Lu # [3] LATIN CAPITAL LETTER IOTA..LATIN CAPITAL LETTER K WITH HOOK 
+019C..019D ; Lu # [2] LATIN CAPITAL LETTER TURNED M..LATIN CAPITAL LETTER N WITH LEFT HOOK +019F..01A0 ; Lu # [2] LATIN CAPITAL LETTER O WITH MIDDLE TILDE..LATIN CAPITAL LETTER O WITH HORN +01A2 ; Lu # LATIN CAPITAL LETTER OI +01A4 ; Lu # LATIN CAPITAL LETTER P WITH HOOK +01A6..01A7 ; Lu # [2] LATIN LETTER YR..LATIN CAPITAL LETTER TONE TWO +01A9 ; Lu # LATIN CAPITAL LETTER ESH +01AC ; Lu # LATIN CAPITAL LETTER T WITH HOOK +01AE..01AF ; Lu # [2] LATIN CAPITAL LETTER T WITH RETROFLEX HOOK..LATIN CAPITAL LETTER U WITH HORN +01B1..01B3 ; Lu # [3] LATIN CAPITAL LETTER UPSILON..LATIN CAPITAL LETTER Y WITH HOOK +01B5 ; Lu # LATIN CAPITAL LETTER Z WITH STROKE +01B7..01B8 ; Lu # [2] LATIN CAPITAL LETTER EZH..LATIN CAPITAL LETTER EZH REVERSED +01BC ; Lu # LATIN CAPITAL LETTER TONE FIVE +01C4 ; Lu # LATIN CAPITAL LETTER DZ WITH CARON +01C7 ; Lu # LATIN CAPITAL LETTER LJ +01CA ; Lu # LATIN CAPITAL LETTER NJ +01CD ; Lu # LATIN CAPITAL LETTER A WITH CARON +01CF ; Lu # LATIN CAPITAL LETTER I WITH CARON +01D1 ; Lu # LATIN CAPITAL LETTER O WITH CARON +01D3 ; Lu # LATIN CAPITAL LETTER U WITH CARON +01D5 ; Lu # LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON +01D7 ; Lu # LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE +01D9 ; Lu # LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON +01DB ; Lu # LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE +01DE ; Lu # LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON +01E0 ; Lu # LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON +01E2 ; Lu # LATIN CAPITAL LETTER AE WITH MACRON +01E4 ; Lu # LATIN CAPITAL LETTER G WITH STROKE +01E6 ; Lu # LATIN CAPITAL LETTER G WITH CARON +01E8 ; Lu # LATIN CAPITAL LETTER K WITH CARON +01EA ; Lu # LATIN CAPITAL LETTER O WITH OGONEK +01EC ; Lu # LATIN CAPITAL LETTER O WITH OGONEK AND MACRON +01EE ; Lu # LATIN CAPITAL LETTER EZH WITH CARON +01F1 ; Lu # LATIN CAPITAL LETTER DZ +01F4 ; Lu # LATIN CAPITAL LETTER G WITH ACUTE +01F6..01F8 ; Lu # [3] LATIN CAPITAL LETTER HWAIR..LATIN CAPITAL LETTER N WITH GRAVE +01FA ; Lu # 
LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE +01FC ; Lu # LATIN CAPITAL LETTER AE WITH ACUTE +01FE ; Lu # LATIN CAPITAL LETTER O WITH STROKE AND ACUTE +0200 ; Lu # LATIN CAPITAL LETTER A WITH DOUBLE GRAVE +0202 ; Lu # LATIN CAPITAL LETTER A WITH INVERTED BREVE +0204 ; Lu # LATIN CAPITAL LETTER E WITH DOUBLE GRAVE +0206 ; Lu # LATIN CAPITAL LETTER E WITH INVERTED BREVE +0208 ; Lu # LATIN CAPITAL LETTER I WITH DOUBLE GRAVE +020A ; Lu # LATIN CAPITAL LETTER I WITH INVERTED BREVE +020C ; Lu # LATIN CAPITAL LETTER O WITH DOUBLE GRAVE +020E ; Lu # LATIN CAPITAL LETTER O WITH INVERTED BREVE +0210 ; Lu # LATIN CAPITAL LETTER R WITH DOUBLE GRAVE +0212 ; Lu # LATIN CAPITAL LETTER R WITH INVERTED BREVE +0214 ; Lu # LATIN CAPITAL LETTER U WITH DOUBLE GRAVE +0216 ; Lu # LATIN CAPITAL LETTER U WITH INVERTED BREVE +0218 ; Lu # LATIN CAPITAL LETTER S WITH COMMA BELOW +021A ; Lu # LATIN CAPITAL LETTER T WITH COMMA BELOW +021C ; Lu # LATIN CAPITAL LETTER YOGH +021E ; Lu # LATIN CAPITAL LETTER H WITH CARON +0220 ; Lu # LATIN CAPITAL LETTER N WITH LONG RIGHT LEG +0222 ; Lu # LATIN CAPITAL LETTER OU +0224 ; Lu # LATIN CAPITAL LETTER Z WITH HOOK +0226 ; Lu # LATIN CAPITAL LETTER A WITH DOT ABOVE +0228 ; Lu # LATIN CAPITAL LETTER E WITH CEDILLA +022A ; Lu # LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON +022C ; Lu # LATIN CAPITAL LETTER O WITH TILDE AND MACRON +022E ; Lu # LATIN CAPITAL LETTER O WITH DOT ABOVE +0230 ; Lu # LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON +0232 ; Lu # LATIN CAPITAL LETTER Y WITH MACRON +023A..023B ; Lu # [2] LATIN CAPITAL LETTER A WITH STROKE..LATIN CAPITAL LETTER C WITH STROKE +023D..023E ; Lu # [2] LATIN CAPITAL LETTER L WITH BAR..LATIN CAPITAL LETTER T WITH DIAGONAL STROKE +0241 ; Lu # LATIN CAPITAL LETTER GLOTTAL STOP +0243..0246 ; Lu # [4] LATIN CAPITAL LETTER B WITH STROKE..LATIN CAPITAL LETTER E WITH STROKE +0248 ; Lu # LATIN CAPITAL LETTER J WITH STROKE +024A ; Lu # LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL +024C ; Lu # LATIN CAPITAL 
LETTER R WITH STROKE +024E ; Lu # LATIN CAPITAL LETTER Y WITH STROKE +0370 ; Lu # GREEK CAPITAL LETTER HETA +0372 ; Lu # GREEK CAPITAL LETTER ARCHAIC SAMPI +0376 ; Lu # GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA +037F ; Lu # GREEK CAPITAL LETTER YOT +0386 ; Lu # GREEK CAPITAL LETTER ALPHA WITH TONOS +0388..038A ; Lu # [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS +038C ; Lu # GREEK CAPITAL LETTER OMICRON WITH TONOS +038E..038F ; Lu # [2] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK CAPITAL LETTER OMEGA WITH TONOS +0391..03A1 ; Lu # [17] GREEK CAPITAL LETTER ALPHA..GREEK CAPITAL LETTER RHO +03A3..03AB ; Lu # [9] GREEK CAPITAL LETTER SIGMA..GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA +03CF ; Lu # GREEK CAPITAL KAI SYMBOL +03D2..03D4 ; Lu # [3] GREEK UPSILON WITH HOOK SYMBOL..GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL +03D8 ; Lu # GREEK LETTER ARCHAIC KOPPA +03DA ; Lu # GREEK LETTER STIGMA +03DC ; Lu # GREEK LETTER DIGAMMA +03DE ; Lu # GREEK LETTER KOPPA +03E0 ; Lu # GREEK LETTER SAMPI +03E2 ; Lu # COPTIC CAPITAL LETTER SHEI +03E4 ; Lu # COPTIC CAPITAL LETTER FEI +03E6 ; Lu # COPTIC CAPITAL LETTER KHEI +03E8 ; Lu # COPTIC CAPITAL LETTER HORI +03EA ; Lu # COPTIC CAPITAL LETTER GANGIA +03EC ; Lu # COPTIC CAPITAL LETTER SHIMA +03EE ; Lu # COPTIC CAPITAL LETTER DEI +03F4 ; Lu # GREEK CAPITAL THETA SYMBOL +03F7 ; Lu # GREEK CAPITAL LETTER SHO +03F9..03FA ; Lu # [2] GREEK CAPITAL LUNATE SIGMA SYMBOL..GREEK CAPITAL LETTER SAN +03FD..042F ; Lu # [51] GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL..CYRILLIC CAPITAL LETTER YA +0460 ; Lu # CYRILLIC CAPITAL LETTER OMEGA +0462 ; Lu # CYRILLIC CAPITAL LETTER YAT +0464 ; Lu # CYRILLIC CAPITAL LETTER IOTIFIED E +0466 ; Lu # CYRILLIC CAPITAL LETTER LITTLE YUS +0468 ; Lu # CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS +046A ; Lu # CYRILLIC CAPITAL LETTER BIG YUS +046C ; Lu # CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS +046E ; Lu # CYRILLIC CAPITAL LETTER KSI +0470 ; Lu # CYRILLIC CAPITAL LETTER PSI +0472 ; 
Lu # CYRILLIC CAPITAL LETTER FITA +0474 ; Lu # CYRILLIC CAPITAL LETTER IZHITSA +0476 ; Lu # CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT +0478 ; Lu # CYRILLIC CAPITAL LETTER UK +047A ; Lu # CYRILLIC CAPITAL LETTER ROUND OMEGA +047C ; Lu # CYRILLIC CAPITAL LETTER OMEGA WITH TITLO +047E ; Lu # CYRILLIC CAPITAL LETTER OT +0480 ; Lu # CYRILLIC CAPITAL LETTER KOPPA +048A ; Lu # CYRILLIC CAPITAL LETTER SHORT I WITH TAIL +048C ; Lu # CYRILLIC CAPITAL LETTER SEMISOFT SIGN +048E ; Lu # CYRILLIC CAPITAL LETTER ER WITH TICK +0490 ; Lu # CYRILLIC CAPITAL LETTER GHE WITH UPTURN +0492 ; Lu # CYRILLIC CAPITAL LETTER GHE WITH STROKE +0494 ; Lu # CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK +0496 ; Lu # CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER +0498 ; Lu # CYRILLIC CAPITAL LETTER ZE WITH DESCENDER +049A ; Lu # CYRILLIC CAPITAL LETTER KA WITH DESCENDER +049C ; Lu # CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE +049E ; Lu # CYRILLIC CAPITAL LETTER KA WITH STROKE +04A0 ; Lu # CYRILLIC CAPITAL LETTER BASHKIR KA +04A2 ; Lu # CYRILLIC CAPITAL LETTER EN WITH DESCENDER +04A4 ; Lu # CYRILLIC CAPITAL LIGATURE EN GHE +04A6 ; Lu # CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK +04A8 ; Lu # CYRILLIC CAPITAL LETTER ABKHASIAN HA +04AA ; Lu # CYRILLIC CAPITAL LETTER ES WITH DESCENDER +04AC ; Lu # CYRILLIC CAPITAL LETTER TE WITH DESCENDER +04AE ; Lu # CYRILLIC CAPITAL LETTER STRAIGHT U +04B0 ; Lu # CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE +04B2 ; Lu # CYRILLIC CAPITAL LETTER HA WITH DESCENDER +04B4 ; Lu # CYRILLIC CAPITAL LIGATURE TE TSE +04B6 ; Lu # CYRILLIC CAPITAL LETTER CHE WITH DESCENDER +04B8 ; Lu # CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE +04BA ; Lu # CYRILLIC CAPITAL LETTER SHHA +04BC ; Lu # CYRILLIC CAPITAL LETTER ABKHASIAN CHE +04BE ; Lu # CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER +04C0..04C1 ; Lu # [2] CYRILLIC LETTER PALOCHKA..CYRILLIC CAPITAL LETTER ZHE WITH BREVE +04C3 ; Lu # CYRILLIC CAPITAL LETTER KA WITH HOOK +04C5 ; Lu # CYRILLIC CAPITAL 
LETTER EL WITH TAIL +04C7 ; Lu # CYRILLIC CAPITAL LETTER EN WITH HOOK +04C9 ; Lu # CYRILLIC CAPITAL LETTER EN WITH TAIL +04CB ; Lu # CYRILLIC CAPITAL LETTER KHAKASSIAN CHE +04CD ; Lu # CYRILLIC CAPITAL LETTER EM WITH TAIL +04D0 ; Lu # CYRILLIC CAPITAL LETTER A WITH BREVE +04D2 ; Lu # CYRILLIC CAPITAL LETTER A WITH DIAERESIS +04D4 ; Lu # CYRILLIC CAPITAL LIGATURE A IE +04D6 ; Lu # CYRILLIC CAPITAL LETTER IE WITH BREVE +04D8 ; Lu # CYRILLIC CAPITAL LETTER SCHWA +04DA ; Lu # CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS +04DC ; Lu # CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS +04DE ; Lu # CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS +04E0 ; Lu # CYRILLIC CAPITAL LETTER ABKHASIAN DZE +04E2 ; Lu # CYRILLIC CAPITAL LETTER I WITH MACRON +04E4 ; Lu # CYRILLIC CAPITAL LETTER I WITH DIAERESIS +04E6 ; Lu # CYRILLIC CAPITAL LETTER O WITH DIAERESIS +04E8 ; Lu # CYRILLIC CAPITAL LETTER BARRED O +04EA ; Lu # CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS +04EC ; Lu # CYRILLIC CAPITAL LETTER E WITH DIAERESIS +04EE ; Lu # CYRILLIC CAPITAL LETTER U WITH MACRON +04F0 ; Lu # CYRILLIC CAPITAL LETTER U WITH DIAERESIS +04F2 ; Lu # CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE +04F4 ; Lu # CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS +04F6 ; Lu # CYRILLIC CAPITAL LETTER GHE WITH DESCENDER +04F8 ; Lu # CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS +04FA ; Lu # CYRILLIC CAPITAL LETTER GHE WITH STROKE AND HOOK +04FC ; Lu # CYRILLIC CAPITAL LETTER HA WITH HOOK +04FE ; Lu # CYRILLIC CAPITAL LETTER HA WITH STROKE +0500 ; Lu # CYRILLIC CAPITAL LETTER KOMI DE +0502 ; Lu # CYRILLIC CAPITAL LETTER KOMI DJE +0504 ; Lu # CYRILLIC CAPITAL LETTER KOMI ZJE +0506 ; Lu # CYRILLIC CAPITAL LETTER KOMI DZJE +0508 ; Lu # CYRILLIC CAPITAL LETTER KOMI LJE +050A ; Lu # CYRILLIC CAPITAL LETTER KOMI NJE +050C ; Lu # CYRILLIC CAPITAL LETTER KOMI SJE +050E ; Lu # CYRILLIC CAPITAL LETTER KOMI TJE +0510 ; Lu # CYRILLIC CAPITAL LETTER REVERSED ZE +0512 ; Lu # CYRILLIC CAPITAL LETTER EL WITH HOOK +0514 ; Lu # CYRILLIC CAPITAL 
LETTER LHA +0516 ; Lu # CYRILLIC CAPITAL LETTER RHA +0518 ; Lu # CYRILLIC CAPITAL LETTER YAE +051A ; Lu # CYRILLIC CAPITAL LETTER QA +051C ; Lu # CYRILLIC CAPITAL LETTER WE +051E ; Lu # CYRILLIC CAPITAL LETTER ALEUT KA +0520 ; Lu # CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK +0522 ; Lu # CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK +0524 ; Lu # CYRILLIC CAPITAL LETTER PE WITH DESCENDER +0526 ; Lu # CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER +0528 ; Lu # CYRILLIC CAPITAL LETTER EN WITH LEFT HOOK +052A ; Lu # CYRILLIC CAPITAL LETTER DZZHE +052C ; Lu # CYRILLIC CAPITAL LETTER DCHE +052E ; Lu # CYRILLIC CAPITAL LETTER EL WITH DESCENDER +0531..0556 ; Lu # [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH +10A0..10C5 ; Lu # [38] GEORGIAN CAPITAL LETTER AN..GEORGIAN CAPITAL LETTER HOE +10C7 ; Lu # GEORGIAN CAPITAL LETTER YN +10CD ; Lu # GEORGIAN CAPITAL LETTER AEN +13A0..13F5 ; Lu # [86] CHEROKEE LETTER A..CHEROKEE LETTER MV +1C90..1CBA ; Lu # [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN +1CBD..1CBF ; Lu # [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN +1E00 ; Lu # LATIN CAPITAL LETTER A WITH RING BELOW +1E02 ; Lu # LATIN CAPITAL LETTER B WITH DOT ABOVE +1E04 ; Lu # LATIN CAPITAL LETTER B WITH DOT BELOW +1E06 ; Lu # LATIN CAPITAL LETTER B WITH LINE BELOW +1E08 ; Lu # LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE +1E0A ; Lu # LATIN CAPITAL LETTER D WITH DOT ABOVE +1E0C ; Lu # LATIN CAPITAL LETTER D WITH DOT BELOW +1E0E ; Lu # LATIN CAPITAL LETTER D WITH LINE BELOW +1E10 ; Lu # LATIN CAPITAL LETTER D WITH CEDILLA +1E12 ; Lu # LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW +1E14 ; Lu # LATIN CAPITAL LETTER E WITH MACRON AND GRAVE +1E16 ; Lu # LATIN CAPITAL LETTER E WITH MACRON AND ACUTE +1E18 ; Lu # LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW +1E1A ; Lu # LATIN CAPITAL LETTER E WITH TILDE BELOW +1E1C ; Lu # LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE +1E1E ; Lu # LATIN CAPITAL LETTER F 
WITH DOT ABOVE +1E20 ; Lu # LATIN CAPITAL LETTER G WITH MACRON +1E22 ; Lu # LATIN CAPITAL LETTER H WITH DOT ABOVE +1E24 ; Lu # LATIN CAPITAL LETTER H WITH DOT BELOW +1E26 ; Lu # LATIN CAPITAL LETTER H WITH DIAERESIS +1E28 ; Lu # LATIN CAPITAL LETTER H WITH CEDILLA +1E2A ; Lu # LATIN CAPITAL LETTER H WITH BREVE BELOW +1E2C ; Lu # LATIN CAPITAL LETTER I WITH TILDE BELOW +1E2E ; Lu # LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE +1E30 ; Lu # LATIN CAPITAL LETTER K WITH ACUTE +1E32 ; Lu # LATIN CAPITAL LETTER K WITH DOT BELOW +1E34 ; Lu # LATIN CAPITAL LETTER K WITH LINE BELOW +1E36 ; Lu # LATIN CAPITAL LETTER L WITH DOT BELOW +1E38 ; Lu # LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON +1E3A ; Lu # LATIN CAPITAL LETTER L WITH LINE BELOW +1E3C ; Lu # LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW +1E3E ; Lu # LATIN CAPITAL LETTER M WITH ACUTE +1E40 ; Lu # LATIN CAPITAL LETTER M WITH DOT ABOVE +1E42 ; Lu # LATIN CAPITAL LETTER M WITH DOT BELOW +1E44 ; Lu # LATIN CAPITAL LETTER N WITH DOT ABOVE +1E46 ; Lu # LATIN CAPITAL LETTER N WITH DOT BELOW +1E48 ; Lu # LATIN CAPITAL LETTER N WITH LINE BELOW +1E4A ; Lu # LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW +1E4C ; Lu # LATIN CAPITAL LETTER O WITH TILDE AND ACUTE +1E4E ; Lu # LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS +1E50 ; Lu # LATIN CAPITAL LETTER O WITH MACRON AND GRAVE +1E52 ; Lu # LATIN CAPITAL LETTER O WITH MACRON AND ACUTE +1E54 ; Lu # LATIN CAPITAL LETTER P WITH ACUTE +1E56 ; Lu # LATIN CAPITAL LETTER P WITH DOT ABOVE +1E58 ; Lu # LATIN CAPITAL LETTER R WITH DOT ABOVE +1E5A ; Lu # LATIN CAPITAL LETTER R WITH DOT BELOW +1E5C ; Lu # LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON +1E5E ; Lu # LATIN CAPITAL LETTER R WITH LINE BELOW +1E60 ; Lu # LATIN CAPITAL LETTER S WITH DOT ABOVE +1E62 ; Lu # LATIN CAPITAL LETTER S WITH DOT BELOW +1E64 ; Lu # LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE +1E66 ; Lu # LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE +1E68 ; Lu # LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT 
ABOVE +1E6A ; Lu # LATIN CAPITAL LETTER T WITH DOT ABOVE +1E6C ; Lu # LATIN CAPITAL LETTER T WITH DOT BELOW +1E6E ; Lu # LATIN CAPITAL LETTER T WITH LINE BELOW +1E70 ; Lu # LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW +1E72 ; Lu # LATIN CAPITAL LETTER U WITH DIAERESIS BELOW +1E74 ; Lu # LATIN CAPITAL LETTER U WITH TILDE BELOW +1E76 ; Lu # LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW +1E78 ; Lu # LATIN CAPITAL LETTER U WITH TILDE AND ACUTE +1E7A ; Lu # LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS +1E7C ; Lu # LATIN CAPITAL LETTER V WITH TILDE +1E7E ; Lu # LATIN CAPITAL LETTER V WITH DOT BELOW +1E80 ; Lu # LATIN CAPITAL LETTER W WITH GRAVE +1E82 ; Lu # LATIN CAPITAL LETTER W WITH ACUTE +1E84 ; Lu # LATIN CAPITAL LETTER W WITH DIAERESIS +1E86 ; Lu # LATIN CAPITAL LETTER W WITH DOT ABOVE +1E88 ; Lu # LATIN CAPITAL LETTER W WITH DOT BELOW +1E8A ; Lu # LATIN CAPITAL LETTER X WITH DOT ABOVE +1E8C ; Lu # LATIN CAPITAL LETTER X WITH DIAERESIS +1E8E ; Lu # LATIN CAPITAL LETTER Y WITH DOT ABOVE +1E90 ; Lu # LATIN CAPITAL LETTER Z WITH CIRCUMFLEX +1E92 ; Lu # LATIN CAPITAL LETTER Z WITH DOT BELOW +1E94 ; Lu # LATIN CAPITAL LETTER Z WITH LINE BELOW +1E9E ; Lu # LATIN CAPITAL LETTER SHARP S +1EA0 ; Lu # LATIN CAPITAL LETTER A WITH DOT BELOW +1EA2 ; Lu # LATIN CAPITAL LETTER A WITH HOOK ABOVE +1EA4 ; Lu # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE +1EA6 ; Lu # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE +1EA8 ; Lu # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE +1EAA ; Lu # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE +1EAC ; Lu # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW +1EAE ; Lu # LATIN CAPITAL LETTER A WITH BREVE AND ACUTE +1EB0 ; Lu # LATIN CAPITAL LETTER A WITH BREVE AND GRAVE +1EB2 ; Lu # LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE +1EB4 ; Lu # LATIN CAPITAL LETTER A WITH BREVE AND TILDE +1EB6 ; Lu # LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW +1EB8 ; Lu # LATIN CAPITAL LETTER E WITH DOT BELOW +1EBA ; Lu # LATIN CAPITAL 
LETTER E WITH HOOK ABOVE +1EBC ; Lu # LATIN CAPITAL LETTER E WITH TILDE +1EBE ; Lu # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE +1EC0 ; Lu # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE +1EC2 ; Lu # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE +1EC4 ; Lu # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE +1EC6 ; Lu # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW +1EC8 ; Lu # LATIN CAPITAL LETTER I WITH HOOK ABOVE +1ECA ; Lu # LATIN CAPITAL LETTER I WITH DOT BELOW +1ECC ; Lu # LATIN CAPITAL LETTER O WITH DOT BELOW +1ECE ; Lu # LATIN CAPITAL LETTER O WITH HOOK ABOVE +1ED0 ; Lu # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE +1ED2 ; Lu # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE +1ED4 ; Lu # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE +1ED6 ; Lu # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE +1ED8 ; Lu # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW +1EDA ; Lu # LATIN CAPITAL LETTER O WITH HORN AND ACUTE +1EDC ; Lu # LATIN CAPITAL LETTER O WITH HORN AND GRAVE +1EDE ; Lu # LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE +1EE0 ; Lu # LATIN CAPITAL LETTER O WITH HORN AND TILDE +1EE2 ; Lu # LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW +1EE4 ; Lu # LATIN CAPITAL LETTER U WITH DOT BELOW +1EE6 ; Lu # LATIN CAPITAL LETTER U WITH HOOK ABOVE +1EE8 ; Lu # LATIN CAPITAL LETTER U WITH HORN AND ACUTE +1EEA ; Lu # LATIN CAPITAL LETTER U WITH HORN AND GRAVE +1EEC ; Lu # LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE +1EEE ; Lu # LATIN CAPITAL LETTER U WITH HORN AND TILDE +1EF0 ; Lu # LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW +1EF2 ; Lu # LATIN CAPITAL LETTER Y WITH GRAVE +1EF4 ; Lu # LATIN CAPITAL LETTER Y WITH DOT BELOW +1EF6 ; Lu # LATIN CAPITAL LETTER Y WITH HOOK ABOVE +1EF8 ; Lu # LATIN CAPITAL LETTER Y WITH TILDE +1EFA ; Lu # LATIN CAPITAL LETTER MIDDLE-WELSH LL +1EFC ; Lu # LATIN CAPITAL LETTER MIDDLE-WELSH V +1EFE ; Lu # LATIN CAPITAL LETTER Y WITH LOOP +1F08..1F0F ; Lu # [8] GREEK CAPITAL LETTER ALPHA WITH 
PSILI..GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI +1F18..1F1D ; Lu # [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA +1F28..1F2F ; Lu # [8] GREEK CAPITAL LETTER ETA WITH PSILI..GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI +1F38..1F3F ; Lu # [8] GREEK CAPITAL LETTER IOTA WITH PSILI..GREEK CAPITAL LETTER IOTA WITH DASIA AND PERISPOMENI +1F48..1F4D ; Lu # [6] GREEK CAPITAL LETTER OMICRON WITH PSILI..GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA +1F59 ; Lu # GREEK CAPITAL LETTER UPSILON WITH DASIA +1F5B ; Lu # GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA +1F5D ; Lu # GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA +1F5F ; Lu # GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI +1F68..1F6F ; Lu # [8] GREEK CAPITAL LETTER OMEGA WITH PSILI..GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI +1FB8..1FBB ; Lu # [4] GREEK CAPITAL LETTER ALPHA WITH VRACHY..GREEK CAPITAL LETTER ALPHA WITH OXIA +1FC8..1FCB ; Lu # [4] GREEK CAPITAL LETTER EPSILON WITH VARIA..GREEK CAPITAL LETTER ETA WITH OXIA +1FD8..1FDB ; Lu # [4] GREEK CAPITAL LETTER IOTA WITH VRACHY..GREEK CAPITAL LETTER IOTA WITH OXIA +1FE8..1FEC ; Lu # [5] GREEK CAPITAL LETTER UPSILON WITH VRACHY..GREEK CAPITAL LETTER RHO WITH DASIA +1FF8..1FFB ; Lu # [4] GREEK CAPITAL LETTER OMICRON WITH VARIA..GREEK CAPITAL LETTER OMEGA WITH OXIA +2102 ; Lu # DOUBLE-STRUCK CAPITAL C +2107 ; Lu # EULER CONSTANT +210B..210D ; Lu # [3] SCRIPT CAPITAL H..DOUBLE-STRUCK CAPITAL H +2110..2112 ; Lu # [3] SCRIPT CAPITAL I..SCRIPT CAPITAL L +2115 ; Lu # DOUBLE-STRUCK CAPITAL N +2119..211D ; Lu # [5] DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R +2124 ; Lu # DOUBLE-STRUCK CAPITAL Z +2126 ; Lu # OHM SIGN +2128 ; Lu # BLACK-LETTER CAPITAL Z +212A..212D ; Lu # [4] KELVIN SIGN..BLACK-LETTER CAPITAL C +2130..2133 ; Lu # [4] SCRIPT CAPITAL E..SCRIPT CAPITAL M +213E..213F ; Lu # [2] DOUBLE-STRUCK CAPITAL GAMMA..DOUBLE-STRUCK CAPITAL PI +2145 ; Lu # DOUBLE-STRUCK ITALIC 
CAPITAL D +2183 ; Lu # ROMAN NUMERAL REVERSED ONE HUNDRED +2C00..2C2E ; Lu # [47] GLAGOLITIC CAPITAL LETTER AZU..GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE +2C60 ; Lu # LATIN CAPITAL LETTER L WITH DOUBLE BAR +2C62..2C64 ; Lu # [3] LATIN CAPITAL LETTER L WITH MIDDLE TILDE..LATIN CAPITAL LETTER R WITH TAIL +2C67 ; Lu # LATIN CAPITAL LETTER H WITH DESCENDER +2C69 ; Lu # LATIN CAPITAL LETTER K WITH DESCENDER +2C6B ; Lu # LATIN CAPITAL LETTER Z WITH DESCENDER +2C6D..2C70 ; Lu # [4] LATIN CAPITAL LETTER ALPHA..LATIN CAPITAL LETTER TURNED ALPHA +2C72 ; Lu # LATIN CAPITAL LETTER W WITH HOOK +2C75 ; Lu # LATIN CAPITAL LETTER HALF H +2C7E..2C80 ; Lu # [3] LATIN CAPITAL LETTER S WITH SWASH TAIL..COPTIC CAPITAL LETTER ALFA +2C82 ; Lu # COPTIC CAPITAL LETTER VIDA +2C84 ; Lu # COPTIC CAPITAL LETTER GAMMA +2C86 ; Lu # COPTIC CAPITAL LETTER DALDA +2C88 ; Lu # COPTIC CAPITAL LETTER EIE +2C8A ; Lu # COPTIC CAPITAL LETTER SOU +2C8C ; Lu # COPTIC CAPITAL LETTER ZATA +2C8E ; Lu # COPTIC CAPITAL LETTER HATE +2C90 ; Lu # COPTIC CAPITAL LETTER THETHE +2C92 ; Lu # COPTIC CAPITAL LETTER IAUDA +2C94 ; Lu # COPTIC CAPITAL LETTER KAPA +2C96 ; Lu # COPTIC CAPITAL LETTER LAULA +2C98 ; Lu # COPTIC CAPITAL LETTER MI +2C9A ; Lu # COPTIC CAPITAL LETTER NI +2C9C ; Lu # COPTIC CAPITAL LETTER KSI +2C9E ; Lu # COPTIC CAPITAL LETTER O +2CA0 ; Lu # COPTIC CAPITAL LETTER PI +2CA2 ; Lu # COPTIC CAPITAL LETTER RO +2CA4 ; Lu # COPTIC CAPITAL LETTER SIMA +2CA6 ; Lu # COPTIC CAPITAL LETTER TAU +2CA8 ; Lu # COPTIC CAPITAL LETTER UA +2CAA ; Lu # COPTIC CAPITAL LETTER FI +2CAC ; Lu # COPTIC CAPITAL LETTER KHI +2CAE ; Lu # COPTIC CAPITAL LETTER PSI +2CB0 ; Lu # COPTIC CAPITAL LETTER OOU +2CB2 ; Lu # COPTIC CAPITAL LETTER DIALECT-P ALEF +2CB4 ; Lu # COPTIC CAPITAL LETTER OLD COPTIC AIN +2CB6 ; Lu # COPTIC CAPITAL LETTER CRYPTOGRAMMIC EIE +2CB8 ; Lu # COPTIC CAPITAL LETTER DIALECT-P KAPA +2CBA ; Lu # COPTIC CAPITAL LETTER DIALECT-P NI +2CBC ; Lu # COPTIC CAPITAL LETTER CRYPTOGRAMMIC NI +2CBE ; Lu # COPTIC CAPITAL 
LETTER OLD COPTIC OOU +2CC0 ; Lu # COPTIC CAPITAL LETTER SAMPI +2CC2 ; Lu # COPTIC CAPITAL LETTER CROSSED SHEI +2CC4 ; Lu # COPTIC CAPITAL LETTER OLD COPTIC SHEI +2CC6 ; Lu # COPTIC CAPITAL LETTER OLD COPTIC ESH +2CC8 ; Lu # COPTIC CAPITAL LETTER AKHMIMIC KHEI +2CCA ; Lu # COPTIC CAPITAL LETTER DIALECT-P HORI +2CCC ; Lu # COPTIC CAPITAL LETTER OLD COPTIC HORI +2CCE ; Lu # COPTIC CAPITAL LETTER OLD COPTIC HA +2CD0 ; Lu # COPTIC CAPITAL LETTER L-SHAPED HA +2CD2 ; Lu # COPTIC CAPITAL LETTER OLD COPTIC HEI +2CD4 ; Lu # COPTIC CAPITAL LETTER OLD COPTIC HAT +2CD6 ; Lu # COPTIC CAPITAL LETTER OLD COPTIC GANGIA +2CD8 ; Lu # COPTIC CAPITAL LETTER OLD COPTIC DJA +2CDA ; Lu # COPTIC CAPITAL LETTER OLD COPTIC SHIMA +2CDC ; Lu # COPTIC CAPITAL LETTER OLD NUBIAN SHIMA +2CDE ; Lu # COPTIC CAPITAL LETTER OLD NUBIAN NGI +2CE0 ; Lu # COPTIC CAPITAL LETTER OLD NUBIAN NYI +2CE2 ; Lu # COPTIC CAPITAL LETTER OLD NUBIAN WAU +2CEB ; Lu # COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI +2CED ; Lu # COPTIC CAPITAL LETTER CRYPTOGRAMMIC GANGIA +2CF2 ; Lu # COPTIC CAPITAL LETTER BOHAIRIC KHEI +A640 ; Lu # CYRILLIC CAPITAL LETTER ZEMLYA +A642 ; Lu # CYRILLIC CAPITAL LETTER DZELO +A644 ; Lu # CYRILLIC CAPITAL LETTER REVERSED DZE +A646 ; Lu # CYRILLIC CAPITAL LETTER IOTA +A648 ; Lu # CYRILLIC CAPITAL LETTER DJERV +A64A ; Lu # CYRILLIC CAPITAL LETTER MONOGRAPH UK +A64C ; Lu # CYRILLIC CAPITAL LETTER BROAD OMEGA +A64E ; Lu # CYRILLIC CAPITAL LETTER NEUTRAL YER +A650 ; Lu # CYRILLIC CAPITAL LETTER YERU WITH BACK YER +A652 ; Lu # CYRILLIC CAPITAL LETTER IOTIFIED YAT +A654 ; Lu # CYRILLIC CAPITAL LETTER REVERSED YU +A656 ; Lu # CYRILLIC CAPITAL LETTER IOTIFIED A +A658 ; Lu # CYRILLIC CAPITAL LETTER CLOSED LITTLE YUS +A65A ; Lu # CYRILLIC CAPITAL LETTER BLENDED YUS +A65C ; Lu # CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS +A65E ; Lu # CYRILLIC CAPITAL LETTER YN +A660 ; Lu # CYRILLIC CAPITAL LETTER REVERSED TSE +A662 ; Lu # CYRILLIC CAPITAL LETTER SOFT DE +A664 ; Lu # CYRILLIC CAPITAL LETTER SOFT EL 
+A666 ; Lu # CYRILLIC CAPITAL LETTER SOFT EM +A668 ; Lu # CYRILLIC CAPITAL LETTER MONOCULAR O +A66A ; Lu # CYRILLIC CAPITAL LETTER BINOCULAR O +A66C ; Lu # CYRILLIC CAPITAL LETTER DOUBLE MONOCULAR O +A680 ; Lu # CYRILLIC CAPITAL LETTER DWE +A682 ; Lu # CYRILLIC CAPITAL LETTER DZWE +A684 ; Lu # CYRILLIC CAPITAL LETTER ZHWE +A686 ; Lu # CYRILLIC CAPITAL LETTER CCHE +A688 ; Lu # CYRILLIC CAPITAL LETTER DZZE +A68A ; Lu # CYRILLIC CAPITAL LETTER TE WITH MIDDLE HOOK +A68C ; Lu # CYRILLIC CAPITAL LETTER TWE +A68E ; Lu # CYRILLIC CAPITAL LETTER TSWE +A690 ; Lu # CYRILLIC CAPITAL LETTER TSSE +A692 ; Lu # CYRILLIC CAPITAL LETTER TCHE +A694 ; Lu # CYRILLIC CAPITAL LETTER HWE +A696 ; Lu # CYRILLIC CAPITAL LETTER SHWE +A698 ; Lu # CYRILLIC CAPITAL LETTER DOUBLE O +A69A ; Lu # CYRILLIC CAPITAL LETTER CROSSED O +A722 ; Lu # LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF +A724 ; Lu # LATIN CAPITAL LETTER EGYPTOLOGICAL AIN +A726 ; Lu # LATIN CAPITAL LETTER HENG +A728 ; Lu # LATIN CAPITAL LETTER TZ +A72A ; Lu # LATIN CAPITAL LETTER TRESILLO +A72C ; Lu # LATIN CAPITAL LETTER CUATRILLO +A72E ; Lu # LATIN CAPITAL LETTER CUATRILLO WITH COMMA +A732 ; Lu # LATIN CAPITAL LETTER AA +A734 ; Lu # LATIN CAPITAL LETTER AO +A736 ; Lu # LATIN CAPITAL LETTER AU +A738 ; Lu # LATIN CAPITAL LETTER AV +A73A ; Lu # LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR +A73C ; Lu # LATIN CAPITAL LETTER AY +A73E ; Lu # LATIN CAPITAL LETTER REVERSED C WITH DOT +A740 ; Lu # LATIN CAPITAL LETTER K WITH STROKE +A742 ; Lu # LATIN CAPITAL LETTER K WITH DIAGONAL STROKE +A744 ; Lu # LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE +A746 ; Lu # LATIN CAPITAL LETTER BROKEN L +A748 ; Lu # LATIN CAPITAL LETTER L WITH HIGH STROKE +A74A ; Lu # LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY +A74C ; Lu # LATIN CAPITAL LETTER O WITH LOOP +A74E ; Lu # LATIN CAPITAL LETTER OO +A750 ; Lu # LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER +A752 ; Lu # LATIN CAPITAL LETTER P WITH FLOURISH +A754 ; Lu # LATIN CAPITAL LETTER P WITH 
SQUIRREL TAIL +A756 ; Lu # LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER +A758 ; Lu # LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE +A75A ; Lu # LATIN CAPITAL LETTER R ROTUNDA +A75C ; Lu # LATIN CAPITAL LETTER RUM ROTUNDA +A75E ; Lu # LATIN CAPITAL LETTER V WITH DIAGONAL STROKE +A760 ; Lu # LATIN CAPITAL LETTER VY +A762 ; Lu # LATIN CAPITAL LETTER VISIGOTHIC Z +A764 ; Lu # LATIN CAPITAL LETTER THORN WITH STROKE +A766 ; Lu # LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER +A768 ; Lu # LATIN CAPITAL LETTER VEND +A76A ; Lu # LATIN CAPITAL LETTER ET +A76C ; Lu # LATIN CAPITAL LETTER IS +A76E ; Lu # LATIN CAPITAL LETTER CON +A779 ; Lu # LATIN CAPITAL LETTER INSULAR D +A77B ; Lu # LATIN CAPITAL LETTER INSULAR F +A77D..A77E ; Lu # [2] LATIN CAPITAL LETTER INSULAR G..LATIN CAPITAL LETTER TURNED INSULAR G +A780 ; Lu # LATIN CAPITAL LETTER TURNED L +A782 ; Lu # LATIN CAPITAL LETTER INSULAR R +A784 ; Lu # LATIN CAPITAL LETTER INSULAR S +A786 ; Lu # LATIN CAPITAL LETTER INSULAR T +A78B ; Lu # LATIN CAPITAL LETTER SALTILLO +A78D ; Lu # LATIN CAPITAL LETTER TURNED H +A790 ; Lu # LATIN CAPITAL LETTER N WITH DESCENDER +A792 ; Lu # LATIN CAPITAL LETTER C WITH BAR +A796 ; Lu # LATIN CAPITAL LETTER B WITH FLOURISH +A798 ; Lu # LATIN CAPITAL LETTER F WITH STROKE +A79A ; Lu # LATIN CAPITAL LETTER VOLAPUK AE +A79C ; Lu # LATIN CAPITAL LETTER VOLAPUK OE +A79E ; Lu # LATIN CAPITAL LETTER VOLAPUK UE +A7A0 ; Lu # LATIN CAPITAL LETTER G WITH OBLIQUE STROKE +A7A2 ; Lu # LATIN CAPITAL LETTER K WITH OBLIQUE STROKE +A7A4 ; Lu # LATIN CAPITAL LETTER N WITH OBLIQUE STROKE +A7A6 ; Lu # LATIN CAPITAL LETTER R WITH OBLIQUE STROKE +A7A8 ; Lu # LATIN CAPITAL LETTER S WITH OBLIQUE STROKE +A7AA..A7AE ; Lu # [5] LATIN CAPITAL LETTER H WITH HOOK..LATIN CAPITAL LETTER SMALL CAPITAL I +A7B0..A7B4 ; Lu # [5] LATIN CAPITAL LETTER TURNED K..LATIN CAPITAL LETTER BETA +A7B6 ; Lu # LATIN CAPITAL LETTER OMEGA +A7B8 ; Lu # LATIN CAPITAL LETTER U WITH STROKE +A7BA ; Lu # LATIN CAPITAL LETTER GLOTTAL A 
+A7BC ; Lu # LATIN CAPITAL LETTER GLOTTAL I +A7BE ; Lu # LATIN CAPITAL LETTER GLOTTAL U +A7C2 ; Lu # LATIN CAPITAL LETTER ANGLICANA W +A7C4..A7C6 ; Lu # [3] LATIN CAPITAL LETTER C WITH PALATAL HOOK..LATIN CAPITAL LETTER Z WITH PALATAL HOOK +FF21..FF3A ; Lu # [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z +10400..10427 ; Lu # [40] DESERET CAPITAL LETTER LONG I..DESERET CAPITAL LETTER EW +104B0..104D3 ; Lu # [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA +10C80..10CB2 ; Lu # [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US +118A0..118BF ; Lu # [32] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI CAPITAL LETTER VIYO +16E40..16E5F ; Lu # [32] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN CAPITAL LETTER Y +1D400..1D419 ; Lu # [26] MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL BOLD CAPITAL Z +1D434..1D44D ; Lu # [26] MATHEMATICAL ITALIC CAPITAL A..MATHEMATICAL ITALIC CAPITAL Z +1D468..1D481 ; Lu # [26] MATHEMATICAL BOLD ITALIC CAPITAL A..MATHEMATICAL BOLD ITALIC CAPITAL Z +1D49C ; Lu # MATHEMATICAL SCRIPT CAPITAL A +1D49E..1D49F ; Lu # [2] MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL SCRIPT CAPITAL D +1D4A2 ; Lu # MATHEMATICAL SCRIPT CAPITAL G +1D4A5..1D4A6 ; Lu # [2] MATHEMATICAL SCRIPT CAPITAL J..MATHEMATICAL SCRIPT CAPITAL K +1D4A9..1D4AC ; Lu # [4] MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL SCRIPT CAPITAL Q +1D4AE..1D4B5 ; Lu # [8] MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL SCRIPT CAPITAL Z +1D4D0..1D4E9 ; Lu # [26] MATHEMATICAL BOLD SCRIPT CAPITAL A..MATHEMATICAL BOLD SCRIPT CAPITAL Z +1D504..1D505 ; Lu # [2] MATHEMATICAL FRAKTUR CAPITAL A..MATHEMATICAL FRAKTUR CAPITAL B +1D507..1D50A ; Lu # [4] MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL FRAKTUR CAPITAL G +1D50D..1D514 ; Lu # [8] MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL FRAKTUR CAPITAL Q +1D516..1D51C ; Lu # [7] MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL FRAKTUR CAPITAL Y +1D538..1D539 ; Lu # [2] MATHEMATICAL DOUBLE-STRUCK CAPITAL A..MATHEMATICAL DOUBLE-STRUCK 
CAPITAL B +1D53B..1D53E ; Lu # [4] MATHEMATICAL DOUBLE-STRUCK CAPITAL D..MATHEMATICAL DOUBLE-STRUCK CAPITAL G +1D540..1D544 ; Lu # [5] MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEMATICAL DOUBLE-STRUCK CAPITAL M +1D546 ; Lu # MATHEMATICAL DOUBLE-STRUCK CAPITAL O +1D54A..1D550 ; Lu # [7] MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEMATICAL DOUBLE-STRUCK CAPITAL Y +1D56C..1D585 ; Lu # [26] MATHEMATICAL BOLD FRAKTUR CAPITAL A..MATHEMATICAL BOLD FRAKTUR CAPITAL Z +1D5A0..1D5B9 ; Lu # [26] MATHEMATICAL SANS-SERIF CAPITAL A..MATHEMATICAL SANS-SERIF CAPITAL Z +1D5D4..1D5ED ; Lu # [26] MATHEMATICAL SANS-SERIF BOLD CAPITAL A..MATHEMATICAL SANS-SERIF BOLD CAPITAL Z +1D608..1D621 ; Lu # [26] MATHEMATICAL SANS-SERIF ITALIC CAPITAL A..MATHEMATICAL SANS-SERIF ITALIC CAPITAL Z +1D63C..1D655 ; Lu # [26] MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL A..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL Z +1D670..1D689 ; Lu # [26] MATHEMATICAL MONOSPACE CAPITAL A..MATHEMATICAL MONOSPACE CAPITAL Z +1D6A8..1D6C0 ; Lu # [25] MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICAL BOLD CAPITAL OMEGA +1D6E2..1D6FA ; Lu # [25] MATHEMATICAL ITALIC CAPITAL ALPHA..MATHEMATICAL ITALIC CAPITAL OMEGA +1D71C..1D734 ; Lu # [25] MATHEMATICAL BOLD ITALIC CAPITAL ALPHA..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA +1D756..1D76E ; Lu # [25] MATHEMATICAL SANS-SERIF BOLD CAPITAL ALPHA..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA +1D790..1D7A8 ; Lu # [25] MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA +1D7CA ; Lu # MATHEMATICAL BOLD CAPITAL DIGAMMA +1E900..1E921 ; Lu # [34] ADLAM CAPITAL LETTER ALIF..ADLAM CAPITAL LETTER SHA + +# Total code points: 1788 + +# ================================================ + +# General_Category=Lowercase_Letter + +0061..007A ; Ll # [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z +00B5 ; Ll # MICRO SIGN +00DF..00F6 ; Ll # [24] LATIN SMALL LETTER SHARP S..LATIN SMALL LETTER O WITH DIAERESIS +00F8..00FF ; Ll # [8] LATIN SMALL LETTER O WITH 
STROKE..LATIN SMALL LETTER Y WITH DIAERESIS +0101 ; Ll # LATIN SMALL LETTER A WITH MACRON +0103 ; Ll # LATIN SMALL LETTER A WITH BREVE +0105 ; Ll # LATIN SMALL LETTER A WITH OGONEK +0107 ; Ll # LATIN SMALL LETTER C WITH ACUTE +0109 ; Ll # LATIN SMALL LETTER C WITH CIRCUMFLEX +010B ; Ll # LATIN SMALL LETTER C WITH DOT ABOVE +010D ; Ll # LATIN SMALL LETTER C WITH CARON +010F ; Ll # LATIN SMALL LETTER D WITH CARON +0111 ; Ll # LATIN SMALL LETTER D WITH STROKE +0113 ; Ll # LATIN SMALL LETTER E WITH MACRON +0115 ; Ll # LATIN SMALL LETTER E WITH BREVE +0117 ; Ll # LATIN SMALL LETTER E WITH DOT ABOVE +0119 ; Ll # LATIN SMALL LETTER E WITH OGONEK +011B ; Ll # LATIN SMALL LETTER E WITH CARON +011D ; Ll # LATIN SMALL LETTER G WITH CIRCUMFLEX +011F ; Ll # LATIN SMALL LETTER G WITH BREVE +0121 ; Ll # LATIN SMALL LETTER G WITH DOT ABOVE +0123 ; Ll # LATIN SMALL LETTER G WITH CEDILLA +0125 ; Ll # LATIN SMALL LETTER H WITH CIRCUMFLEX +0127 ; Ll # LATIN SMALL LETTER H WITH STROKE +0129 ; Ll # LATIN SMALL LETTER I WITH TILDE +012B ; Ll # LATIN SMALL LETTER I WITH MACRON +012D ; Ll # LATIN SMALL LETTER I WITH BREVE +012F ; Ll # LATIN SMALL LETTER I WITH OGONEK +0131 ; Ll # LATIN SMALL LETTER DOTLESS I +0133 ; Ll # LATIN SMALL LIGATURE IJ +0135 ; Ll # LATIN SMALL LETTER J WITH CIRCUMFLEX +0137..0138 ; Ll # [2] LATIN SMALL LETTER K WITH CEDILLA..LATIN SMALL LETTER KRA +013A ; Ll # LATIN SMALL LETTER L WITH ACUTE +013C ; Ll # LATIN SMALL LETTER L WITH CEDILLA +013E ; Ll # LATIN SMALL LETTER L WITH CARON +0140 ; Ll # LATIN SMALL LETTER L WITH MIDDLE DOT +0142 ; Ll # LATIN SMALL LETTER L WITH STROKE +0144 ; Ll # LATIN SMALL LETTER N WITH ACUTE +0146 ; Ll # LATIN SMALL LETTER N WITH CEDILLA +0148..0149 ; Ll # [2] LATIN SMALL LETTER N WITH CARON..LATIN SMALL LETTER N PRECEDED BY APOSTROPHE +014B ; Ll # LATIN SMALL LETTER ENG +014D ; Ll # LATIN SMALL LETTER O WITH MACRON +014F ; Ll # LATIN SMALL LETTER O WITH BREVE +0151 ; Ll # LATIN SMALL LETTER O WITH DOUBLE ACUTE +0153 ; Ll # LATIN SMALL 
LIGATURE OE +0155 ; Ll # LATIN SMALL LETTER R WITH ACUTE +0157 ; Ll # LATIN SMALL LETTER R WITH CEDILLA +0159 ; Ll # LATIN SMALL LETTER R WITH CARON +015B ; Ll # LATIN SMALL LETTER S WITH ACUTE +015D ; Ll # LATIN SMALL LETTER S WITH CIRCUMFLEX +015F ; Ll # LATIN SMALL LETTER S WITH CEDILLA +0161 ; Ll # LATIN SMALL LETTER S WITH CARON +0163 ; Ll # LATIN SMALL LETTER T WITH CEDILLA +0165 ; Ll # LATIN SMALL LETTER T WITH CARON +0167 ; Ll # LATIN SMALL LETTER T WITH STROKE +0169 ; Ll # LATIN SMALL LETTER U WITH TILDE +016B ; Ll # LATIN SMALL LETTER U WITH MACRON +016D ; Ll # LATIN SMALL LETTER U WITH BREVE +016F ; Ll # LATIN SMALL LETTER U WITH RING ABOVE +0171 ; Ll # LATIN SMALL LETTER U WITH DOUBLE ACUTE +0173 ; Ll # LATIN SMALL LETTER U WITH OGONEK +0175 ; Ll # LATIN SMALL LETTER W WITH CIRCUMFLEX +0177 ; Ll # LATIN SMALL LETTER Y WITH CIRCUMFLEX +017A ; Ll # LATIN SMALL LETTER Z WITH ACUTE +017C ; Ll # LATIN SMALL LETTER Z WITH DOT ABOVE +017E..0180 ; Ll # [3] LATIN SMALL LETTER Z WITH CARON..LATIN SMALL LETTER B WITH STROKE +0183 ; Ll # LATIN SMALL LETTER B WITH TOPBAR +0185 ; Ll # LATIN SMALL LETTER TONE SIX +0188 ; Ll # LATIN SMALL LETTER C WITH HOOK +018C..018D ; Ll # [2] LATIN SMALL LETTER D WITH TOPBAR..LATIN SMALL LETTER TURNED DELTA +0192 ; Ll # LATIN SMALL LETTER F WITH HOOK +0195 ; Ll # LATIN SMALL LETTER HV +0199..019B ; Ll # [3] LATIN SMALL LETTER K WITH HOOK..LATIN SMALL LETTER LAMBDA WITH STROKE +019E ; Ll # LATIN SMALL LETTER N WITH LONG RIGHT LEG +01A1 ; Ll # LATIN SMALL LETTER O WITH HORN +01A3 ; Ll # LATIN SMALL LETTER OI +01A5 ; Ll # LATIN SMALL LETTER P WITH HOOK +01A8 ; Ll # LATIN SMALL LETTER TONE TWO +01AA..01AB ; Ll # [2] LATIN LETTER REVERSED ESH LOOP..LATIN SMALL LETTER T WITH PALATAL HOOK +01AD ; Ll # LATIN SMALL LETTER T WITH HOOK +01B0 ; Ll # LATIN SMALL LETTER U WITH HORN +01B4 ; Ll # LATIN SMALL LETTER Y WITH HOOK +01B6 ; Ll # LATIN SMALL LETTER Z WITH STROKE +01B9..01BA ; Ll # [2] LATIN SMALL LETTER EZH REVERSED..LATIN SMALL LETTER 
EZH WITH TAIL +01BD..01BF ; Ll # [3] LATIN SMALL LETTER TONE FIVE..LATIN LETTER WYNN +01C6 ; Ll # LATIN SMALL LETTER DZ WITH CARON +01C9 ; Ll # LATIN SMALL LETTER LJ +01CC ; Ll # LATIN SMALL LETTER NJ +01CE ; Ll # LATIN SMALL LETTER A WITH CARON +01D0 ; Ll # LATIN SMALL LETTER I WITH CARON +01D2 ; Ll # LATIN SMALL LETTER O WITH CARON +01D4 ; Ll # LATIN SMALL LETTER U WITH CARON +01D6 ; Ll # LATIN SMALL LETTER U WITH DIAERESIS AND MACRON +01D8 ; Ll # LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE +01DA ; Ll # LATIN SMALL LETTER U WITH DIAERESIS AND CARON +01DC..01DD ; Ll # [2] LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE..LATIN SMALL LETTER TURNED E +01DF ; Ll # LATIN SMALL LETTER A WITH DIAERESIS AND MACRON +01E1 ; Ll # LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON +01E3 ; Ll # LATIN SMALL LETTER AE WITH MACRON +01E5 ; Ll # LATIN SMALL LETTER G WITH STROKE +01E7 ; Ll # LATIN SMALL LETTER G WITH CARON +01E9 ; Ll # LATIN SMALL LETTER K WITH CARON +01EB ; Ll # LATIN SMALL LETTER O WITH OGONEK +01ED ; Ll # LATIN SMALL LETTER O WITH OGONEK AND MACRON +01EF..01F0 ; Ll # [2] LATIN SMALL LETTER EZH WITH CARON..LATIN SMALL LETTER J WITH CARON +01F3 ; Ll # LATIN SMALL LETTER DZ +01F5 ; Ll # LATIN SMALL LETTER G WITH ACUTE +01F9 ; Ll # LATIN SMALL LETTER N WITH GRAVE +01FB ; Ll # LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE +01FD ; Ll # LATIN SMALL LETTER AE WITH ACUTE +01FF ; Ll # LATIN SMALL LETTER O WITH STROKE AND ACUTE +0201 ; Ll # LATIN SMALL LETTER A WITH DOUBLE GRAVE +0203 ; Ll # LATIN SMALL LETTER A WITH INVERTED BREVE +0205 ; Ll # LATIN SMALL LETTER E WITH DOUBLE GRAVE +0207 ; Ll # LATIN SMALL LETTER E WITH INVERTED BREVE +0209 ; Ll # LATIN SMALL LETTER I WITH DOUBLE GRAVE +020B ; Ll # LATIN SMALL LETTER I WITH INVERTED BREVE +020D ; Ll # LATIN SMALL LETTER O WITH DOUBLE GRAVE +020F ; Ll # LATIN SMALL LETTER O WITH INVERTED BREVE +0211 ; Ll # LATIN SMALL LETTER R WITH DOUBLE GRAVE +0213 ; Ll # LATIN SMALL LETTER R WITH INVERTED BREVE +0215 ; Ll # LATIN SMALL 
LETTER U WITH DOUBLE GRAVE +0217 ; Ll # LATIN SMALL LETTER U WITH INVERTED BREVE +0219 ; Ll # LATIN SMALL LETTER S WITH COMMA BELOW +021B ; Ll # LATIN SMALL LETTER T WITH COMMA BELOW +021D ; Ll # LATIN SMALL LETTER YOGH +021F ; Ll # LATIN SMALL LETTER H WITH CARON +0221 ; Ll # LATIN SMALL LETTER D WITH CURL +0223 ; Ll # LATIN SMALL LETTER OU +0225 ; Ll # LATIN SMALL LETTER Z WITH HOOK +0227 ; Ll # LATIN SMALL LETTER A WITH DOT ABOVE +0229 ; Ll # LATIN SMALL LETTER E WITH CEDILLA +022B ; Ll # LATIN SMALL LETTER O WITH DIAERESIS AND MACRON +022D ; Ll # LATIN SMALL LETTER O WITH TILDE AND MACRON +022F ; Ll # LATIN SMALL LETTER O WITH DOT ABOVE +0231 ; Ll # LATIN SMALL LETTER O WITH DOT ABOVE AND MACRON +0233..0239 ; Ll # [7] LATIN SMALL LETTER Y WITH MACRON..LATIN SMALL LETTER QP DIGRAPH +023C ; Ll # LATIN SMALL LETTER C WITH STROKE +023F..0240 ; Ll # [2] LATIN SMALL LETTER S WITH SWASH TAIL..LATIN SMALL LETTER Z WITH SWASH TAIL +0242 ; Ll # LATIN SMALL LETTER GLOTTAL STOP +0247 ; Ll # LATIN SMALL LETTER E WITH STROKE +0249 ; Ll # LATIN SMALL LETTER J WITH STROKE +024B ; Ll # LATIN SMALL LETTER Q WITH HOOK TAIL +024D ; Ll # LATIN SMALL LETTER R WITH STROKE +024F..0293 ; Ll # [69] LATIN SMALL LETTER Y WITH STROKE..LATIN SMALL LETTER EZH WITH CURL +0295..02AF ; Ll # [27] LATIN LETTER PHARYNGEAL VOICED FRICATIVE..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL +0371 ; Ll # GREEK SMALL LETTER HETA +0373 ; Ll # GREEK SMALL LETTER ARCHAIC SAMPI +0377 ; Ll # GREEK SMALL LETTER PAMPHYLIAN DIGAMMA +037B..037D ; Ll # [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL +0390 ; Ll # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS +03AC..03CE ; Ll # [35] GREEK SMALL LETTER ALPHA WITH TONOS..GREEK SMALL LETTER OMEGA WITH TONOS +03D0..03D1 ; Ll # [2] GREEK BETA SYMBOL..GREEK THETA SYMBOL +03D5..03D7 ; Ll # [3] GREEK PHI SYMBOL..GREEK KAI SYMBOL +03D9 ; Ll # GREEK SMALL LETTER ARCHAIC KOPPA +03DB ; Ll # GREEK SMALL LETTER STIGMA +03DD ; 
Ll # GREEK SMALL LETTER DIGAMMA +03DF ; Ll # GREEK SMALL LETTER KOPPA +03E1 ; Ll # GREEK SMALL LETTER SAMPI +03E3 ; Ll # COPTIC SMALL LETTER SHEI +03E5 ; Ll # COPTIC SMALL LETTER FEI +03E7 ; Ll # COPTIC SMALL LETTER KHEI +03E9 ; Ll # COPTIC SMALL LETTER HORI +03EB ; Ll # COPTIC SMALL LETTER GANGIA +03ED ; Ll # COPTIC SMALL LETTER SHIMA +03EF..03F3 ; Ll # [5] COPTIC SMALL LETTER DEI..GREEK LETTER YOT +03F5 ; Ll # GREEK LUNATE EPSILON SYMBOL +03F8 ; Ll # GREEK SMALL LETTER SHO +03FB..03FC ; Ll # [2] GREEK SMALL LETTER SAN..GREEK RHO WITH STROKE SYMBOL +0430..045F ; Ll # [48] CYRILLIC SMALL LETTER A..CYRILLIC SMALL LETTER DZHE +0461 ; Ll # CYRILLIC SMALL LETTER OMEGA +0463 ; Ll # CYRILLIC SMALL LETTER YAT +0465 ; Ll # CYRILLIC SMALL LETTER IOTIFIED E +0467 ; Ll # CYRILLIC SMALL LETTER LITTLE YUS +0469 ; Ll # CYRILLIC SMALL LETTER IOTIFIED LITTLE YUS +046B ; Ll # CYRILLIC SMALL LETTER BIG YUS +046D ; Ll # CYRILLIC SMALL LETTER IOTIFIED BIG YUS +046F ; Ll # CYRILLIC SMALL LETTER KSI +0471 ; Ll # CYRILLIC SMALL LETTER PSI +0473 ; Ll # CYRILLIC SMALL LETTER FITA +0475 ; Ll # CYRILLIC SMALL LETTER IZHITSA +0477 ; Ll # CYRILLIC SMALL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT +0479 ; Ll # CYRILLIC SMALL LETTER UK +047B ; Ll # CYRILLIC SMALL LETTER ROUND OMEGA +047D ; Ll # CYRILLIC SMALL LETTER OMEGA WITH TITLO +047F ; Ll # CYRILLIC SMALL LETTER OT +0481 ; Ll # CYRILLIC SMALL LETTER KOPPA +048B ; Ll # CYRILLIC SMALL LETTER SHORT I WITH TAIL +048D ; Ll # CYRILLIC SMALL LETTER SEMISOFT SIGN +048F ; Ll # CYRILLIC SMALL LETTER ER WITH TICK +0491 ; Ll # CYRILLIC SMALL LETTER GHE WITH UPTURN +0493 ; Ll # CYRILLIC SMALL LETTER GHE WITH STROKE +0495 ; Ll # CYRILLIC SMALL LETTER GHE WITH MIDDLE HOOK +0497 ; Ll # CYRILLIC SMALL LETTER ZHE WITH DESCENDER +0499 ; Ll # CYRILLIC SMALL LETTER ZE WITH DESCENDER +049B ; Ll # CYRILLIC SMALL LETTER KA WITH DESCENDER +049D ; Ll # CYRILLIC SMALL LETTER KA WITH VERTICAL STROKE +049F ; Ll # CYRILLIC SMALL LETTER KA WITH STROKE +04A1 ; Ll # CYRILLIC 
SMALL LETTER BASHKIR KA +04A3 ; Ll # CYRILLIC SMALL LETTER EN WITH DESCENDER +04A5 ; Ll # CYRILLIC SMALL LIGATURE EN GHE +04A7 ; Ll # CYRILLIC SMALL LETTER PE WITH MIDDLE HOOK +04A9 ; Ll # CYRILLIC SMALL LETTER ABKHASIAN HA +04AB ; Ll # CYRILLIC SMALL LETTER ES WITH DESCENDER +04AD ; Ll # CYRILLIC SMALL LETTER TE WITH DESCENDER +04AF ; Ll # CYRILLIC SMALL LETTER STRAIGHT U +04B1 ; Ll # CYRILLIC SMALL LETTER STRAIGHT U WITH STROKE +04B3 ; Ll # CYRILLIC SMALL LETTER HA WITH DESCENDER +04B5 ; Ll # CYRILLIC SMALL LIGATURE TE TSE +04B7 ; Ll # CYRILLIC SMALL LETTER CHE WITH DESCENDER +04B9 ; Ll # CYRILLIC SMALL LETTER CHE WITH VERTICAL STROKE +04BB ; Ll # CYRILLIC SMALL LETTER SHHA +04BD ; Ll # CYRILLIC SMALL LETTER ABKHASIAN CHE +04BF ; Ll # CYRILLIC SMALL LETTER ABKHASIAN CHE WITH DESCENDER +04C2 ; Ll # CYRILLIC SMALL LETTER ZHE WITH BREVE +04C4 ; Ll # CYRILLIC SMALL LETTER KA WITH HOOK +04C6 ; Ll # CYRILLIC SMALL LETTER EL WITH TAIL +04C8 ; Ll # CYRILLIC SMALL LETTER EN WITH HOOK +04CA ; Ll # CYRILLIC SMALL LETTER EN WITH TAIL +04CC ; Ll # CYRILLIC SMALL LETTER KHAKASSIAN CHE +04CE..04CF ; Ll # [2] CYRILLIC SMALL LETTER EM WITH TAIL..CYRILLIC SMALL LETTER PALOCHKA +04D1 ; Ll # CYRILLIC SMALL LETTER A WITH BREVE +04D3 ; Ll # CYRILLIC SMALL LETTER A WITH DIAERESIS +04D5 ; Ll # CYRILLIC SMALL LIGATURE A IE +04D7 ; Ll # CYRILLIC SMALL LETTER IE WITH BREVE +04D9 ; Ll # CYRILLIC SMALL LETTER SCHWA +04DB ; Ll # CYRILLIC SMALL LETTER SCHWA WITH DIAERESIS +04DD ; Ll # CYRILLIC SMALL LETTER ZHE WITH DIAERESIS +04DF ; Ll # CYRILLIC SMALL LETTER ZE WITH DIAERESIS +04E1 ; Ll # CYRILLIC SMALL LETTER ABKHASIAN DZE +04E3 ; Ll # CYRILLIC SMALL LETTER I WITH MACRON +04E5 ; Ll # CYRILLIC SMALL LETTER I WITH DIAERESIS +04E7 ; Ll # CYRILLIC SMALL LETTER O WITH DIAERESIS +04E9 ; Ll # CYRILLIC SMALL LETTER BARRED O +04EB ; Ll # CYRILLIC SMALL LETTER BARRED O WITH DIAERESIS +04ED ; Ll # CYRILLIC SMALL LETTER E WITH DIAERESIS +04EF ; Ll # CYRILLIC SMALL LETTER U WITH MACRON +04F1 ; Ll # 
CYRILLIC SMALL LETTER U WITH DIAERESIS +04F3 ; Ll # CYRILLIC SMALL LETTER U WITH DOUBLE ACUTE +04F5 ; Ll # CYRILLIC SMALL LETTER CHE WITH DIAERESIS +04F7 ; Ll # CYRILLIC SMALL LETTER GHE WITH DESCENDER +04F9 ; Ll # CYRILLIC SMALL LETTER YERU WITH DIAERESIS +04FB ; Ll # CYRILLIC SMALL LETTER GHE WITH STROKE AND HOOK +04FD ; Ll # CYRILLIC SMALL LETTER HA WITH HOOK +04FF ; Ll # CYRILLIC SMALL LETTER HA WITH STROKE +0501 ; Ll # CYRILLIC SMALL LETTER KOMI DE +0503 ; Ll # CYRILLIC SMALL LETTER KOMI DJE +0505 ; Ll # CYRILLIC SMALL LETTER KOMI ZJE +0507 ; Ll # CYRILLIC SMALL LETTER KOMI DZJE +0509 ; Ll # CYRILLIC SMALL LETTER KOMI LJE +050B ; Ll # CYRILLIC SMALL LETTER KOMI NJE +050D ; Ll # CYRILLIC SMALL LETTER KOMI SJE +050F ; Ll # CYRILLIC SMALL LETTER KOMI TJE +0511 ; Ll # CYRILLIC SMALL LETTER REVERSED ZE +0513 ; Ll # CYRILLIC SMALL LETTER EL WITH HOOK +0515 ; Ll # CYRILLIC SMALL LETTER LHA +0517 ; Ll # CYRILLIC SMALL LETTER RHA +0519 ; Ll # CYRILLIC SMALL LETTER YAE +051B ; Ll # CYRILLIC SMALL LETTER QA +051D ; Ll # CYRILLIC SMALL LETTER WE +051F ; Ll # CYRILLIC SMALL LETTER ALEUT KA +0521 ; Ll # CYRILLIC SMALL LETTER EL WITH MIDDLE HOOK +0523 ; Ll # CYRILLIC SMALL LETTER EN WITH MIDDLE HOOK +0525 ; Ll # CYRILLIC SMALL LETTER PE WITH DESCENDER +0527 ; Ll # CYRILLIC SMALL LETTER SHHA WITH DESCENDER +0529 ; Ll # CYRILLIC SMALL LETTER EN WITH LEFT HOOK +052B ; Ll # CYRILLIC SMALL LETTER DZZHE +052D ; Ll # CYRILLIC SMALL LETTER DCHE +052F ; Ll # CYRILLIC SMALL LETTER EL WITH DESCENDER +0560..0588 ; Ll # [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE +10D0..10FA ; Ll # [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN +10FD..10FF ; Ll # [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN +13F8..13FD ; Ll # [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV +1C80..1C88 ; Ll # [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1D00..1D2B ; Ll # [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL +1D6B..1D77 
; Ll # [13] LATIN SMALL LETTER UE..LATIN SMALL LETTER TURNED G +1D79..1D9A ; Ll # [34] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK +1E01 ; Ll # LATIN SMALL LETTER A WITH RING BELOW +1E03 ; Ll # LATIN SMALL LETTER B WITH DOT ABOVE +1E05 ; Ll # LATIN SMALL LETTER B WITH DOT BELOW +1E07 ; Ll # LATIN SMALL LETTER B WITH LINE BELOW +1E09 ; Ll # LATIN SMALL LETTER C WITH CEDILLA AND ACUTE +1E0B ; Ll # LATIN SMALL LETTER D WITH DOT ABOVE +1E0D ; Ll # LATIN SMALL LETTER D WITH DOT BELOW +1E0F ; Ll # LATIN SMALL LETTER D WITH LINE BELOW +1E11 ; Ll # LATIN SMALL LETTER D WITH CEDILLA +1E13 ; Ll # LATIN SMALL LETTER D WITH CIRCUMFLEX BELOW +1E15 ; Ll # LATIN SMALL LETTER E WITH MACRON AND GRAVE +1E17 ; Ll # LATIN SMALL LETTER E WITH MACRON AND ACUTE +1E19 ; Ll # LATIN SMALL LETTER E WITH CIRCUMFLEX BELOW +1E1B ; Ll # LATIN SMALL LETTER E WITH TILDE BELOW +1E1D ; Ll # LATIN SMALL LETTER E WITH CEDILLA AND BREVE +1E1F ; Ll # LATIN SMALL LETTER F WITH DOT ABOVE +1E21 ; Ll # LATIN SMALL LETTER G WITH MACRON +1E23 ; Ll # LATIN SMALL LETTER H WITH DOT ABOVE +1E25 ; Ll # LATIN SMALL LETTER H WITH DOT BELOW +1E27 ; Ll # LATIN SMALL LETTER H WITH DIAERESIS +1E29 ; Ll # LATIN SMALL LETTER H WITH CEDILLA +1E2B ; Ll # LATIN SMALL LETTER H WITH BREVE BELOW +1E2D ; Ll # LATIN SMALL LETTER I WITH TILDE BELOW +1E2F ; Ll # LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE +1E31 ; Ll # LATIN SMALL LETTER K WITH ACUTE +1E33 ; Ll # LATIN SMALL LETTER K WITH DOT BELOW +1E35 ; Ll # LATIN SMALL LETTER K WITH LINE BELOW +1E37 ; Ll # LATIN SMALL LETTER L WITH DOT BELOW +1E39 ; Ll # LATIN SMALL LETTER L WITH DOT BELOW AND MACRON +1E3B ; Ll # LATIN SMALL LETTER L WITH LINE BELOW +1E3D ; Ll # LATIN SMALL LETTER L WITH CIRCUMFLEX BELOW +1E3F ; Ll # LATIN SMALL LETTER M WITH ACUTE +1E41 ; Ll # LATIN SMALL LETTER M WITH DOT ABOVE +1E43 ; Ll # LATIN SMALL LETTER M WITH DOT BELOW +1E45 ; Ll # LATIN SMALL LETTER N WITH DOT ABOVE +1E47 ; Ll # LATIN SMALL LETTER N WITH DOT BELOW +1E49 ; 
Ll # LATIN SMALL LETTER N WITH LINE BELOW +1E4B ; Ll # LATIN SMALL LETTER N WITH CIRCUMFLEX BELOW +1E4D ; Ll # LATIN SMALL LETTER O WITH TILDE AND ACUTE +1E4F ; Ll # LATIN SMALL LETTER O WITH TILDE AND DIAERESIS +1E51 ; Ll # LATIN SMALL LETTER O WITH MACRON AND GRAVE +1E53 ; Ll # LATIN SMALL LETTER O WITH MACRON AND ACUTE +1E55 ; Ll # LATIN SMALL LETTER P WITH ACUTE +1E57 ; Ll # LATIN SMALL LETTER P WITH DOT ABOVE +1E59 ; Ll # LATIN SMALL LETTER R WITH DOT ABOVE +1E5B ; Ll # LATIN SMALL LETTER R WITH DOT BELOW +1E5D ; Ll # LATIN SMALL LETTER R WITH DOT BELOW AND MACRON +1E5F ; Ll # LATIN SMALL LETTER R WITH LINE BELOW +1E61 ; Ll # LATIN SMALL LETTER S WITH DOT ABOVE +1E63 ; Ll # LATIN SMALL LETTER S WITH DOT BELOW +1E65 ; Ll # LATIN SMALL LETTER S WITH ACUTE AND DOT ABOVE +1E67 ; Ll # LATIN SMALL LETTER S WITH CARON AND DOT ABOVE +1E69 ; Ll # LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE +1E6B ; Ll # LATIN SMALL LETTER T WITH DOT ABOVE +1E6D ; Ll # LATIN SMALL LETTER T WITH DOT BELOW +1E6F ; Ll # LATIN SMALL LETTER T WITH LINE BELOW +1E71 ; Ll # LATIN SMALL LETTER T WITH CIRCUMFLEX BELOW +1E73 ; Ll # LATIN SMALL LETTER U WITH DIAERESIS BELOW +1E75 ; Ll # LATIN SMALL LETTER U WITH TILDE BELOW +1E77 ; Ll # LATIN SMALL LETTER U WITH CIRCUMFLEX BELOW +1E79 ; Ll # LATIN SMALL LETTER U WITH TILDE AND ACUTE +1E7B ; Ll # LATIN SMALL LETTER U WITH MACRON AND DIAERESIS +1E7D ; Ll # LATIN SMALL LETTER V WITH TILDE +1E7F ; Ll # LATIN SMALL LETTER V WITH DOT BELOW +1E81 ; Ll # LATIN SMALL LETTER W WITH GRAVE +1E83 ; Ll # LATIN SMALL LETTER W WITH ACUTE +1E85 ; Ll # LATIN SMALL LETTER W WITH DIAERESIS +1E87 ; Ll # LATIN SMALL LETTER W WITH DOT ABOVE +1E89 ; Ll # LATIN SMALL LETTER W WITH DOT BELOW +1E8B ; Ll # LATIN SMALL LETTER X WITH DOT ABOVE +1E8D ; Ll # LATIN SMALL LETTER X WITH DIAERESIS +1E8F ; Ll # LATIN SMALL LETTER Y WITH DOT ABOVE +1E91 ; Ll # LATIN SMALL LETTER Z WITH CIRCUMFLEX +1E93 ; Ll # LATIN SMALL LETTER Z WITH DOT BELOW +1E95..1E9D ; Ll # [9] LATIN SMALL 
LETTER Z WITH LINE BELOW..LATIN SMALL LETTER LONG S WITH HIGH STROKE +1E9F ; Ll # LATIN SMALL LETTER DELTA +1EA1 ; Ll # LATIN SMALL LETTER A WITH DOT BELOW +1EA3 ; Ll # LATIN SMALL LETTER A WITH HOOK ABOVE +1EA5 ; Ll # LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACUTE +1EA7 ; Ll # LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRAVE +1EA9 ; Ll # LATIN SMALL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE +1EAB ; Ll # LATIN SMALL LETTER A WITH CIRCUMFLEX AND TILDE +1EAD ; Ll # LATIN SMALL LETTER A WITH CIRCUMFLEX AND DOT BELOW +1EAF ; Ll # LATIN SMALL LETTER A WITH BREVE AND ACUTE +1EB1 ; Ll # LATIN SMALL LETTER A WITH BREVE AND GRAVE +1EB3 ; Ll # LATIN SMALL LETTER A WITH BREVE AND HOOK ABOVE +1EB5 ; Ll # LATIN SMALL LETTER A WITH BREVE AND TILDE +1EB7 ; Ll # LATIN SMALL LETTER A WITH BREVE AND DOT BELOW +1EB9 ; Ll # LATIN SMALL LETTER E WITH DOT BELOW +1EBB ; Ll # LATIN SMALL LETTER E WITH HOOK ABOVE +1EBD ; Ll # LATIN SMALL LETTER E WITH TILDE +1EBF ; Ll # LATIN SMALL LETTER E WITH CIRCUMFLEX AND ACUTE +1EC1 ; Ll # LATIN SMALL LETTER E WITH CIRCUMFLEX AND GRAVE +1EC3 ; Ll # LATIN SMALL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE +1EC5 ; Ll # LATIN SMALL LETTER E WITH CIRCUMFLEX AND TILDE +1EC7 ; Ll # LATIN SMALL LETTER E WITH CIRCUMFLEX AND DOT BELOW +1EC9 ; Ll # LATIN SMALL LETTER I WITH HOOK ABOVE +1ECB ; Ll # LATIN SMALL LETTER I WITH DOT BELOW +1ECD ; Ll # LATIN SMALL LETTER O WITH DOT BELOW +1ECF ; Ll # LATIN SMALL LETTER O WITH HOOK ABOVE +1ED1 ; Ll # LATIN SMALL LETTER O WITH CIRCUMFLEX AND ACUTE +1ED3 ; Ll # LATIN SMALL LETTER O WITH CIRCUMFLEX AND GRAVE +1ED5 ; Ll # LATIN SMALL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE +1ED7 ; Ll # LATIN SMALL LETTER O WITH CIRCUMFLEX AND TILDE +1ED9 ; Ll # LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW +1EDB ; Ll # LATIN SMALL LETTER O WITH HORN AND ACUTE +1EDD ; Ll # LATIN SMALL LETTER O WITH HORN AND GRAVE +1EDF ; Ll # LATIN SMALL LETTER O WITH HORN AND HOOK ABOVE +1EE1 ; Ll # LATIN SMALL LETTER O WITH HORN AND TILDE +1EE3 ; Ll # LATIN 
SMALL LETTER O WITH HORN AND DOT BELOW +1EE5 ; Ll # LATIN SMALL LETTER U WITH DOT BELOW +1EE7 ; Ll # LATIN SMALL LETTER U WITH HOOK ABOVE +1EE9 ; Ll # LATIN SMALL LETTER U WITH HORN AND ACUTE +1EEB ; Ll # LATIN SMALL LETTER U WITH HORN AND GRAVE +1EED ; Ll # LATIN SMALL LETTER U WITH HORN AND HOOK ABOVE +1EEF ; Ll # LATIN SMALL LETTER U WITH HORN AND TILDE +1EF1 ; Ll # LATIN SMALL LETTER U WITH HORN AND DOT BELOW +1EF3 ; Ll # LATIN SMALL LETTER Y WITH GRAVE +1EF5 ; Ll # LATIN SMALL LETTER Y WITH DOT BELOW +1EF7 ; Ll # LATIN SMALL LETTER Y WITH HOOK ABOVE +1EF9 ; Ll # LATIN SMALL LETTER Y WITH TILDE +1EFB ; Ll # LATIN SMALL LETTER MIDDLE-WELSH LL +1EFD ; Ll # LATIN SMALL LETTER MIDDLE-WELSH V +1EFF..1F07 ; Ll # [9] LATIN SMALL LETTER Y WITH LOOP..GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI +1F10..1F15 ; Ll # [6] GREEK SMALL LETTER EPSILON WITH PSILI..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA +1F20..1F27 ; Ll # [8] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI +1F30..1F37 ; Ll # [8] GREEK SMALL LETTER IOTA WITH PSILI..GREEK SMALL LETTER IOTA WITH DASIA AND PERISPOMENI +1F40..1F45 ; Ll # [6] GREEK SMALL LETTER OMICRON WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA +1F50..1F57 ; Ll # [8] GREEK SMALL LETTER UPSILON WITH PSILI..GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI +1F60..1F67 ; Ll # [8] GREEK SMALL LETTER OMEGA WITH PSILI..GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI +1F70..1F7D ; Ll # [14] GREEK SMALL LETTER ALPHA WITH VARIA..GREEK SMALL LETTER OMEGA WITH OXIA +1F80..1F87 ; Ll # [8] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1F90..1F97 ; Ll # [8] GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1FA0..1FA7 ; Ll # [8] GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND 
YPOGEGRAMMENI +1FB0..1FB4 ; Ll # [5] GREEK SMALL LETTER ALPHA WITH VRACHY..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI +1FB6..1FB7 ; Ll # [2] GREEK SMALL LETTER ALPHA WITH PERISPOMENI..GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI +1FBE ; Ll # GREEK PROSGEGRAMMENI +1FC2..1FC4 ; Ll # [3] GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI +1FC6..1FC7 ; Ll # [2] GREEK SMALL LETTER ETA WITH PERISPOMENI..GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI +1FD0..1FD3 ; Ll # [4] GREEK SMALL LETTER IOTA WITH VRACHY..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA +1FD6..1FD7 ; Ll # [2] GREEK SMALL LETTER IOTA WITH PERISPOMENI..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI +1FE0..1FE7 ; Ll # [8] GREEK SMALL LETTER UPSILON WITH VRACHY..GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI +1FF2..1FF4 ; Ll # [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI +1FF6..1FF7 ; Ll # [2] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI +210A ; Ll # SCRIPT SMALL G +210E..210F ; Ll # [2] PLANCK CONSTANT..PLANCK CONSTANT OVER TWO PI +2113 ; Ll # SCRIPT SMALL L +212F ; Ll # SCRIPT SMALL E +2134 ; Ll # SCRIPT SMALL O +2139 ; Ll # INFORMATION SOURCE +213C..213D ; Ll # [2] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK SMALL GAMMA +2146..2149 ; Ll # [4] DOUBLE-STRUCK ITALIC SMALL D..DOUBLE-STRUCK ITALIC SMALL J +214E ; Ll # TURNED SMALL F +2184 ; Ll # LATIN SMALL LETTER REVERSED C +2C30..2C5E ; Ll # [47] GLAGOLITIC SMALL LETTER AZU..GLAGOLITIC SMALL LETTER LATINATE MYSLITE +2C61 ; Ll # LATIN SMALL LETTER L WITH DOUBLE BAR +2C65..2C66 ; Ll # [2] LATIN SMALL LETTER A WITH STROKE..LATIN SMALL LETTER T WITH DIAGONAL STROKE +2C68 ; Ll # LATIN SMALL LETTER H WITH DESCENDER +2C6A ; Ll # LATIN SMALL LETTER K WITH DESCENDER +2C6C ; Ll # LATIN SMALL LETTER Z WITH DESCENDER +2C71 ; Ll # LATIN SMALL 
LETTER V WITH RIGHT HOOK +2C73..2C74 ; Ll # [2] LATIN SMALL LETTER W WITH HOOK..LATIN SMALL LETTER V WITH CURL +2C76..2C7B ; Ll # [6] LATIN SMALL LETTER HALF H..LATIN LETTER SMALL CAPITAL TURNED E +2C81 ; Ll # COPTIC SMALL LETTER ALFA +2C83 ; Ll # COPTIC SMALL LETTER VIDA +2C85 ; Ll # COPTIC SMALL LETTER GAMMA +2C87 ; Ll # COPTIC SMALL LETTER DALDA +2C89 ; Ll # COPTIC SMALL LETTER EIE +2C8B ; Ll # COPTIC SMALL LETTER SOU +2C8D ; Ll # COPTIC SMALL LETTER ZATA +2C8F ; Ll # COPTIC SMALL LETTER HATE +2C91 ; Ll # COPTIC SMALL LETTER THETHE +2C93 ; Ll # COPTIC SMALL LETTER IAUDA +2C95 ; Ll # COPTIC SMALL LETTER KAPA +2C97 ; Ll # COPTIC SMALL LETTER LAULA +2C99 ; Ll # COPTIC SMALL LETTER MI +2C9B ; Ll # COPTIC SMALL LETTER NI +2C9D ; Ll # COPTIC SMALL LETTER KSI +2C9F ; Ll # COPTIC SMALL LETTER O +2CA1 ; Ll # COPTIC SMALL LETTER PI +2CA3 ; Ll # COPTIC SMALL LETTER RO +2CA5 ; Ll # COPTIC SMALL LETTER SIMA +2CA7 ; Ll # COPTIC SMALL LETTER TAU +2CA9 ; Ll # COPTIC SMALL LETTER UA +2CAB ; Ll # COPTIC SMALL LETTER FI +2CAD ; Ll # COPTIC SMALL LETTER KHI +2CAF ; Ll # COPTIC SMALL LETTER PSI +2CB1 ; Ll # COPTIC SMALL LETTER OOU +2CB3 ; Ll # COPTIC SMALL LETTER DIALECT-P ALEF +2CB5 ; Ll # COPTIC SMALL LETTER OLD COPTIC AIN +2CB7 ; Ll # COPTIC SMALL LETTER CRYPTOGRAMMIC EIE +2CB9 ; Ll # COPTIC SMALL LETTER DIALECT-P KAPA +2CBB ; Ll # COPTIC SMALL LETTER DIALECT-P NI +2CBD ; Ll # COPTIC SMALL LETTER CRYPTOGRAMMIC NI +2CBF ; Ll # COPTIC SMALL LETTER OLD COPTIC OOU +2CC1 ; Ll # COPTIC SMALL LETTER SAMPI +2CC3 ; Ll # COPTIC SMALL LETTER CROSSED SHEI +2CC5 ; Ll # COPTIC SMALL LETTER OLD COPTIC SHEI +2CC7 ; Ll # COPTIC SMALL LETTER OLD COPTIC ESH +2CC9 ; Ll # COPTIC SMALL LETTER AKHMIMIC KHEI +2CCB ; Ll # COPTIC SMALL LETTER DIALECT-P HORI +2CCD ; Ll # COPTIC SMALL LETTER OLD COPTIC HORI +2CCF ; Ll # COPTIC SMALL LETTER OLD COPTIC HA +2CD1 ; Ll # COPTIC SMALL LETTER L-SHAPED HA +2CD3 ; Ll # COPTIC SMALL LETTER OLD COPTIC HEI +2CD5 ; Ll # COPTIC SMALL LETTER OLD COPTIC HAT +2CD7 ; Ll # 
COPTIC SMALL LETTER OLD COPTIC GANGIA +2CD9 ; Ll # COPTIC SMALL LETTER OLD COPTIC DJA +2CDB ; Ll # COPTIC SMALL LETTER OLD COPTIC SHIMA +2CDD ; Ll # COPTIC SMALL LETTER OLD NUBIAN SHIMA +2CDF ; Ll # COPTIC SMALL LETTER OLD NUBIAN NGI +2CE1 ; Ll # COPTIC SMALL LETTER OLD NUBIAN NYI +2CE3..2CE4 ; Ll # [2] COPTIC SMALL LETTER OLD NUBIAN WAU..COPTIC SYMBOL KAI +2CEC ; Ll # COPTIC SMALL LETTER CRYPTOGRAMMIC SHEI +2CEE ; Ll # COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA +2CF3 ; Ll # COPTIC SMALL LETTER BOHAIRIC KHEI +2D00..2D25 ; Ll # [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE +2D27 ; Ll # GEORGIAN SMALL LETTER YN +2D2D ; Ll # GEORGIAN SMALL LETTER AEN +A641 ; Ll # CYRILLIC SMALL LETTER ZEMLYA +A643 ; Ll # CYRILLIC SMALL LETTER DZELO +A645 ; Ll # CYRILLIC SMALL LETTER REVERSED DZE +A647 ; Ll # CYRILLIC SMALL LETTER IOTA +A649 ; Ll # CYRILLIC SMALL LETTER DJERV +A64B ; Ll # CYRILLIC SMALL LETTER MONOGRAPH UK +A64D ; Ll # CYRILLIC SMALL LETTER BROAD OMEGA +A64F ; Ll # CYRILLIC SMALL LETTER NEUTRAL YER +A651 ; Ll # CYRILLIC SMALL LETTER YERU WITH BACK YER +A653 ; Ll # CYRILLIC SMALL LETTER IOTIFIED YAT +A655 ; Ll # CYRILLIC SMALL LETTER REVERSED YU +A657 ; Ll # CYRILLIC SMALL LETTER IOTIFIED A +A659 ; Ll # CYRILLIC SMALL LETTER CLOSED LITTLE YUS +A65B ; Ll # CYRILLIC SMALL LETTER BLENDED YUS +A65D ; Ll # CYRILLIC SMALL LETTER IOTIFIED CLOSED LITTLE YUS +A65F ; Ll # CYRILLIC SMALL LETTER YN +A661 ; Ll # CYRILLIC SMALL LETTER REVERSED TSE +A663 ; Ll # CYRILLIC SMALL LETTER SOFT DE +A665 ; Ll # CYRILLIC SMALL LETTER SOFT EL +A667 ; Ll # CYRILLIC SMALL LETTER SOFT EM +A669 ; Ll # CYRILLIC SMALL LETTER MONOCULAR O +A66B ; Ll # CYRILLIC SMALL LETTER BINOCULAR O +A66D ; Ll # CYRILLIC SMALL LETTER DOUBLE MONOCULAR O +A681 ; Ll # CYRILLIC SMALL LETTER DWE +A683 ; Ll # CYRILLIC SMALL LETTER DZWE +A685 ; Ll # CYRILLIC SMALL LETTER ZHWE +A687 ; Ll # CYRILLIC SMALL LETTER CCHE +A689 ; Ll # CYRILLIC SMALL LETTER DZZE +A68B ; Ll # CYRILLIC SMALL LETTER TE WITH MIDDLE HOOK +A68D 
; Ll # CYRILLIC SMALL LETTER TWE +A68F ; Ll # CYRILLIC SMALL LETTER TSWE +A691 ; Ll # CYRILLIC SMALL LETTER TSSE +A693 ; Ll # CYRILLIC SMALL LETTER TCHE +A695 ; Ll # CYRILLIC SMALL LETTER HWE +A697 ; Ll # CYRILLIC SMALL LETTER SHWE +A699 ; Ll # CYRILLIC SMALL LETTER DOUBLE O +A69B ; Ll # CYRILLIC SMALL LETTER CROSSED O +A723 ; Ll # LATIN SMALL LETTER EGYPTOLOGICAL ALEF +A725 ; Ll # LATIN SMALL LETTER EGYPTOLOGICAL AIN +A727 ; Ll # LATIN SMALL LETTER HENG +A729 ; Ll # LATIN SMALL LETTER TZ +A72B ; Ll # LATIN SMALL LETTER TRESILLO +A72D ; Ll # LATIN SMALL LETTER CUATRILLO +A72F..A731 ; Ll # [3] LATIN SMALL LETTER CUATRILLO WITH COMMA..LATIN LETTER SMALL CAPITAL S +A733 ; Ll # LATIN SMALL LETTER AA +A735 ; Ll # LATIN SMALL LETTER AO +A737 ; Ll # LATIN SMALL LETTER AU +A739 ; Ll # LATIN SMALL LETTER AV +A73B ; Ll # LATIN SMALL LETTER AV WITH HORIZONTAL BAR +A73D ; Ll # LATIN SMALL LETTER AY +A73F ; Ll # LATIN SMALL LETTER REVERSED C WITH DOT +A741 ; Ll # LATIN SMALL LETTER K WITH STROKE +A743 ; Ll # LATIN SMALL LETTER K WITH DIAGONAL STROKE +A745 ; Ll # LATIN SMALL LETTER K WITH STROKE AND DIAGONAL STROKE +A747 ; Ll # LATIN SMALL LETTER BROKEN L +A749 ; Ll # LATIN SMALL LETTER L WITH HIGH STROKE +A74B ; Ll # LATIN SMALL LETTER O WITH LONG STROKE OVERLAY +A74D ; Ll # LATIN SMALL LETTER O WITH LOOP +A74F ; Ll # LATIN SMALL LETTER OO +A751 ; Ll # LATIN SMALL LETTER P WITH STROKE THROUGH DESCENDER +A753 ; Ll # LATIN SMALL LETTER P WITH FLOURISH +A755 ; Ll # LATIN SMALL LETTER P WITH SQUIRREL TAIL +A757 ; Ll # LATIN SMALL LETTER Q WITH STROKE THROUGH DESCENDER +A759 ; Ll # LATIN SMALL LETTER Q WITH DIAGONAL STROKE +A75B ; Ll # LATIN SMALL LETTER R ROTUNDA +A75D ; Ll # LATIN SMALL LETTER RUM ROTUNDA +A75F ; Ll # LATIN SMALL LETTER V WITH DIAGONAL STROKE +A761 ; Ll # LATIN SMALL LETTER VY +A763 ; Ll # LATIN SMALL LETTER VISIGOTHIC Z +A765 ; Ll # LATIN SMALL LETTER THORN WITH STROKE +A767 ; Ll # LATIN SMALL LETTER THORN WITH STROKE THROUGH DESCENDER +A769 ; Ll # LATIN SMALL 
LETTER VEND +A76B ; Ll # LATIN SMALL LETTER ET +A76D ; Ll # LATIN SMALL LETTER IS +A76F ; Ll # LATIN SMALL LETTER CON +A771..A778 ; Ll # [8] LATIN SMALL LETTER DUM..LATIN SMALL LETTER UM +A77A ; Ll # LATIN SMALL LETTER INSULAR D +A77C ; Ll # LATIN SMALL LETTER INSULAR F +A77F ; Ll # LATIN SMALL LETTER TURNED INSULAR G +A781 ; Ll # LATIN SMALL LETTER TURNED L +A783 ; Ll # LATIN SMALL LETTER INSULAR R +A785 ; Ll # LATIN SMALL LETTER INSULAR S +A787 ; Ll # LATIN SMALL LETTER INSULAR T +A78C ; Ll # LATIN SMALL LETTER SALTILLO +A78E ; Ll # LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT +A791 ; Ll # LATIN SMALL LETTER N WITH DESCENDER +A793..A795 ; Ll # [3] LATIN SMALL LETTER C WITH BAR..LATIN SMALL LETTER H WITH PALATAL HOOK +A797 ; Ll # LATIN SMALL LETTER B WITH FLOURISH +A799 ; Ll # LATIN SMALL LETTER F WITH STROKE +A79B ; Ll # LATIN SMALL LETTER VOLAPUK AE +A79D ; Ll # LATIN SMALL LETTER VOLAPUK OE +A79F ; Ll # LATIN SMALL LETTER VOLAPUK UE +A7A1 ; Ll # LATIN SMALL LETTER G WITH OBLIQUE STROKE +A7A3 ; Ll # LATIN SMALL LETTER K WITH OBLIQUE STROKE +A7A5 ; Ll # LATIN SMALL LETTER N WITH OBLIQUE STROKE +A7A7 ; Ll # LATIN SMALL LETTER R WITH OBLIQUE STROKE +A7A9 ; Ll # LATIN SMALL LETTER S WITH OBLIQUE STROKE +A7AF ; Ll # LATIN LETTER SMALL CAPITAL Q +A7B5 ; Ll # LATIN SMALL LETTER BETA +A7B7 ; Ll # LATIN SMALL LETTER OMEGA +A7B9 ; Ll # LATIN SMALL LETTER U WITH STROKE +A7BB ; Ll # LATIN SMALL LETTER GLOTTAL A +A7BD ; Ll # LATIN SMALL LETTER GLOTTAL I +A7BF ; Ll # LATIN SMALL LETTER GLOTTAL U +A7C3 ; Ll # LATIN SMALL LETTER ANGLICANA W +A7FA ; Ll # LATIN LETTER SMALL CAPITAL TURNED M +AB30..AB5A ; Ll # [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG +AB60..AB67 ; Ll # [8] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TS DIGRAPH WITH RETROFLEX HOOK +AB70..ABBF ; Ll # [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA +FB00..FB06 ; Ll # [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST +FB13..FB17 ; Ll # [5] ARMENIAN SMALL 
LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH +FF41..FF5A ; Ll # [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z +10428..1044F ; Ll # [40] DESERET SMALL LETTER LONG I..DESERET SMALL LETTER EW +104D8..104FB ; Ll # [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA +10CC0..10CF2 ; Ll # [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US +118C0..118DF ; Ll # [32] WARANG CITI SMALL LETTER NGAA..WARANG CITI SMALL LETTER VIYO +16E60..16E7F ; Ll # [32] MEDEFAIDRIN SMALL LETTER M..MEDEFAIDRIN SMALL LETTER Y +1D41A..1D433 ; Ll # [26] MATHEMATICAL BOLD SMALL A..MATHEMATICAL BOLD SMALL Z +1D44E..1D454 ; Ll # [7] MATHEMATICAL ITALIC SMALL A..MATHEMATICAL ITALIC SMALL G +1D456..1D467 ; Ll # [18] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL ITALIC SMALL Z +1D482..1D49B ; Ll # [26] MATHEMATICAL BOLD ITALIC SMALL A..MATHEMATICAL BOLD ITALIC SMALL Z +1D4B6..1D4B9 ; Ll # [4] MATHEMATICAL SCRIPT SMALL A..MATHEMATICAL SCRIPT SMALL D +1D4BB ; Ll # MATHEMATICAL SCRIPT SMALL F +1D4BD..1D4C3 ; Ll # [7] MATHEMATICAL SCRIPT SMALL H..MATHEMATICAL SCRIPT SMALL N +1D4C5..1D4CF ; Ll # [11] MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL SCRIPT SMALL Z +1D4EA..1D503 ; Ll # [26] MATHEMATICAL BOLD SCRIPT SMALL A..MATHEMATICAL BOLD SCRIPT SMALL Z +1D51E..1D537 ; Ll # [26] MATHEMATICAL FRAKTUR SMALL A..MATHEMATICAL FRAKTUR SMALL Z +1D552..1D56B ; Ll # [26] MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMATICAL DOUBLE-STRUCK SMALL Z +1D586..1D59F ; Ll # [26] MATHEMATICAL BOLD FRAKTUR SMALL A..MATHEMATICAL BOLD FRAKTUR SMALL Z +1D5BA..1D5D3 ; Ll # [26] MATHEMATICAL SANS-SERIF SMALL A..MATHEMATICAL SANS-SERIF SMALL Z +1D5EE..1D607 ; Ll # [26] MATHEMATICAL SANS-SERIF BOLD SMALL A..MATHEMATICAL SANS-SERIF BOLD SMALL Z +1D622..1D63B ; Ll # [26] MATHEMATICAL SANS-SERIF ITALIC SMALL A..MATHEMATICAL SANS-SERIF ITALIC SMALL Z +1D656..1D66F ; Ll # [26] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL A..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL Z +1D68A..1D6A5 ; Ll # [28] MATHEMATICAL MONOSPACE 
SMALL A..MATHEMATICAL ITALIC SMALL DOTLESS J +1D6C2..1D6DA ; Ll # [25] MATHEMATICAL BOLD SMALL ALPHA..MATHEMATICAL BOLD SMALL OMEGA +1D6DC..1D6E1 ; Ll # [6] MATHEMATICAL BOLD EPSILON SYMBOL..MATHEMATICAL BOLD PI SYMBOL +1D6FC..1D714 ; Ll # [25] MATHEMATICAL ITALIC SMALL ALPHA..MATHEMATICAL ITALIC SMALL OMEGA +1D716..1D71B ; Ll # [6] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL ITALIC PI SYMBOL +1D736..1D74E ; Ll # [25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA +1D750..1D755 ; Ll # [6] MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC PI SYMBOL +1D770..1D788 ; Ll # [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA +1D78A..1D78F ; Ll # [6] MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD PI SYMBOL +1D7AA..1D7C2 ; Ll # [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA +1D7C4..1D7C9 ; Ll # [6] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL +1D7CB ; Ll # MATHEMATICAL BOLD SMALL DIGAMMA +1E922..1E943 ; Ll # [34] ADLAM SMALL LETTER ALIF..ADLAM SMALL LETTER SHA + +# Total code points: 2151 + +# ================================================ + +# General_Category=Titlecase_Letter + +01C5 ; Lt # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON +01C8 ; Lt # LATIN CAPITAL LETTER L WITH SMALL LETTER J +01CB ; Lt # LATIN CAPITAL LETTER N WITH SMALL LETTER J +01F2 ; Lt # LATIN CAPITAL LETTER D WITH SMALL LETTER Z +1F88..1F8F ; Lt # [8] GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI..GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F98..1F9F ; Lt # [8] GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI..GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FA8..1FAF ; Lt # [8] GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI..GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND 
PROSGEGRAMMENI +1FBC ; Lt # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI +1FCC ; Lt # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI +1FFC ; Lt # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI + +# Total code points: 31 + +# ================================================ + +# General_Category=Modifier_Letter + +02B0..02C1 ; Lm # [18] MODIFIER LETTER SMALL H..MODIFIER LETTER REVERSED GLOTTAL STOP +02C6..02D1 ; Lm # [12] MODIFIER LETTER CIRCUMFLEX ACCENT..MODIFIER LETTER HALF TRIANGULAR COLON +02E0..02E4 ; Lm # [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP +02EC ; Lm # MODIFIER LETTER VOICING +02EE ; Lm # MODIFIER LETTER DOUBLE APOSTROPHE +0374 ; Lm # GREEK NUMERAL SIGN +037A ; Lm # GREEK YPOGEGRAMMENI +0559 ; Lm # ARMENIAN MODIFIER LETTER LEFT HALF RING +0640 ; Lm # ARABIC TATWEEL +06E5..06E6 ; Lm # [2] ARABIC SMALL WAW..ARABIC SMALL YEH +07F4..07F5 ; Lm # [2] NKO HIGH TONE APOSTROPHE..NKO LOW TONE APOSTROPHE +07FA ; Lm # NKO LAJANYALAN +081A ; Lm # SAMARITAN MODIFIER LETTER EPENTHETIC YUT +0824 ; Lm # SAMARITAN MODIFIER LETTER SHORT A +0828 ; Lm # SAMARITAN MODIFIER LETTER I +0971 ; Lm # DEVANAGARI SIGN HIGH SPACING DOT +0E46 ; Lm # THAI CHARACTER MAIYAMOK +0EC6 ; Lm # LAO KO LA +10FC ; Lm # MODIFIER LETTER GEORGIAN NAR +17D7 ; Lm # KHMER SIGN LEK TOO +1843 ; Lm # MONGOLIAN LETTER TODO LONG VOWEL SIGN +1AA7 ; Lm # TAI THAM SIGN MAI YAMOK +1C78..1C7D ; Lm # [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD +1D2C..1D6A ; Lm # [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI +1D78 ; Lm # MODIFIER LETTER CYRILLIC EN +1D9B..1DBF ; Lm # [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA +2071 ; Lm # SUPERSCRIPT LATIN SMALL LETTER I +207F ; Lm # SUPERSCRIPT LATIN SMALL LETTER N +2090..209C ; Lm # [13] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T +2C7C..2C7D ; Lm # [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V +2D6F ; Lm # TIFINAGH MODIFIER LETTER LABIALIZATION MARK +2E2F ; Lm # 
VERTICAL TILDE +3005 ; Lm # IDEOGRAPHIC ITERATION MARK +3031..3035 ; Lm # [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF +303B ; Lm # VERTICAL IDEOGRAPHIC ITERATION MARK +309D..309E ; Lm # [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK +30FC..30FE ; Lm # [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK +A015 ; Lm # YI SYLLABLE WU +A4F8..A4FD ; Lm # [6] LISU LETTER TONE MYA TI..LISU LETTER TONE MYA JEU +A60C ; Lm # VAI SYLLABLE LENGTHENER +A67F ; Lm # CYRILLIC PAYEROK +A69C..A69D ; Lm # [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN +A717..A71F ; Lm # [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK +A770 ; Lm # MODIFIER LETTER US +A788 ; Lm # MODIFIER LETTER LOW CIRCUMFLEX ACCENT +A7F8..A7F9 ; Lm # [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE +A9CF ; Lm # JAVANESE PANGRANGKEP +A9E6 ; Lm # MYANMAR MODIFIER LETTER SHAN REDUPLICATION +AA70 ; Lm # MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION +AADD ; Lm # TAI VIET SYMBOL SAM +AAF3..AAF4 ; Lm # [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK +AB5C..AB5F ; Lm # [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK +FF70 ; Lm # HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK +FF9E..FF9F ; Lm # [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK +16B40..16B43 ; Lm # [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM +16F93..16F9F ; Lm # [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 +16FE0..16FE1 ; Lm # [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK +16FE3 ; Lm # OLD CHINESE ITERATION MARK +1E137..1E13D ; Lm # [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER +1E94B ; Lm # ADLAM NASALIZATION MARK + +# Total code points: 259 + +# ================================================ + +# General_Category=Other_Letter + +00AA ; Lo # FEMININE ORDINAL 
INDICATOR +00BA ; Lo # MASCULINE ORDINAL INDICATOR +01BB ; Lo # LATIN LETTER TWO WITH STROKE +01C0..01C3 ; Lo # [4] LATIN LETTER DENTAL CLICK..LATIN LETTER RETROFLEX CLICK +0294 ; Lo # LATIN LETTER GLOTTAL STOP +05D0..05EA ; Lo # [27] HEBREW LETTER ALEF..HEBREW LETTER TAV +05EF..05F2 ; Lo # [4] HEBREW YOD TRIANGLE..HEBREW LIGATURE YIDDISH DOUBLE YOD +0620..063F ; Lo # [32] ARABIC LETTER KASHMIRI YEH..ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE +0641..064A ; Lo # [10] ARABIC LETTER FEH..ARABIC LETTER YEH +066E..066F ; Lo # [2] ARABIC LETTER DOTLESS BEH..ARABIC LETTER DOTLESS QAF +0671..06D3 ; Lo # [99] ARABIC LETTER ALEF WASLA..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE +06D5 ; Lo # ARABIC LETTER AE +06EE..06EF ; Lo # [2] ARABIC LETTER DAL WITH INVERTED V..ARABIC LETTER REH WITH INVERTED V +06FA..06FC ; Lo # [3] ARABIC LETTER SHEEN WITH DOT BELOW..ARABIC LETTER GHAIN WITH DOT BELOW +06FF ; Lo # ARABIC LETTER HEH WITH INVERTED V +0710 ; Lo # SYRIAC LETTER ALAPH +0712..072F ; Lo # [30] SYRIAC LETTER BETH..SYRIAC LETTER PERSIAN DHALATH +074D..07A5 ; Lo # [89] SYRIAC LETTER SOGDIAN ZHAIN..THAANA LETTER WAAVU +07B1 ; Lo # THAANA LETTER NAA +07CA..07EA ; Lo # [33] NKO LETTER A..NKO LETTER JONA RA +0800..0815 ; Lo # [22] SAMARITAN LETTER ALAF..SAMARITAN LETTER TAAF +0840..0858 ; Lo # [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN +0860..086A ; Lo # [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA +08A0..08B4 ; Lo # [21] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER KAF WITH DOT BELOW +08B6..08BD ; Lo # [8] ARABIC LETTER BEH WITH SMALL MEEM ABOVE..ARABIC LETTER AFRICAN NOON +0904..0939 ; Lo # [54] DEVANAGARI LETTER SHORT A..DEVANAGARI LETTER HA +093D ; Lo # DEVANAGARI SIGN AVAGRAHA +0950 ; Lo # DEVANAGARI OM +0958..0961 ; Lo # [10] DEVANAGARI LETTER QA..DEVANAGARI LETTER VOCALIC LL +0972..0980 ; Lo # [15] DEVANAGARI LETTER CANDRA A..BENGALI ANJI +0985..098C ; Lo # [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L +098F..0990 ; Lo # [2] BENGALI LETTER 
E..BENGALI LETTER AI +0993..09A8 ; Lo # [22] BENGALI LETTER O..BENGALI LETTER NA +09AA..09B0 ; Lo # [7] BENGALI LETTER PA..BENGALI LETTER RA +09B2 ; Lo # BENGALI LETTER LA +09B6..09B9 ; Lo # [4] BENGALI LETTER SHA..BENGALI LETTER HA +09BD ; Lo # BENGALI SIGN AVAGRAHA +09CE ; Lo # BENGALI LETTER KHANDA TA +09DC..09DD ; Lo # [2] BENGALI LETTER RRA..BENGALI LETTER RHA +09DF..09E1 ; Lo # [3] BENGALI LETTER YYA..BENGALI LETTER VOCALIC LL +09F0..09F1 ; Lo # [2] BENGALI LETTER RA WITH MIDDLE DIAGONAL..BENGALI LETTER RA WITH LOWER DIAGONAL +09FC ; Lo # BENGALI LETTER VEDIC ANUSVARA +0A05..0A0A ; Lo # [6] GURMUKHI LETTER A..GURMUKHI LETTER UU +0A0F..0A10 ; Lo # [2] GURMUKHI LETTER EE..GURMUKHI LETTER AI +0A13..0A28 ; Lo # [22] GURMUKHI LETTER OO..GURMUKHI LETTER NA +0A2A..0A30 ; Lo # [7] GURMUKHI LETTER PA..GURMUKHI LETTER RA +0A32..0A33 ; Lo # [2] GURMUKHI LETTER LA..GURMUKHI LETTER LLA +0A35..0A36 ; Lo # [2] GURMUKHI LETTER VA..GURMUKHI LETTER SHA +0A38..0A39 ; Lo # [2] GURMUKHI LETTER SA..GURMUKHI LETTER HA +0A59..0A5C ; Lo # [4] GURMUKHI LETTER KHHA..GURMUKHI LETTER RRA +0A5E ; Lo # GURMUKHI LETTER FA +0A72..0A74 ; Lo # [3] GURMUKHI IRI..GURMUKHI EK ONKAR +0A85..0A8D ; Lo # [9] GUJARATI LETTER A..GUJARATI VOWEL CANDRA E +0A8F..0A91 ; Lo # [3] GUJARATI LETTER E..GUJARATI VOWEL CANDRA O +0A93..0AA8 ; Lo # [22] GUJARATI LETTER O..GUJARATI LETTER NA +0AAA..0AB0 ; Lo # [7] GUJARATI LETTER PA..GUJARATI LETTER RA +0AB2..0AB3 ; Lo # [2] GUJARATI LETTER LA..GUJARATI LETTER LLA +0AB5..0AB9 ; Lo # [5] GUJARATI LETTER VA..GUJARATI LETTER HA +0ABD ; Lo # GUJARATI SIGN AVAGRAHA +0AD0 ; Lo # GUJARATI OM +0AE0..0AE1 ; Lo # [2] GUJARATI LETTER VOCALIC RR..GUJARATI LETTER VOCALIC LL +0AF9 ; Lo # GUJARATI LETTER ZHA +0B05..0B0C ; Lo # [8] ORIYA LETTER A..ORIYA LETTER VOCALIC L +0B0F..0B10 ; Lo # [2] ORIYA LETTER E..ORIYA LETTER AI +0B13..0B28 ; Lo # [22] ORIYA LETTER O..ORIYA LETTER NA +0B2A..0B30 ; Lo # [7] ORIYA LETTER PA..ORIYA LETTER RA +0B32..0B33 ; Lo # [2] ORIYA LETTER LA..ORIYA 
LETTER LLA +0B35..0B39 ; Lo # [5] ORIYA LETTER VA..ORIYA LETTER HA +0B3D ; Lo # ORIYA SIGN AVAGRAHA +0B5C..0B5D ; Lo # [2] ORIYA LETTER RRA..ORIYA LETTER RHA +0B5F..0B61 ; Lo # [3] ORIYA LETTER YYA..ORIYA LETTER VOCALIC LL +0B71 ; Lo # ORIYA LETTER WA +0B83 ; Lo # TAMIL SIGN VISARGA +0B85..0B8A ; Lo # [6] TAMIL LETTER A..TAMIL LETTER UU +0B8E..0B90 ; Lo # [3] TAMIL LETTER E..TAMIL LETTER AI +0B92..0B95 ; Lo # [4] TAMIL LETTER O..TAMIL LETTER KA +0B99..0B9A ; Lo # [2] TAMIL LETTER NGA..TAMIL LETTER CA +0B9C ; Lo # TAMIL LETTER JA +0B9E..0B9F ; Lo # [2] TAMIL LETTER NYA..TAMIL LETTER TTA +0BA3..0BA4 ; Lo # [2] TAMIL LETTER NNA..TAMIL LETTER TA +0BA8..0BAA ; Lo # [3] TAMIL LETTER NA..TAMIL LETTER PA +0BAE..0BB9 ; Lo # [12] TAMIL LETTER MA..TAMIL LETTER HA +0BD0 ; Lo # TAMIL OM +0C05..0C0C ; Lo # [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L +0C0E..0C10 ; Lo # [3] TELUGU LETTER E..TELUGU LETTER AI +0C12..0C28 ; Lo # [23] TELUGU LETTER O..TELUGU LETTER NA +0C2A..0C39 ; Lo # [16] TELUGU LETTER PA..TELUGU LETTER HA +0C3D ; Lo # TELUGU SIGN AVAGRAHA +0C58..0C5A ; Lo # [3] TELUGU LETTER TSA..TELUGU LETTER RRRA +0C60..0C61 ; Lo # [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL +0C80 ; Lo # KANNADA SIGN SPACING CANDRABINDU +0C85..0C8C ; Lo # [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L +0C8E..0C90 ; Lo # [3] KANNADA LETTER E..KANNADA LETTER AI +0C92..0CA8 ; Lo # [23] KANNADA LETTER O..KANNADA LETTER NA +0CAA..0CB3 ; Lo # [10] KANNADA LETTER PA..KANNADA LETTER LLA +0CB5..0CB9 ; Lo # [5] KANNADA LETTER VA..KANNADA LETTER HA +0CBD ; Lo # KANNADA SIGN AVAGRAHA +0CDE ; Lo # KANNADA LETTER FA +0CE0..0CE1 ; Lo # [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL +0CF1..0CF2 ; Lo # [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA +0D05..0D0C ; Lo # [8] MALAYALAM LETTER A..MALAYALAM LETTER VOCALIC L +0D0E..0D10 ; Lo # [3] MALAYALAM LETTER E..MALAYALAM LETTER AI +0D12..0D3A ; Lo # [41] MALAYALAM LETTER O..MALAYALAM LETTER TTTA +0D3D ; Lo # MALAYALAM SIGN AVAGRAHA 
+0D4E ; Lo # MALAYALAM LETTER DOT REPH +0D54..0D56 ; Lo # [3] MALAYALAM LETTER CHILLU M..MALAYALAM LETTER CHILLU LLL +0D5F..0D61 ; Lo # [3] MALAYALAM LETTER ARCHAIC II..MALAYALAM LETTER VOCALIC LL +0D7A..0D7F ; Lo # [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K +0D85..0D96 ; Lo # [18] SINHALA LETTER AYANNA..SINHALA LETTER AUYANNA +0D9A..0DB1 ; Lo # [24] SINHALA LETTER ALPAPRAANA KAYANNA..SINHALA LETTER DANTAJA NAYANNA +0DB3..0DBB ; Lo # [9] SINHALA LETTER SANYAKA DAYANNA..SINHALA LETTER RAYANNA +0DBD ; Lo # SINHALA LETTER DANTAJA LAYANNA +0DC0..0DC6 ; Lo # [7] SINHALA LETTER VAYANNA..SINHALA LETTER FAYANNA +0E01..0E30 ; Lo # [48] THAI CHARACTER KO KAI..THAI CHARACTER SARA A +0E32..0E33 ; Lo # [2] THAI CHARACTER SARA AA..THAI CHARACTER SARA AM +0E40..0E45 ; Lo # [6] THAI CHARACTER SARA E..THAI CHARACTER LAKKHANGYAO +0E81..0E82 ; Lo # [2] LAO LETTER KO..LAO LETTER KHO SUNG +0E84 ; Lo # LAO LETTER KHO TAM +0E86..0E8A ; Lo # [5] LAO LETTER PALI GHA..LAO LETTER SO TAM +0E8C..0EA3 ; Lo # [24] LAO LETTER PALI JHA..LAO LETTER LO LING +0EA5 ; Lo # LAO LETTER LO LOOT +0EA7..0EB0 ; Lo # [10] LAO LETTER WO..LAO VOWEL SIGN A +0EB2..0EB3 ; Lo # [2] LAO VOWEL SIGN AA..LAO VOWEL SIGN AM +0EBD ; Lo # LAO SEMIVOWEL SIGN NYO +0EC0..0EC4 ; Lo # [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI +0EDC..0EDF ; Lo # [4] LAO HO NO..LAO LETTER KHMU NYO +0F00 ; Lo # TIBETAN SYLLABLE OM +0F40..0F47 ; Lo # [8] TIBETAN LETTER KA..TIBETAN LETTER JA +0F49..0F6C ; Lo # [36] TIBETAN LETTER NYA..TIBETAN LETTER RRA +0F88..0F8C ; Lo # [5] TIBETAN SIGN LCE TSA CAN..TIBETAN SIGN INVERTED MCHU CAN +1000..102A ; Lo # [43] MYANMAR LETTER KA..MYANMAR LETTER AU +103F ; Lo # MYANMAR LETTER GREAT SA +1050..1055 ; Lo # [6] MYANMAR LETTER SHA..MYANMAR LETTER VOCALIC LL +105A..105D ; Lo # [4] MYANMAR LETTER MON NGA..MYANMAR LETTER MON BBE +1061 ; Lo # MYANMAR LETTER SGAW KAREN SHA +1065..1066 ; Lo # [2] MYANMAR LETTER WESTERN PWO KAREN THA..MYANMAR LETTER WESTERN PWO KAREN PWA +106E..1070 ; Lo # [3] MYANMAR 
LETTER EASTERN PWO KAREN NNA..MYANMAR LETTER EASTERN PWO KAREN GHWA +1075..1081 ; Lo # [13] MYANMAR LETTER SHAN KA..MYANMAR LETTER SHAN HA +108E ; Lo # MYANMAR LETTER RUMAI PALAUNG FA +1100..1248 ; Lo # [329] HANGUL CHOSEONG KIYEOK..ETHIOPIC SYLLABLE QWA +124A..124D ; Lo # [4] ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE +1250..1256 ; Lo # [7] ETHIOPIC SYLLABLE QHA..ETHIOPIC SYLLABLE QHO +1258 ; Lo # ETHIOPIC SYLLABLE QHWA +125A..125D ; Lo # [4] ETHIOPIC SYLLABLE QHWI..ETHIOPIC SYLLABLE QHWE +1260..1288 ; Lo # [41] ETHIOPIC SYLLABLE BA..ETHIOPIC SYLLABLE XWA +128A..128D ; Lo # [4] ETHIOPIC SYLLABLE XWI..ETHIOPIC SYLLABLE XWE +1290..12B0 ; Lo # [33] ETHIOPIC SYLLABLE NA..ETHIOPIC SYLLABLE KWA +12B2..12B5 ; Lo # [4] ETHIOPIC SYLLABLE KWI..ETHIOPIC SYLLABLE KWE +12B8..12BE ; Lo # [7] ETHIOPIC SYLLABLE KXA..ETHIOPIC SYLLABLE KXO +12C0 ; Lo # ETHIOPIC SYLLABLE KXWA +12C2..12C5 ; Lo # [4] ETHIOPIC SYLLABLE KXWI..ETHIOPIC SYLLABLE KXWE +12C8..12D6 ; Lo # [15] ETHIOPIC SYLLABLE WA..ETHIOPIC SYLLABLE PHARYNGEAL O +12D8..1310 ; Lo # [57] ETHIOPIC SYLLABLE ZA..ETHIOPIC SYLLABLE GWA +1312..1315 ; Lo # [4] ETHIOPIC SYLLABLE GWI..ETHIOPIC SYLLABLE GWE +1318..135A ; Lo # [67] ETHIOPIC SYLLABLE GGA..ETHIOPIC SYLLABLE FYA +1380..138F ; Lo # [16] ETHIOPIC SYLLABLE SEBATBEIT MWA..ETHIOPIC SYLLABLE PWE +1401..166C ; Lo # [620] CANADIAN SYLLABICS E..CANADIAN SYLLABICS CARRIER TTSA +166F..167F ; Lo # [17] CANADIAN SYLLABICS QAI..CANADIAN SYLLABICS BLACKFOOT W +1681..169A ; Lo # [26] OGHAM LETTER BEITH..OGHAM LETTER PEITH +16A0..16EA ; Lo # [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X +16F1..16F8 ; Lo # [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC +1700..170C ; Lo # [13] TAGALOG LETTER A..TAGALOG LETTER YA +170E..1711 ; Lo # [4] TAGALOG LETTER LA..TAGALOG LETTER HA +1720..1731 ; Lo # [18] HANUNOO LETTER A..HANUNOO LETTER HA +1740..1751 ; Lo # [18] BUHID LETTER A..BUHID LETTER HA +1760..176C ; Lo # [13] TAGBANWA LETTER A..TAGBANWA LETTER YA +176E..1770 ; Lo # [3] TAGBANWA 
LETTER LA..TAGBANWA LETTER SA +1780..17B3 ; Lo # [52] KHMER LETTER KA..KHMER INDEPENDENT VOWEL QAU +17DC ; Lo # KHMER SIGN AVAKRAHASANYA +1820..1842 ; Lo # [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI +1844..1878 ; Lo # [53] MONGOLIAN LETTER TODO E..MONGOLIAN LETTER CHA WITH TWO DOTS +1880..1884 ; Lo # [5] MONGOLIAN LETTER ALI GALI ANUSVARA ONE..MONGOLIAN LETTER ALI GALI INVERTED UBADAMA +1887..18A8 ; Lo # [34] MONGOLIAN LETTER ALI GALI A..MONGOLIAN LETTER MANCHU ALI GALI BHA +18AA ; Lo # MONGOLIAN LETTER MANCHU ALI GALI LHA +18B0..18F5 ; Lo # [70] CANADIAN SYLLABICS OY..CANADIAN SYLLABICS CARRIER DENTAL S +1900..191E ; Lo # [31] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER TRA +1950..196D ; Lo # [30] TAI LE LETTER KA..TAI LE LETTER AI +1970..1974 ; Lo # [5] TAI LE LETTER TONE-2..TAI LE LETTER TONE-6 +1980..19AB ; Lo # [44] NEW TAI LUE LETTER HIGH QA..NEW TAI LUE LETTER LOW SUA +19B0..19C9 ; Lo # [26] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE TONE MARK-2 +1A00..1A16 ; Lo # [23] BUGINESE LETTER KA..BUGINESE LETTER HA +1A20..1A54 ; Lo # [53] TAI THAM LETTER HIGH KA..TAI THAM LETTER GREAT SA +1B05..1B33 ; Lo # [47] BALINESE LETTER AKARA..BALINESE LETTER HA +1B45..1B4B ; Lo # [7] BALINESE LETTER KAF SASAK..BALINESE LETTER ASYURA SASAK +1B83..1BA0 ; Lo # [30] SUNDANESE LETTER A..SUNDANESE LETTER HA +1BAE..1BAF ; Lo # [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA +1BBA..1BE5 ; Lo # [44] SUNDANESE AVAGRAHA..BATAK LETTER U +1C00..1C23 ; Lo # [36] LEPCHA LETTER KA..LEPCHA LETTER A +1C4D..1C4F ; Lo # [3] LEPCHA LETTER TTA..LEPCHA LETTER DDA +1C5A..1C77 ; Lo # [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH +1CE9..1CEC ; Lo # [4] VEDIC SIGN ANUSVARA ANTARGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL +1CEE..1CF3 ; Lo # [6] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ROTATED ARDHAVISARGA +1CF5..1CF6 ; Lo # [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA +1CFA ; Lo # VEDIC SIGN DOUBLE ANUSVARA ANTARGOMUKHA +2135..2138 ; Lo # [4] ALEF SYMBOL..DALET SYMBOL 
+2D30..2D67 ; Lo # [56] TIFINAGH LETTER YA..TIFINAGH LETTER YO +2D80..2D96 ; Lo # [23] ETHIOPIC SYLLABLE LOA..ETHIOPIC SYLLABLE GGWE +2DA0..2DA6 ; Lo # [7] ETHIOPIC SYLLABLE SSA..ETHIOPIC SYLLABLE SSO +2DA8..2DAE ; Lo # [7] ETHIOPIC SYLLABLE CCA..ETHIOPIC SYLLABLE CCO +2DB0..2DB6 ; Lo # [7] ETHIOPIC SYLLABLE ZZA..ETHIOPIC SYLLABLE ZZO +2DB8..2DBE ; Lo # [7] ETHIOPIC SYLLABLE CCHA..ETHIOPIC SYLLABLE CCHO +2DC0..2DC6 ; Lo # [7] ETHIOPIC SYLLABLE QYA..ETHIOPIC SYLLABLE QYO +2DC8..2DCE ; Lo # [7] ETHIOPIC SYLLABLE KYA..ETHIOPIC SYLLABLE KYO +2DD0..2DD6 ; Lo # [7] ETHIOPIC SYLLABLE XYA..ETHIOPIC SYLLABLE XYO +2DD8..2DDE ; Lo # [7] ETHIOPIC SYLLABLE GYA..ETHIOPIC SYLLABLE GYO +3006 ; Lo # IDEOGRAPHIC CLOSING MARK +303C ; Lo # MASU MARK +3041..3096 ; Lo # [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE +309F ; Lo # HIRAGANA DIGRAPH YORI +30A1..30FA ; Lo # [90] KATAKANA LETTER SMALL A..KATAKANA LETTER VO +30FF ; Lo # KATAKANA DIGRAPH KOTO +3105..312F ; Lo # [43] BOPOMOFO LETTER B..BOPOMOFO LETTER NN +3131..318E ; Lo # [94] HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE +31A0..31BA ; Lo # [27] BOPOMOFO LETTER BU..BOPOMOFO LETTER ZY +31F0..31FF ; Lo # [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO +3400..4DB5 ; Lo # [6582] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DB5 +4E00..9FEF ; Lo # [20976] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FEF +A000..A014 ; Lo # [21] YI SYLLABLE IT..YI SYLLABLE E +A016..A48C ; Lo # [1143] YI SYLLABLE BIT..YI SYLLABLE YYR +A4D0..A4F7 ; Lo # [40] LISU LETTER BA..LISU LETTER OE +A500..A60B ; Lo # [268] VAI SYLLABLE EE..VAI SYLLABLE NG +A610..A61F ; Lo # [16] VAI SYLLABLE NDOLE FA..VAI SYMBOL JONG +A62A..A62B ; Lo # [2] VAI SYLLABLE NDOLE MA..VAI SYLLABLE NDOLE DO +A66E ; Lo # CYRILLIC LETTER MULTIOCULAR O +A6A0..A6E5 ; Lo # [70] BAMUM LETTER A..BAMUM LETTER KI +A78F ; Lo # LATIN LETTER SINOLOGICAL DOT +A7F7 ; Lo # LATIN EPIGRAPHIC LETTER SIDEWAYS I +A7FB..A801 ; Lo # [7] LATIN EPIGRAPHIC LETTER REVERSED F..SYLOTI NAGRI 
LETTER I +A803..A805 ; Lo # [3] SYLOTI NAGRI LETTER U..SYLOTI NAGRI LETTER O +A807..A80A ; Lo # [4] SYLOTI NAGRI LETTER KO..SYLOTI NAGRI LETTER GHO +A80C..A822 ; Lo # [23] SYLOTI NAGRI LETTER CO..SYLOTI NAGRI LETTER HO +A840..A873 ; Lo # [52] PHAGS-PA LETTER KA..PHAGS-PA LETTER CANDRABINDU +A882..A8B3 ; Lo # [50] SAURASHTRA LETTER A..SAURASHTRA LETTER LLA +A8F2..A8F7 ; Lo # [6] DEVANAGARI SIGN SPACING CANDRABINDU..DEVANAGARI SIGN CANDRABINDU AVAGRAHA +A8FB ; Lo # DEVANAGARI HEADSTROKE +A8FD..A8FE ; Lo # [2] DEVANAGARI JAIN OM..DEVANAGARI LETTER AY +A90A..A925 ; Lo # [28] KAYAH LI LETTER KA..KAYAH LI LETTER OO +A930..A946 ; Lo # [23] REJANG LETTER KA..REJANG LETTER A +A960..A97C ; Lo # [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH +A984..A9B2 ; Lo # [47] JAVANESE LETTER A..JAVANESE LETTER HA +A9E0..A9E4 ; Lo # [5] MYANMAR LETTER SHAN GHA..MYANMAR LETTER SHAN BHA +A9E7..A9EF ; Lo # [9] MYANMAR LETTER TAI LAING NYA..MYANMAR LETTER TAI LAING NNA +A9FA..A9FE ; Lo # [5] MYANMAR LETTER TAI LAING LLA..MYANMAR LETTER TAI LAING BHA +AA00..AA28 ; Lo # [41] CHAM LETTER A..CHAM LETTER HA +AA40..AA42 ; Lo # [3] CHAM LETTER FINAL K..CHAM LETTER FINAL NG +AA44..AA4B ; Lo # [8] CHAM LETTER FINAL CH..CHAM LETTER FINAL SS +AA60..AA6F ; Lo # [16] MYANMAR LETTER KHAMTI GA..MYANMAR LETTER KHAMTI FA +AA71..AA76 ; Lo # [6] MYANMAR LETTER KHAMTI XA..MYANMAR LOGOGRAM KHAMTI HM +AA7A ; Lo # MYANMAR LETTER AITON RA +AA7E..AAAF ; Lo # [50] MYANMAR LETTER SHWE PALAUNG CHA..TAI VIET LETTER HIGH O +AAB1 ; Lo # TAI VIET VOWEL AA +AAB5..AAB6 ; Lo # [2] TAI VIET VOWEL E..TAI VIET VOWEL O +AAB9..AABD ; Lo # [5] TAI VIET VOWEL UEA..TAI VIET VOWEL AN +AAC0 ; Lo # TAI VIET TONE MAI NUENG +AAC2 ; Lo # TAI VIET TONE MAI SONG +AADB..AADC ; Lo # [2] TAI VIET SYMBOL KON..TAI VIET SYMBOL NUENG +AAE0..AAEA ; Lo # [11] MEETEI MAYEK LETTER E..MEETEI MAYEK LETTER SSA +AAF2 ; Lo # MEETEI MAYEK ANJI +AB01..AB06 ; Lo # [6] ETHIOPIC SYLLABLE TTHU..ETHIOPIC SYLLABLE TTHO +AB09..AB0E ; Lo # [6] 
ETHIOPIC SYLLABLE DDHU..ETHIOPIC SYLLABLE DDHO +AB11..AB16 ; Lo # [6] ETHIOPIC SYLLABLE DZU..ETHIOPIC SYLLABLE DZO +AB20..AB26 ; Lo # [7] ETHIOPIC SYLLABLE CCHHA..ETHIOPIC SYLLABLE CCHHO +AB28..AB2E ; Lo # [7] ETHIOPIC SYLLABLE BBA..ETHIOPIC SYLLABLE BBO +ABC0..ABE2 ; Lo # [35] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER I LONSUM +AC00..D7A3 ; Lo # [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH +D7B0..D7C6 ; Lo # [23] HANGUL JUNGSEONG O-YEO..HANGUL JUNGSEONG ARAEA-E +D7CB..D7FB ; Lo # [49] HANGUL JONGSEONG NIEUN-RIEUL..HANGUL JONGSEONG PHIEUPH-THIEUTH +F900..FA6D ; Lo # [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D +FA70..FAD9 ; Lo # [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 +FB1D ; Lo # HEBREW LETTER YOD WITH HIRIQ +FB1F..FB28 ; Lo # [10] HEBREW LIGATURE YIDDISH YOD YOD PATAH..HEBREW LETTER WIDE TAV +FB2A..FB36 ; Lo # [13] HEBREW LETTER SHIN WITH SHIN DOT..HEBREW LETTER ZAYIN WITH DAGESH +FB38..FB3C ; Lo # [5] HEBREW LETTER TET WITH DAGESH..HEBREW LETTER LAMED WITH DAGESH +FB3E ; Lo # HEBREW LETTER MEM WITH DAGESH +FB40..FB41 ; Lo # [2] HEBREW LETTER NUN WITH DAGESH..HEBREW LETTER SAMEKH WITH DAGESH +FB43..FB44 ; Lo # [2] HEBREW LETTER FINAL PE WITH DAGESH..HEBREW LETTER PE WITH DAGESH +FB46..FBB1 ; Lo # [108] HEBREW LETTER TSADI WITH DAGESH..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM +FBD3..FD3D ; Lo # [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM +FD50..FD8F ; Lo # [64] ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM..ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM +FD92..FDC7 ; Lo # [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM +FDF0..FDFB ; Lo # [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU +FE70..FE74 ; Lo # [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM +FE76..FEFC ; Lo # [135] ARABIC FATHA ISOLATED 
FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM +FF66..FF6F ; Lo # [10] HALFWIDTH KATAKANA LETTER WO..HALFWIDTH KATAKANA LETTER SMALL TU +FF71..FF9D ; Lo # [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAKANA LETTER N +FFA0..FFBE ; Lo # [31] HALFWIDTH HANGUL FILLER..HALFWIDTH HANGUL LETTER HIEUH +FFC2..FFC7 ; Lo # [6] HALFWIDTH HANGUL LETTER A..HALFWIDTH HANGUL LETTER E +FFCA..FFCF ; Lo # [6] HALFWIDTH HANGUL LETTER YEO..HALFWIDTH HANGUL LETTER OE +FFD2..FFD7 ; Lo # [6] HALFWIDTH HANGUL LETTER YO..HALFWIDTH HANGUL LETTER YU +FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I +10000..1000B ; Lo # [12] LINEAR B SYLLABLE B008 A..LINEAR B SYLLABLE B046 JE +1000D..10026 ; Lo # [26] LINEAR B SYLLABLE B036 JO..LINEAR B SYLLABLE B032 QO +10028..1003A ; Lo # [19] LINEAR B SYLLABLE B060 RA..LINEAR B SYLLABLE B042 WO +1003C..1003D ; Lo # [2] LINEAR B SYLLABLE B017 ZA..LINEAR B SYLLABLE B074 ZE +1003F..1004D ; Lo # [15] LINEAR B SYLLABLE B020 ZO..LINEAR B SYLLABLE B091 TWO +10050..1005D ; Lo # [14] LINEAR B SYMBOL B018..LINEAR B SYMBOL B089 +10080..100FA ; Lo # [123] LINEAR B IDEOGRAM B100 MAN..LINEAR B IDEOGRAM VESSEL B305 +10280..1029C ; Lo # [29] LYCIAN LETTER A..LYCIAN LETTER X +102A0..102D0 ; Lo # [49] CARIAN LETTER A..CARIAN LETTER UUU3 +10300..1031F ; Lo # [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS +1032D..10340 ; Lo # [20] OLD ITALIC LETTER YE..GOTHIC LETTER PAIRTHRA +10342..10349 ; Lo # [8] GOTHIC LETTER RAIDA..GOTHIC LETTER OTHAL +10350..10375 ; Lo # [38] OLD PERMIC LETTER AN..OLD PERMIC LETTER IA +10380..1039D ; Lo # [30] UGARITIC LETTER ALPA..UGARITIC LETTER SSU +103A0..103C3 ; Lo # [36] OLD PERSIAN SIGN A..OLD PERSIAN SIGN HA +103C8..103CF ; Lo # [8] OLD PERSIAN SIGN AURAMAZDAA..OLD PERSIAN SIGN BUUMISH +10450..1049D ; Lo # [78] SHAVIAN LETTER PEEP..OSMANYA LETTER OO +10500..10527 ; Lo # [40] ELBASAN LETTER A..ELBASAN LETTER KHE +10530..10563 ; Lo # [52] CAUCASIAN ALBANIAN LETTER ALT..CAUCASIAN ALBANIAN LETTER KIW +10600..10736 ; Lo # 
[311] LINEAR A SIGN AB001..LINEAR A SIGN A664 +10740..10755 ; Lo # [22] LINEAR A SIGN A701 A..LINEAR A SIGN A732 JE +10760..10767 ; Lo # [8] LINEAR A SIGN A800..LINEAR A SIGN A807 +10800..10805 ; Lo # [6] CYPRIOT SYLLABLE A..CYPRIOT SYLLABLE JA +10808 ; Lo # CYPRIOT SYLLABLE JO +1080A..10835 ; Lo # [44] CYPRIOT SYLLABLE KA..CYPRIOT SYLLABLE WO +10837..10838 ; Lo # [2] CYPRIOT SYLLABLE XA..CYPRIOT SYLLABLE XE +1083C ; Lo # CYPRIOT SYLLABLE ZA +1083F..10855 ; Lo # [23] CYPRIOT SYLLABLE ZO..IMPERIAL ARAMAIC LETTER TAW +10860..10876 ; Lo # [23] PALMYRENE LETTER ALEPH..PALMYRENE LETTER TAW +10880..1089E ; Lo # [31] NABATAEAN LETTER FINAL ALEPH..NABATAEAN LETTER TAW +108E0..108F2 ; Lo # [19] HATRAN LETTER ALEPH..HATRAN LETTER QOPH +108F4..108F5 ; Lo # [2] HATRAN LETTER SHIN..HATRAN LETTER TAW +10900..10915 ; Lo # [22] PHOENICIAN LETTER ALF..PHOENICIAN LETTER TAU +10920..10939 ; Lo # [26] LYDIAN LETTER A..LYDIAN LETTER C +10980..109B7 ; Lo # [56] MEROITIC HIEROGLYPHIC LETTER A..MEROITIC CURSIVE LETTER DA +109BE..109BF ; Lo # [2] MEROITIC CURSIVE LOGOGRAM RMT..MEROITIC CURSIVE LOGOGRAM IMN +10A00 ; Lo # KHAROSHTHI LETTER A +10A10..10A13 ; Lo # [4] KHAROSHTHI LETTER KA..KHAROSHTHI LETTER GHA +10A15..10A17 ; Lo # [3] KHAROSHTHI LETTER CA..KHAROSHTHI LETTER JA +10A19..10A35 ; Lo # [29] KHAROSHTHI LETTER NYA..KHAROSHTHI LETTER VHA +10A60..10A7C ; Lo # [29] OLD SOUTH ARABIAN LETTER HE..OLD SOUTH ARABIAN LETTER THETH +10A80..10A9C ; Lo # [29] OLD NORTH ARABIAN LETTER HEH..OLD NORTH ARABIAN LETTER ZAH +10AC0..10AC7 ; Lo # [8] MANICHAEAN LETTER ALEPH..MANICHAEAN LETTER WAW +10AC9..10AE4 ; Lo # [28] MANICHAEAN LETTER ZAYIN..MANICHAEAN LETTER TAW +10B00..10B35 ; Lo # [54] AVESTAN LETTER A..AVESTAN LETTER HE +10B40..10B55 ; Lo # [22] INSCRIPTIONAL PARTHIAN LETTER ALEPH..INSCRIPTIONAL PARTHIAN LETTER TAW +10B60..10B72 ; Lo # [19] INSCRIPTIONAL PAHLAVI LETTER ALEPH..INSCRIPTIONAL PAHLAVI LETTER TAW +10B80..10B91 ; Lo # [18] PSALTER PAHLAVI LETTER ALEPH..PSALTER PAHLAVI LETTER TAW 
+10C00..10C48 ; Lo # [73] OLD TURKIC LETTER ORKHON A..OLD TURKIC LETTER ORKHON BASH +10D00..10D23 ; Lo # [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA +10F00..10F1C ; Lo # [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL +10F27 ; Lo # OLD SOGDIAN LIGATURE AYIN-DALETH +10F30..10F45 ; Lo # [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN +10FE0..10FF6 ; Lo # [23] ELYMAIC LETTER ALEPH..ELYMAIC LIGATURE ZAYIN-YODH +11003..11037 ; Lo # [53] BRAHMI SIGN JIHVAMULIYA..BRAHMI LETTER OLD TAMIL NNNA +11083..110AF ; Lo # [45] KAITHI LETTER A..KAITHI LETTER HA +110D0..110E8 ; Lo # [25] SORA SOMPENG LETTER SAH..SORA SOMPENG LETTER MAE +11103..11126 ; Lo # [36] CHAKMA LETTER AA..CHAKMA LETTER HAA +11144 ; Lo # CHAKMA LETTER LHAA +11150..11172 ; Lo # [35] MAHAJANI LETTER A..MAHAJANI LETTER RRA +11176 ; Lo # MAHAJANI LIGATURE SHRI +11183..111B2 ; Lo # [48] SHARADA LETTER A..SHARADA LETTER HA +111C1..111C4 ; Lo # [4] SHARADA SIGN AVAGRAHA..SHARADA OM +111DA ; Lo # SHARADA EKAM +111DC ; Lo # SHARADA HEADSTROKE +11200..11211 ; Lo # [18] KHOJKI LETTER A..KHOJKI LETTER JJA +11213..1122B ; Lo # [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA +11280..11286 ; Lo # [7] MULTANI LETTER A..MULTANI LETTER GA +11288 ; Lo # MULTANI LETTER GHA +1128A..1128D ; Lo # [4] MULTANI LETTER CA..MULTANI LETTER JJA +1128F..1129D ; Lo # [15] MULTANI LETTER NYA..MULTANI LETTER BA +1129F..112A8 ; Lo # [10] MULTANI LETTER BHA..MULTANI LETTER RHA +112B0..112DE ; Lo # [47] KHUDAWADI LETTER A..KHUDAWADI LETTER HA +11305..1130C ; Lo # [8] GRANTHA LETTER A..GRANTHA LETTER VOCALIC L +1130F..11310 ; Lo # [2] GRANTHA LETTER EE..GRANTHA LETTER AI +11313..11328 ; Lo # [22] GRANTHA LETTER OO..GRANTHA LETTER NA +1132A..11330 ; Lo # [7] GRANTHA LETTER PA..GRANTHA LETTER RA +11332..11333 ; Lo # [2] GRANTHA LETTER LA..GRANTHA LETTER LLA +11335..11339 ; Lo # [5] GRANTHA LETTER VA..GRANTHA LETTER HA +1133D ; Lo # GRANTHA SIGN AVAGRAHA +11350 ; Lo # GRANTHA OM +1135D..11361 ; Lo # [5] 
GRANTHA SIGN PLUTA..GRANTHA LETTER VOCALIC LL +11400..11434 ; Lo # [53] NEWA LETTER A..NEWA LETTER HA +11447..1144A ; Lo # [4] NEWA SIGN AVAGRAHA..NEWA SIDDHI +1145F ; Lo # NEWA LETTER VEDIC ANUSVARA +11480..114AF ; Lo # [48] TIRHUTA ANJI..TIRHUTA LETTER HA +114C4..114C5 ; Lo # [2] TIRHUTA SIGN AVAGRAHA..TIRHUTA GVANG +114C7 ; Lo # TIRHUTA OM +11580..115AE ; Lo # [47] SIDDHAM LETTER A..SIDDHAM LETTER HA +115D8..115DB ; Lo # [4] SIDDHAM LETTER THREE-CIRCLE ALTERNATE I..SIDDHAM LETTER ALTERNATE U +11600..1162F ; Lo # [48] MODI LETTER A..MODI LETTER LLA +11644 ; Lo # MODI SIGN HUVA +11680..116AA ; Lo # [43] TAKRI LETTER A..TAKRI LETTER RRA +116B8 ; Lo # TAKRI LETTER ARCHAIC KHA +11700..1171A ; Lo # [27] AHOM LETTER KA..AHOM LETTER ALTERNATE BA +11800..1182B ; Lo # [44] DOGRA LETTER A..DOGRA LETTER RRA +118FF ; Lo # WARANG CITI OM +119A0..119A7 ; Lo # [8] NANDINAGARI LETTER A..NANDINAGARI LETTER VOCALIC RR +119AA..119D0 ; Lo # [39] NANDINAGARI LETTER E..NANDINAGARI LETTER RRA +119E1 ; Lo # NANDINAGARI SIGN AVAGRAHA +119E3 ; Lo # NANDINAGARI HEADSTROKE +11A00 ; Lo # ZANABAZAR SQUARE LETTER A +11A0B..11A32 ; Lo # [40] ZANABAZAR SQUARE LETTER KA..ZANABAZAR SQUARE LETTER KSSA +11A3A ; Lo # ZANABAZAR SQUARE CLUSTER-INITIAL LETTER RA +11A50 ; Lo # SOYOMBO LETTER A +11A5C..11A89 ; Lo # [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA +11A9D ; Lo # SOYOMBO MARK PLUTA +11AC0..11AF8 ; Lo # [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL +11C00..11C08 ; Lo # [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L +11C0A..11C2E ; Lo # [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA +11C40 ; Lo # BHAIKSUKI SIGN AVAGRAHA +11C72..11C8F ; Lo # [30] MARCHEN LETTER KA..MARCHEN LETTER A +11D00..11D06 ; Lo # [7] MASARAM GONDI LETTER A..MASARAM GONDI LETTER E +11D08..11D09 ; Lo # [2] MASARAM GONDI LETTER AI..MASARAM GONDI LETTER O +11D0B..11D30 ; Lo # [38] MASARAM GONDI LETTER AU..MASARAM GONDI LETTER TRA +11D46 ; Lo # MASARAM GONDI REPHA +11D60..11D65 ; Lo # [6] GUNJALA GONDI 
LETTER A..GUNJALA GONDI LETTER UU +11D67..11D68 ; Lo # [2] GUNJALA GONDI LETTER EE..GUNJALA GONDI LETTER AI +11D6A..11D89 ; Lo # [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA +11D98 ; Lo # GUNJALA GONDI OM +11EE0..11EF2 ; Lo # [19] MAKASAR LETTER KA..MAKASAR ANGKA +12000..12399 ; Lo # [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U +12480..12543 ; Lo # [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU +13000..1342E ; Lo # [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032 +14400..14646 ; Lo # [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530 +16800..16A38 ; Lo # [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ +16A40..16A5E ; Lo # [31] MRO LETTER TA..MRO LETTER TEK +16AD0..16AED ; Lo # [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I +16B00..16B2F ; Lo # [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU +16B63..16B77 ; Lo # [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS +16B7D..16B8F ; Lo # [19] PAHAWH HMONG CLAN SIGN TSHEEJ..PAHAWH HMONG CLAN SIGN VWJ +16F00..16F4A ; Lo # [75] MIAO LETTER PA..MIAO LETTER RTE +16F50 ; Lo # MIAO LETTER NASALIZATION +17000..187F7 ; Lo # [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 +18800..18AF2 ; Lo # [755] TANGUT COMPONENT-001..TANGUT COMPONENT-755 +1B000..1B11E ; Lo # [287] KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER N-MU-MO-2 +1B150..1B152 ; Lo # [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO +1B164..1B167 ; Lo # [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N +1B170..1B2FB ; Lo # [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB +1BC00..1BC6A ; Lo # [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M +1BC70..1BC7C ; Lo # [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK +1BC80..1BC88 ; Lo # [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL +1BC90..1BC99 ; Lo # [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW +1E100..1E12C ; Lo # [45] NYIAKENG PUACHUE HMONG LETTER 
MA..NYIAKENG PUACHUE HMONG LETTER W +1E14E ; Lo # NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ +1E2C0..1E2EB ; Lo # [44] WANCHO LETTER AA..WANCHO LETTER YIH +1E800..1E8C4 ; Lo # [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON +1EE00..1EE03 ; Lo # [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL +1EE05..1EE1F ; Lo # [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF +1EE21..1EE22 ; Lo # [2] ARABIC MATHEMATICAL INITIAL BEH..ARABIC MATHEMATICAL INITIAL JEEM +1EE24 ; Lo # ARABIC MATHEMATICAL INITIAL HEH +1EE27 ; Lo # ARABIC MATHEMATICAL INITIAL HAH +1EE29..1EE32 ; Lo # [10] ARABIC MATHEMATICAL INITIAL YEH..ARABIC MATHEMATICAL INITIAL QAF +1EE34..1EE37 ; Lo # [4] ARABIC MATHEMATICAL INITIAL SHEEN..ARABIC MATHEMATICAL INITIAL KHAH +1EE39 ; Lo # ARABIC MATHEMATICAL INITIAL DAD +1EE3B ; Lo # ARABIC MATHEMATICAL INITIAL GHAIN +1EE42 ; Lo # ARABIC MATHEMATICAL TAILED JEEM +1EE47 ; Lo # ARABIC MATHEMATICAL TAILED HAH +1EE49 ; Lo # ARABIC MATHEMATICAL TAILED YEH +1EE4B ; Lo # ARABIC MATHEMATICAL TAILED LAM +1EE4D..1EE4F ; Lo # [3] ARABIC MATHEMATICAL TAILED NOON..ARABIC MATHEMATICAL TAILED AIN +1EE51..1EE52 ; Lo # [2] ARABIC MATHEMATICAL TAILED SAD..ARABIC MATHEMATICAL TAILED QAF +1EE54 ; Lo # ARABIC MATHEMATICAL TAILED SHEEN +1EE57 ; Lo # ARABIC MATHEMATICAL TAILED KHAH +1EE59 ; Lo # ARABIC MATHEMATICAL TAILED DAD +1EE5B ; Lo # ARABIC MATHEMATICAL TAILED GHAIN +1EE5D ; Lo # ARABIC MATHEMATICAL TAILED DOTLESS NOON +1EE5F ; Lo # ARABIC MATHEMATICAL TAILED DOTLESS QAF +1EE61..1EE62 ; Lo # [2] ARABIC MATHEMATICAL STRETCHED BEH..ARABIC MATHEMATICAL STRETCHED JEEM +1EE64 ; Lo # ARABIC MATHEMATICAL STRETCHED HEH +1EE67..1EE6A ; Lo # [4] ARABIC MATHEMATICAL STRETCHED HAH..ARABIC MATHEMATICAL STRETCHED KAF +1EE6C..1EE72 ; Lo # [7] ARABIC MATHEMATICAL STRETCHED MEEM..ARABIC MATHEMATICAL STRETCHED QAF +1EE74..1EE77 ; Lo # [4] ARABIC MATHEMATICAL STRETCHED SHEEN..ARABIC MATHEMATICAL STRETCHED KHAH +1EE79..1EE7C ; Lo # [4] ARABIC MATHEMATICAL STRETCHED 
DAD..ARABIC MATHEMATICAL STRETCHED DOTLESS BEH +1EE7E ; Lo # ARABIC MATHEMATICAL STRETCHED DOTLESS FEH +1EE80..1EE89 ; Lo # [10] ARABIC MATHEMATICAL LOOPED ALEF..ARABIC MATHEMATICAL LOOPED YEH +1EE8B..1EE9B ; Lo # [17] ARABIC MATHEMATICAL LOOPED LAM..ARABIC MATHEMATICAL LOOPED GHAIN +1EEA1..1EEA3 ; Lo # [3] ARABIC MATHEMATICAL DOUBLE-STRUCK BEH..ARABIC MATHEMATICAL DOUBLE-STRUCK DAL +1EEA5..1EEA9 ; Lo # [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH +1EEAB..1EEBB ; Lo # [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN +20000..2A6D6 ; Lo # [42711] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6D6 +2A700..2B734 ; Lo # [4149] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B734 +2B740..2B81D ; Lo # [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D +2B820..2CEA1 ; Lo # [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1 +2CEB0..2EBE0 ; Lo # [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0 +2F800..2FA1D ; Lo # [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D + +# Total code points: 121414 + +# ================================================ + +# General_Category=Nonspacing_Mark + +0300..036F ; Mn # [112] COMBINING GRAVE ACCENT..COMBINING LATIN SMALL LETTER X +0483..0487 ; Mn # [5] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC POKRYTIE +0591..05BD ; Mn # [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG +05BF ; Mn # HEBREW POINT RAFE +05C1..05C2 ; Mn # [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT +05C4..05C5 ; Mn # [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT +05C7 ; Mn # HEBREW POINT QAMATS QATAN +0610..061A ; Mn # [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA +064B..065F ; Mn # [21] ARABIC FATHATAN..ARABIC WAVY HAMZA BELOW +0670 ; Mn # ARABIC LETTER SUPERSCRIPT ALEF +06D6..06DC ; Mn # [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN +06DF..06E4 ; Mn # [6] ARABIC 
SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA +06E7..06E8 ; Mn # [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON +06EA..06ED ; Mn # [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM +0711 ; Mn # SYRIAC LETTER SUPERSCRIPT ALAPH +0730..074A ; Mn # [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH +07A6..07B0 ; Mn # [11] THAANA ABAFILI..THAANA SUKUN +07EB..07F3 ; Mn # [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE +07FD ; Mn # NKO DANTAYALAN +0816..0819 ; Mn # [4] SAMARITAN MARK IN..SAMARITAN MARK DAGESH +081B..0823 ; Mn # [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A +0825..0827 ; Mn # [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U +0829..082D ; Mn # [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA +0859..085B ; Mn # [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK +08D3..08E1 ; Mn # [15] ARABIC SMALL LOW WAW..ARABIC SMALL HIGH SIGN SAFHA +08E3..0902 ; Mn # [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA +093A ; Mn # DEVANAGARI VOWEL SIGN OE +093C ; Mn # DEVANAGARI SIGN NUKTA +0941..0948 ; Mn # [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI +094D ; Mn # DEVANAGARI SIGN VIRAMA +0951..0957 ; Mn # [7] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN UUE +0962..0963 ; Mn # [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL +0981 ; Mn # BENGALI SIGN CANDRABINDU +09BC ; Mn # BENGALI SIGN NUKTA +09C1..09C4 ; Mn # [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR +09CD ; Mn # BENGALI SIGN VIRAMA +09E2..09E3 ; Mn # [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL +09FE ; Mn # BENGALI SANDHI MARK +0A01..0A02 ; Mn # [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI +0A3C ; Mn # GURMUKHI SIGN NUKTA +0A41..0A42 ; Mn # [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU +0A47..0A48 ; Mn # [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI +0A4B..0A4D ; Mn # [3] GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA +0A51 ; Mn # GURMUKHI SIGN UDAAT +0A70..0A71 ; Mn # [2] GURMUKHI 
TIPPI..GURMUKHI ADDAK +0A75 ; Mn # GURMUKHI SIGN YAKASH +0A81..0A82 ; Mn # [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA +0ABC ; Mn # GUJARATI SIGN NUKTA +0AC1..0AC5 ; Mn # [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E +0AC7..0AC8 ; Mn # [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI +0ACD ; Mn # GUJARATI SIGN VIRAMA +0AE2..0AE3 ; Mn # [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL +0AFA..0AFF ; Mn # [6] GUJARATI SIGN SUKUN..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE +0B01 ; Mn # ORIYA SIGN CANDRABINDU +0B3C ; Mn # ORIYA SIGN NUKTA +0B3F ; Mn # ORIYA VOWEL SIGN I +0B41..0B44 ; Mn # [4] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC RR +0B4D ; Mn # ORIYA SIGN VIRAMA +0B56 ; Mn # ORIYA AI LENGTH MARK +0B62..0B63 ; Mn # [2] ORIYA VOWEL SIGN VOCALIC L..ORIYA VOWEL SIGN VOCALIC LL +0B82 ; Mn # TAMIL SIGN ANUSVARA +0BC0 ; Mn # TAMIL VOWEL SIGN II +0BCD ; Mn # TAMIL SIGN VIRAMA +0C00 ; Mn # TELUGU SIGN COMBINING CANDRABINDU ABOVE +0C04 ; Mn # TELUGU SIGN COMBINING ANUSVARA ABOVE +0C3E..0C40 ; Mn # [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II +0C46..0C48 ; Mn # [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI +0C4A..0C4D ; Mn # [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA +0C55..0C56 ; Mn # [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK +0C62..0C63 ; Mn # [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL +0C81 ; Mn # KANNADA SIGN CANDRABINDU +0CBC ; Mn # KANNADA SIGN NUKTA +0CBF ; Mn # KANNADA VOWEL SIGN I +0CC6 ; Mn # KANNADA VOWEL SIGN E +0CCC..0CCD ; Mn # [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA +0CE2..0CE3 ; Mn # [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL +0D00..0D01 ; Mn # [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU +0D3B..0D3C ; Mn # [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA +0D41..0D44 ; Mn # [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR +0D4D ; Mn # MALAYALAM SIGN VIRAMA +0D62..0D63 ; Mn # [2] MALAYALAM VOWEL SIGN VOCALIC 
L..MALAYALAM VOWEL SIGN VOCALIC LL +0DCA ; Mn # SINHALA SIGN AL-LAKUNA +0DD2..0DD4 ; Mn # [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA +0DD6 ; Mn # SINHALA VOWEL SIGN DIGA PAA-PILLA +0E31 ; Mn # THAI CHARACTER MAI HAN-AKAT +0E34..0E3A ; Mn # [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU +0E47..0E4E ; Mn # [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN +0EB1 ; Mn # LAO VOWEL SIGN MAI KAN +0EB4..0EBC ; Mn # [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO +0EC8..0ECD ; Mn # [6] LAO TONE MAI EK..LAO NIGGAHITA +0F18..0F19 ; Mn # [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS +0F35 ; Mn # TIBETAN MARK NGAS BZUNG NYI ZLA +0F37 ; Mn # TIBETAN MARK NGAS BZUNG SGOR RTAGS +0F39 ; Mn # TIBETAN MARK TSA -PHRU +0F71..0F7E ; Mn # [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO +0F80..0F84 ; Mn # [5] TIBETAN VOWEL SIGN REVERSED I..TIBETAN MARK HALANTA +0F86..0F87 ; Mn # [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS +0F8D..0F97 ; Mn # [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA +0F99..0FBC ; Mn # [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA +0FC6 ; Mn # TIBETAN SYMBOL PADMA GDAN +102D..1030 ; Mn # [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU +1032..1037 ; Mn # [6] MYANMAR VOWEL SIGN AI..MYANMAR SIGN DOT BELOW +1039..103A ; Mn # [2] MYANMAR SIGN VIRAMA..MYANMAR SIGN ASAT +103D..103E ; Mn # [2] MYANMAR CONSONANT SIGN MEDIAL WA..MYANMAR CONSONANT SIGN MEDIAL HA +1058..1059 ; Mn # [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL +105E..1060 ; Mn # [3] MYANMAR CONSONANT SIGN MON MEDIAL NA..MYANMAR CONSONANT SIGN MON MEDIAL LA +1071..1074 ; Mn # [4] MYANMAR VOWEL SIGN GEBA KAREN I..MYANMAR VOWEL SIGN KAYAH EE +1082 ; Mn # MYANMAR CONSONANT SIGN SHAN MEDIAL WA +1085..1086 ; Mn # [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y +108D ; Mn # MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE +109D ; Mn # MYANMAR VOWEL SIGN AITON 
AI +135D..135F ; Mn # [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK +1712..1714 ; Mn # [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA +1732..1734 ; Mn # [3] HANUNOO VOWEL SIGN I..HANUNOO SIGN PAMUDPOD +1752..1753 ; Mn # [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U +1772..1773 ; Mn # [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U +17B4..17B5 ; Mn # [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA +17B7..17BD ; Mn # [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA +17C6 ; Mn # KHMER SIGN NIKAHIT +17C9..17D3 ; Mn # [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT +17DD ; Mn # KHMER SIGN ATTHACAN +180B..180D ; Mn # [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE +1885..1886 ; Mn # [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA +18A9 ; Mn # MONGOLIAN LETTER ALI GALI DAGALGA +1920..1922 ; Mn # [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U +1927..1928 ; Mn # [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O +1932 ; Mn # LIMBU SMALL LETTER ANUSVARA +1939..193B ; Mn # [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I +1A17..1A18 ; Mn # [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U +1A1B ; Mn # BUGINESE VOWEL SIGN AE +1A56 ; Mn # TAI THAM CONSONANT SIGN MEDIAL LA +1A58..1A5E ; Mn # [7] TAI THAM SIGN MAI KANG LAI..TAI THAM CONSONANT SIGN SA +1A60 ; Mn # TAI THAM SIGN SAKOT +1A62 ; Mn # TAI THAM VOWEL SIGN MAI SAT +1A65..1A6C ; Mn # [8] TAI THAM VOWEL SIGN I..TAI THAM VOWEL SIGN OA BELOW +1A73..1A7C ; Mn # [10] TAI THAM VOWEL SIGN OA ABOVE..TAI THAM SIGN KHUEN-LUE KARAN +1A7F ; Mn # TAI THAM COMBINING CRYPTOGRAMMIC DOT +1AB0..1ABD ; Mn # [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW +1B00..1B03 ; Mn # [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG +1B34 ; Mn # BALINESE SIGN REREKAN +1B36..1B3A ; Mn # [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA +1B3C ; Mn # BALINESE VOWEL SIGN LA LENGA +1B42 ; Mn # BALINESE VOWEL SIGN PEPET +1B6B..1B73 ; Mn # 
[9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG +1B80..1B81 ; Mn # [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR +1BA2..1BA5 ; Mn # [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU +1BA8..1BA9 ; Mn # [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG +1BAB..1BAD ; Mn # [3] SUNDANESE SIGN VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA +1BE6 ; Mn # BATAK SIGN TOMPI +1BE8..1BE9 ; Mn # [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE +1BED ; Mn # BATAK VOWEL SIGN KARO O +1BEF..1BF1 ; Mn # [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H +1C2C..1C33 ; Mn # [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T +1C36..1C37 ; Mn # [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA +1CD0..1CD2 ; Mn # [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA +1CD4..1CE0 ; Mn # [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA +1CE2..1CE8 ; Mn # [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL +1CED ; Mn # VEDIC SIGN TIRYAK +1CF4 ; Mn # VEDIC TONE CANDRA ABOVE +1CF8..1CF9 ; Mn # [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE +1DC0..1DF9 ; Mn # [58] COMBINING DOTTED GRAVE ACCENT..COMBINING WIDE INVERTED BRIDGE BELOW +1DFB..1DFF ; Mn # [5] COMBINING DELETION MARK..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW +20D0..20DC ; Mn # [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE +20E1 ; Mn # COMBINING LEFT RIGHT ARROW ABOVE +20E5..20F0 ; Mn # [12] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING ASTERISK ABOVE +2CEF..2CF1 ; Mn # [3] COPTIC COMBINING NI ABOVE..COPTIC COMBINING SPIRITUS LENIS +2D7F ; Mn # TIFINAGH CONSONANT JOINER +2DE0..2DFF ; Mn # [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS +302A..302D ; Mn # [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK +3099..309A ; Mn # [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK +A66F ; Mn 
# COMBINING CYRILLIC VZMET +A674..A67D ; Mn # [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK +A69E..A69F ; Mn # [2] COMBINING CYRILLIC LETTER EF..COMBINING CYRILLIC LETTER IOTIFIED E +A6F0..A6F1 ; Mn # [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS +A802 ; Mn # SYLOTI NAGRI SIGN DVISVARA +A806 ; Mn # SYLOTI NAGRI SIGN HASANTA +A80B ; Mn # SYLOTI NAGRI SIGN ANUSVARA +A825..A826 ; Mn # [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E +A8C4..A8C5 ; Mn # [2] SAURASHTRA SIGN VIRAMA..SAURASHTRA SIGN CANDRABINDU +A8E0..A8F1 ; Mn # [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA +A8FF ; Mn # DEVANAGARI VOWEL SIGN AY +A926..A92D ; Mn # [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU +A947..A951 ; Mn # [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R +A980..A982 ; Mn # [3] JAVANESE SIGN PANYANGGA..JAVANESE SIGN LAYAR +A9B3 ; Mn # JAVANESE SIGN CECAK TELU +A9B6..A9B9 ; Mn # [4] JAVANESE VOWEL SIGN WULU..JAVANESE VOWEL SIGN SUKU MENDUT +A9BC..A9BD ; Mn # [2] JAVANESE VOWEL SIGN PEPET..JAVANESE CONSONANT SIGN KERET +A9E5 ; Mn # MYANMAR SIGN SHAN SAW +AA29..AA2E ; Mn # [6] CHAM VOWEL SIGN AA..CHAM VOWEL SIGN OE +AA31..AA32 ; Mn # [2] CHAM VOWEL SIGN AU..CHAM VOWEL SIGN UE +AA35..AA36 ; Mn # [2] CHAM CONSONANT SIGN LA..CHAM CONSONANT SIGN WA +AA43 ; Mn # CHAM CONSONANT SIGN FINAL NG +AA4C ; Mn # CHAM CONSONANT SIGN FINAL M +AA7C ; Mn # MYANMAR SIGN TAI LAING TONE-2 +AAB0 ; Mn # TAI VIET MAI KANG +AAB2..AAB4 ; Mn # [3] TAI VIET VOWEL I..TAI VIET VOWEL U +AAB7..AAB8 ; Mn # [2] TAI VIET MAI KHIT..TAI VIET VOWEL IA +AABE..AABF ; Mn # [2] TAI VIET VOWEL AM..TAI VIET TONE MAI EK +AAC1 ; Mn # TAI VIET TONE MAI THO +AAEC..AAED ; Mn # [2] MEETEI MAYEK VOWEL SIGN UU..MEETEI MAYEK VOWEL SIGN AAI +AAF6 ; Mn # MEETEI MAYEK VIRAMA +ABE5 ; Mn # MEETEI MAYEK VOWEL SIGN ANAP +ABE8 ; Mn # MEETEI MAYEK VOWEL SIGN UNAP +ABED ; Mn # MEETEI MAYEK APUN IYEK +FB1E ; Mn # HEBREW POINT JUDEO-SPANISH VARIKA +FE00..FE0F ; Mn # 
[16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 +FE20..FE2F ; Mn # [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITLO RIGHT HALF +101FD ; Mn # PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE +102E0 ; Mn # COPTIC EPACT THOUSANDS MARK +10376..1037A ; Mn # [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII +10A01..10A03 ; Mn # [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R +10A05..10A06 ; Mn # [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O +10A0C..10A0F ; Mn # [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA +10A38..10A3A ; Mn # [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW +10A3F ; Mn # KHAROSHTHI VIRAMA +10AE5..10AE6 ; Mn # [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW +10D24..10D27 ; Mn # [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI +10F46..10F50 ; Mn # [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW +11001 ; Mn # BRAHMI SIGN ANUSVARA +11038..11046 ; Mn # [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA +1107F..11081 ; Mn # [3] BRAHMI NUMBER JOINER..KAITHI SIGN ANUSVARA +110B3..110B6 ; Mn # [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI +110B9..110BA ; Mn # [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA +11100..11102 ; Mn # [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA +11127..1112B ; Mn # [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU +1112D..11134 ; Mn # [8] CHAKMA VOWEL SIGN AI..CHAKMA MAAYYAA +11173 ; Mn # MAHAJANI SIGN NUKTA +11180..11181 ; Mn # [2] SHARADA SIGN CANDRABINDU..SHARADA SIGN ANUSVARA +111B6..111BE ; Mn # [9] SHARADA VOWEL SIGN U..SHARADA VOWEL SIGN O +111C9..111CC ; Mn # [4] SHARADA SANDHI MARK..SHARADA EXTRA SHORT VOWEL MARK +1122F..11231 ; Mn # [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI +11234 ; Mn # KHOJKI SIGN ANUSVARA +11236..11237 ; Mn # [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA +1123E ; Mn # KHOJKI SIGN SUKUN +112DF ; Mn # KHUDAWADI SIGN ANUSVARA +112E3..112EA ; Mn # [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA 
+11300..11301 ; Mn # [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU +1133B..1133C ; Mn # [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA +11340 ; Mn # GRANTHA VOWEL SIGN II +11366..1136C ; Mn # [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX +11370..11374 ; Mn # [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA +11438..1143F ; Mn # [8] NEWA VOWEL SIGN U..NEWA VOWEL SIGN AI +11442..11444 ; Mn # [3] NEWA SIGN VIRAMA..NEWA SIGN ANUSVARA +11446 ; Mn # NEWA SIGN NUKTA +1145E ; Mn # NEWA SANDHI MARK +114B3..114B8 ; Mn # [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL +114BA ; Mn # TIRHUTA VOWEL SIGN SHORT E +114BF..114C0 ; Mn # [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA +114C2..114C3 ; Mn # [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA +115B2..115B5 ; Mn # [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR +115BC..115BD ; Mn # [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA +115BF..115C0 ; Mn # [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA +115DC..115DD ; Mn # [2] SIDDHAM VOWEL SIGN ALTERNATE U..SIDDHAM VOWEL SIGN ALTERNATE UU +11633..1163A ; Mn # [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI +1163D ; Mn # MODI SIGN ANUSVARA +1163F..11640 ; Mn # [2] MODI SIGN VIRAMA..MODI SIGN ARDHACANDRA +116AB ; Mn # TAKRI SIGN ANUSVARA +116AD ; Mn # TAKRI VOWEL SIGN AA +116B0..116B5 ; Mn # [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU +116B7 ; Mn # TAKRI SIGN NUKTA +1171D..1171F ; Mn # [3] AHOM CONSONANT SIGN MEDIAL LA..AHOM CONSONANT SIGN MEDIAL LIGATING RA +11722..11725 ; Mn # [4] AHOM VOWEL SIGN I..AHOM VOWEL SIGN UU +11727..1172B ; Mn # [5] AHOM VOWEL SIGN AW..AHOM SIGN KILLER +1182F..11837 ; Mn # [9] DOGRA VOWEL SIGN U..DOGRA SIGN ANUSVARA +11839..1183A ; Mn # [2] DOGRA SIGN VIRAMA..DOGRA SIGN NUKTA +119D4..119D7 ; Mn # [4] NANDINAGARI VOWEL SIGN U..NANDINAGARI VOWEL SIGN VOCALIC RR +119DA..119DB ; Mn # [2] NANDINAGARI VOWEL SIGN E..NANDINAGARI VOWEL SIGN AI +119E0 ; Mn # NANDINAGARI SIGN VIRAMA +11A01..11A0A ; Mn # [10] 
ZANABAZAR SQUARE VOWEL SIGN I..ZANABAZAR SQUARE VOWEL LENGTH MARK +11A33..11A38 ; Mn # [6] ZANABAZAR SQUARE FINAL CONSONANT MARK..ZANABAZAR SQUARE SIGN ANUSVARA +11A3B..11A3E ; Mn # [4] ZANABAZAR SQUARE CLUSTER-FINAL LETTER YA..ZANABAZAR SQUARE CLUSTER-FINAL LETTER VA +11A47 ; Mn # ZANABAZAR SQUARE SUBJOINER +11A51..11A56 ; Mn # [6] SOYOMBO VOWEL SIGN I..SOYOMBO VOWEL SIGN OE +11A59..11A5B ; Mn # [3] SOYOMBO VOWEL SIGN VOCALIC R..SOYOMBO VOWEL LENGTH MARK +11A8A..11A96 ; Mn # [13] SOYOMBO FINAL CONSONANT SIGN G..SOYOMBO SIGN ANUSVARA +11A98..11A99 ; Mn # [2] SOYOMBO GEMINATION MARK..SOYOMBO SUBJOINER +11C30..11C36 ; Mn # [7] BHAIKSUKI VOWEL SIGN I..BHAIKSUKI VOWEL SIGN VOCALIC L +11C38..11C3D ; Mn # [6] BHAIKSUKI VOWEL SIGN E..BHAIKSUKI SIGN ANUSVARA +11C3F ; Mn # BHAIKSUKI SIGN VIRAMA +11C92..11CA7 ; Mn # [22] MARCHEN SUBJOINED LETTER KA..MARCHEN SUBJOINED LETTER ZA +11CAA..11CB0 ; Mn # [7] MARCHEN SUBJOINED LETTER RA..MARCHEN VOWEL SIGN AA +11CB2..11CB3 ; Mn # [2] MARCHEN VOWEL SIGN U..MARCHEN VOWEL SIGN E +11CB5..11CB6 ; Mn # [2] MARCHEN SIGN ANUSVARA..MARCHEN SIGN CANDRABINDU +11D31..11D36 ; Mn # [6] MASARAM GONDI VOWEL SIGN AA..MASARAM GONDI VOWEL SIGN VOCALIC R +11D3A ; Mn # MASARAM GONDI VOWEL SIGN E +11D3C..11D3D ; Mn # [2] MASARAM GONDI VOWEL SIGN AI..MASARAM GONDI VOWEL SIGN O +11D3F..11D45 ; Mn # [7] MASARAM GONDI VOWEL SIGN AU..MASARAM GONDI VIRAMA +11D47 ; Mn # MASARAM GONDI RA-KARA +11D90..11D91 ; Mn # [2] GUNJALA GONDI VOWEL SIGN EE..GUNJALA GONDI VOWEL SIGN AI +11D95 ; Mn # GUNJALA GONDI SIGN ANUSVARA +11D97 ; Mn # GUNJALA GONDI VIRAMA +11EF3..11EF4 ; Mn # [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U +16AF0..16AF4 ; Mn # [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE +16B30..16B36 ; Mn # [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM +16F4F ; Mn # MIAO SIGN CONSONANT MODIFIER BAR +16F8F..16F92 ; Mn # [4] MIAO TONE RIGHT..MIAO TONE BELOW +1BC9D..1BC9E ; Mn # [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK 
+1D167..1D169 ; Mn # [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3 +1D17B..1D182 ; Mn # [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE +1D185..1D18B ; Mn # [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE +1D1AA..1D1AD ; Mn # [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO +1D242..1D244 ; Mn # [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME +1DA00..1DA36 ; Mn # [55] SIGNWRITING HEAD RIM..SIGNWRITING AIR SUCKING IN +1DA3B..1DA6C ; Mn # [50] SIGNWRITING MOUTH CLOSED NEUTRAL..SIGNWRITING EXCITEMENT +1DA75 ; Mn # SIGNWRITING UPPER BODY TILTING FROM HIP JOINTS +1DA84 ; Mn # SIGNWRITING LOCATION HEAD NECK +1DA9B..1DA9F ; Mn # [5] SIGNWRITING FILL MODIFIER-2..SIGNWRITING FILL MODIFIER-6 +1DAA1..1DAAF ; Mn # [15] SIGNWRITING ROTATION MODIFIER-2..SIGNWRITING ROTATION MODIFIER-16 +1E000..1E006 ; Mn # [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE +1E008..1E018 ; Mn # [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU +1E01B..1E021 ; Mn # [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI +1E023..1E024 ; Mn # [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS +1E026..1E02A ; Mn # [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA +1E130..1E136 ; Mn # [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D +1E2EC..1E2EF ; Mn # [4] WANCHO TONE TUP..WANCHO TONE KOINI +1E8D0..1E8D6 ; Mn # [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS +1E944..1E94A ; Mn # [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA +E0100..E01EF ; Mn # [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 + +# Total code points: 1826 + +# ================================================ + +# General_Category=Enclosing_Mark + +0488..0489 ; Me # [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN +1ABE ; Me # 
COMBINING PARENTHESES OVERLAY +20DD..20E0 ; Me # [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH +20E2..20E4 ; Me # [3] COMBINING ENCLOSING SCREEN..COMBINING ENCLOSING UPWARD POINTING TRIANGLE +A670..A672 ; Me # [3] COMBINING CYRILLIC TEN MILLIONS SIGN..COMBINING CYRILLIC THOUSAND MILLIONS SIGN + +# Total code points: 13 + +# ================================================ + +# General_Category=Spacing_Mark + +0903 ; Mc # DEVANAGARI SIGN VISARGA +093B ; Mc # DEVANAGARI VOWEL SIGN OOE +093E..0940 ; Mc # [3] DEVANAGARI VOWEL SIGN AA..DEVANAGARI VOWEL SIGN II +0949..094C ; Mc # [4] DEVANAGARI VOWEL SIGN CANDRA O..DEVANAGARI VOWEL SIGN AU +094E..094F ; Mc # [2] DEVANAGARI VOWEL SIGN PRISHTHAMATRA E..DEVANAGARI VOWEL SIGN AW +0982..0983 ; Mc # [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA +09BE..09C0 ; Mc # [3] BENGALI VOWEL SIGN AA..BENGALI VOWEL SIGN II +09C7..09C8 ; Mc # [2] BENGALI VOWEL SIGN E..BENGALI VOWEL SIGN AI +09CB..09CC ; Mc # [2] BENGALI VOWEL SIGN O..BENGALI VOWEL SIGN AU +09D7 ; Mc # BENGALI AU LENGTH MARK +0A03 ; Mc # GURMUKHI SIGN VISARGA +0A3E..0A40 ; Mc # [3] GURMUKHI VOWEL SIGN AA..GURMUKHI VOWEL SIGN II +0A83 ; Mc # GUJARATI SIGN VISARGA +0ABE..0AC0 ; Mc # [3] GUJARATI VOWEL SIGN AA..GUJARATI VOWEL SIGN II +0AC9 ; Mc # GUJARATI VOWEL SIGN CANDRA O +0ACB..0ACC ; Mc # [2] GUJARATI VOWEL SIGN O..GUJARATI VOWEL SIGN AU +0B02..0B03 ; Mc # [2] ORIYA SIGN ANUSVARA..ORIYA SIGN VISARGA +0B3E ; Mc # ORIYA VOWEL SIGN AA +0B40 ; Mc # ORIYA VOWEL SIGN II +0B47..0B48 ; Mc # [2] ORIYA VOWEL SIGN E..ORIYA VOWEL SIGN AI +0B4B..0B4C ; Mc # [2] ORIYA VOWEL SIGN O..ORIYA VOWEL SIGN AU +0B57 ; Mc # ORIYA AU LENGTH MARK +0BBE..0BBF ; Mc # [2] TAMIL VOWEL SIGN AA..TAMIL VOWEL SIGN I +0BC1..0BC2 ; Mc # [2] TAMIL VOWEL SIGN U..TAMIL VOWEL SIGN UU +0BC6..0BC8 ; Mc # [3] TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI +0BCA..0BCC ; Mc # [3] TAMIL VOWEL SIGN O..TAMIL VOWEL SIGN AU +0BD7 ; Mc # TAMIL AU LENGTH MARK +0C01..0C03 ; Mc # [3] TELUGU SIGN 
CANDRABINDU..TELUGU SIGN VISARGA +0C41..0C44 ; Mc # [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR +0C82..0C83 ; Mc # [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA +0CBE ; Mc # KANNADA VOWEL SIGN AA +0CC0..0CC4 ; Mc # [5] KANNADA VOWEL SIGN II..KANNADA VOWEL SIGN VOCALIC RR +0CC7..0CC8 ; Mc # [2] KANNADA VOWEL SIGN EE..KANNADA VOWEL SIGN AI +0CCA..0CCB ; Mc # [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO +0CD5..0CD6 ; Mc # [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK +0D02..0D03 ; Mc # [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA +0D3E..0D40 ; Mc # [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II +0D46..0D48 ; Mc # [3] MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN AI +0D4A..0D4C ; Mc # [3] MALAYALAM VOWEL SIGN O..MALAYALAM VOWEL SIGN AU +0D57 ; Mc # MALAYALAM AU LENGTH MARK +0D82..0D83 ; Mc # [2] SINHALA SIGN ANUSVARAYA..SINHALA SIGN VISARGAYA +0DCF..0DD1 ; Mc # [3] SINHALA VOWEL SIGN AELA-PILLA..SINHALA VOWEL SIGN DIGA AEDA-PILLA +0DD8..0DDF ; Mc # [8] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN GAYANUKITTA +0DF2..0DF3 ; Mc # [2] SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHALA VOWEL SIGN DIGA GAYANUKITTA +0F3E..0F3F ; Mc # [2] TIBETAN SIGN YAR TSHES..TIBETAN SIGN MAR TSHES +0F7F ; Mc # TIBETAN SIGN RNAM BCAD +102B..102C ; Mc # [2] MYANMAR VOWEL SIGN TALL AA..MYANMAR VOWEL SIGN AA +1031 ; Mc # MYANMAR VOWEL SIGN E +1038 ; Mc # MYANMAR SIGN VISARGA +103B..103C ; Mc # [2] MYANMAR CONSONANT SIGN MEDIAL YA..MYANMAR CONSONANT SIGN MEDIAL RA +1056..1057 ; Mc # [2] MYANMAR VOWEL SIGN VOCALIC R..MYANMAR VOWEL SIGN VOCALIC RR +1062..1064 ; Mc # [3] MYANMAR VOWEL SIGN SGAW KAREN EU..MYANMAR TONE MARK SGAW KAREN KE PHO +1067..106D ; Mc # [7] MYANMAR VOWEL SIGN WESTERN PWO KAREN EU..MYANMAR SIGN WESTERN PWO KAREN TONE-5 +1083..1084 ; Mc # [2] MYANMAR VOWEL SIGN SHAN AA..MYANMAR VOWEL SIGN SHAN E +1087..108C ; Mc # [6] MYANMAR SIGN SHAN TONE-2..MYANMAR SIGN SHAN COUNCIL TONE-3 +108F ; Mc # MYANMAR SIGN RUMAI PALAUNG TONE-5 +109A..109C ; Mc # [3] 
MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A +17B6 ; Mc # KHMER VOWEL SIGN AA +17BE..17C5 ; Mc # [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU +17C7..17C8 ; Mc # [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU +1923..1926 ; Mc # [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU +1929..192B ; Mc # [3] LIMBU SUBJOINED LETTER YA..LIMBU SUBJOINED LETTER WA +1930..1931 ; Mc # [2] LIMBU SMALL LETTER KA..LIMBU SMALL LETTER NGA +1933..1938 ; Mc # [6] LIMBU SMALL LETTER TA..LIMBU SMALL LETTER LA +1A19..1A1A ; Mc # [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O +1A55 ; Mc # TAI THAM CONSONANT SIGN MEDIAL RA +1A57 ; Mc # TAI THAM CONSONANT SIGN LA TANG LAI +1A61 ; Mc # TAI THAM VOWEL SIGN A +1A63..1A64 ; Mc # [2] TAI THAM VOWEL SIGN AA..TAI THAM VOWEL SIGN TALL AA +1A6D..1A72 ; Mc # [6] TAI THAM VOWEL SIGN OY..TAI THAM VOWEL SIGN THAM AI +1B04 ; Mc # BALINESE SIGN BISAH +1B35 ; Mc # BALINESE VOWEL SIGN TEDUNG +1B3B ; Mc # BALINESE VOWEL SIGN RA REPA TEDUNG +1B3D..1B41 ; Mc # [5] BALINESE VOWEL SIGN LA LENGA TEDUNG..BALINESE VOWEL SIGN TALING REPA TEDUNG +1B43..1B44 ; Mc # [2] BALINESE VOWEL SIGN PEPET TEDUNG..BALINESE ADEG ADEG +1B82 ; Mc # SUNDANESE SIGN PANGWISAD +1BA1 ; Mc # SUNDANESE CONSONANT SIGN PAMINGKAL +1BA6..1BA7 ; Mc # [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG +1BAA ; Mc # SUNDANESE SIGN PAMAAEH +1BE7 ; Mc # BATAK VOWEL SIGN E +1BEA..1BEC ; Mc # [3] BATAK VOWEL SIGN I..BATAK VOWEL SIGN O +1BEE ; Mc # BATAK VOWEL SIGN U +1BF2..1BF3 ; Mc # [2] BATAK PANGOLAT..BATAK PANONGONAN +1C24..1C2B ; Mc # [8] LEPCHA SUBJOINED LETTER YA..LEPCHA VOWEL SIGN UU +1C34..1C35 ; Mc # [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG +1CE1 ; Mc # VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA +1CF7 ; Mc # VEDIC SIGN ATIKRAMA +302E..302F ; Mc # [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK +A823..A824 ; Mc # [2] SYLOTI NAGRI VOWEL SIGN A..SYLOTI NAGRI VOWEL SIGN I +A827 ; Mc # SYLOTI NAGRI VOWEL SIGN OO +A880..A881 ; Mc # [2] 
SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VISARGA +A8B4..A8C3 ; Mc # [16] SAURASHTRA CONSONANT SIGN HAARU..SAURASHTRA VOWEL SIGN AU +A952..A953 ; Mc # [2] REJANG CONSONANT SIGN H..REJANG VIRAMA +A983 ; Mc # JAVANESE SIGN WIGNYAN +A9B4..A9B5 ; Mc # [2] JAVANESE VOWEL SIGN TARUNG..JAVANESE VOWEL SIGN TOLONG +A9BA..A9BB ; Mc # [2] JAVANESE VOWEL SIGN TALING..JAVANESE VOWEL SIGN DIRGA MURE +A9BE..A9C0 ; Mc # [3] JAVANESE CONSONANT SIGN PENGKAL..JAVANESE PANGKON +AA2F..AA30 ; Mc # [2] CHAM VOWEL SIGN O..CHAM VOWEL SIGN AI +AA33..AA34 ; Mc # [2] CHAM CONSONANT SIGN YA..CHAM CONSONANT SIGN RA +AA4D ; Mc # CHAM CONSONANT SIGN FINAL H +AA7B ; Mc # MYANMAR SIGN PAO KAREN TONE +AA7D ; Mc # MYANMAR SIGN TAI LAING TONE-5 +AAEB ; Mc # MEETEI MAYEK VOWEL SIGN II +AAEE..AAEF ; Mc # [2] MEETEI MAYEK VOWEL SIGN AU..MEETEI MAYEK VOWEL SIGN AAU +AAF5 ; Mc # MEETEI MAYEK VOWEL SIGN VISARGA +ABE3..ABE4 ; Mc # [2] MEETEI MAYEK VOWEL SIGN ONAP..MEETEI MAYEK VOWEL SIGN INAP +ABE6..ABE7 ; Mc # [2] MEETEI MAYEK VOWEL SIGN YENAP..MEETEI MAYEK VOWEL SIGN SOUNAP +ABE9..ABEA ; Mc # [2] MEETEI MAYEK VOWEL SIGN CHEINAP..MEETEI MAYEK VOWEL SIGN NUNG +ABEC ; Mc # MEETEI MAYEK LUM IYEK +11000 ; Mc # BRAHMI SIGN CANDRABINDU +11002 ; Mc # BRAHMI SIGN VISARGA +11082 ; Mc # KAITHI SIGN VISARGA +110B0..110B2 ; Mc # [3] KAITHI VOWEL SIGN AA..KAITHI VOWEL SIGN II +110B7..110B8 ; Mc # [2] KAITHI VOWEL SIGN O..KAITHI VOWEL SIGN AU +1112C ; Mc # CHAKMA VOWEL SIGN E +11145..11146 ; Mc # [2] CHAKMA VOWEL SIGN AA..CHAKMA VOWEL SIGN EI +11182 ; Mc # SHARADA SIGN VISARGA +111B3..111B5 ; Mc # [3] SHARADA VOWEL SIGN AA..SHARADA VOWEL SIGN II +111BF..111C0 ; Mc # [2] SHARADA VOWEL SIGN AU..SHARADA SIGN VIRAMA +1122C..1122E ; Mc # [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II +11232..11233 ; Mc # [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU +11235 ; Mc # KHOJKI SIGN VIRAMA +112E0..112E2 ; Mc # [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II +11302..11303 ; Mc # [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA 
+1133E..1133F ; Mc # [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I +11341..11344 ; Mc # [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR +11347..11348 ; Mc # [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI +1134B..1134D ; Mc # [3] GRANTHA VOWEL SIGN OO..GRANTHA SIGN VIRAMA +11357 ; Mc # GRANTHA AU LENGTH MARK +11362..11363 ; Mc # [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL +11435..11437 ; Mc # [3] NEWA VOWEL SIGN AA..NEWA VOWEL SIGN II +11440..11441 ; Mc # [2] NEWA VOWEL SIGN O..NEWA VOWEL SIGN AU +11445 ; Mc # NEWA SIGN VISARGA +114B0..114B2 ; Mc # [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II +114B9 ; Mc # TIRHUTA VOWEL SIGN E +114BB..114BE ; Mc # [4] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU +114C1 ; Mc # TIRHUTA SIGN VISARGA +115AF..115B1 ; Mc # [3] SIDDHAM VOWEL SIGN AA..SIDDHAM VOWEL SIGN II +115B8..115BB ; Mc # [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU +115BE ; Mc # SIDDHAM SIGN VISARGA +11630..11632 ; Mc # [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II +1163B..1163C ; Mc # [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU +1163E ; Mc # MODI SIGN VISARGA +116AC ; Mc # TAKRI SIGN VISARGA +116AE..116AF ; Mc # [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II +116B6 ; Mc # TAKRI SIGN VIRAMA +11720..11721 ; Mc # [2] AHOM VOWEL SIGN A..AHOM VOWEL SIGN AA +11726 ; Mc # AHOM VOWEL SIGN E +1182C..1182E ; Mc # [3] DOGRA VOWEL SIGN AA..DOGRA VOWEL SIGN II +11838 ; Mc # DOGRA SIGN VISARGA +119D1..119D3 ; Mc # [3] NANDINAGARI VOWEL SIGN AA..NANDINAGARI VOWEL SIGN II +119DC..119DF ; Mc # [4] NANDINAGARI VOWEL SIGN O..NANDINAGARI SIGN VISARGA +119E4 ; Mc # NANDINAGARI VOWEL SIGN PRISHTHAMATRA E +11A39 ; Mc # ZANABAZAR SQUARE SIGN VISARGA +11A57..11A58 ; Mc # [2] SOYOMBO VOWEL SIGN AI..SOYOMBO VOWEL SIGN AU +11A97 ; Mc # SOYOMBO SIGN VISARGA +11C2F ; Mc # BHAIKSUKI VOWEL SIGN AA +11C3E ; Mc # BHAIKSUKI SIGN VISARGA +11CA9 ; Mc # MARCHEN SUBJOINED LETTER YA +11CB1 ; Mc # MARCHEN VOWEL SIGN I +11CB4 ; Mc # MARCHEN VOWEL SIGN O +11D8A..11D8E ; Mc # [5] 
GUNJALA GONDI VOWEL SIGN AA..GUNJALA GONDI VOWEL SIGN UU +11D93..11D94 ; Mc # [2] GUNJALA GONDI VOWEL SIGN OO..GUNJALA GONDI VOWEL SIGN AU +11D96 ; Mc # GUNJALA GONDI SIGN VISARGA +11EF5..11EF6 ; Mc # [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O +16F51..16F87 ; Mc # [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI +1D165..1D166 ; Mc # [2] MUSICAL SYMBOL COMBINING STEM..MUSICAL SYMBOL COMBINING SPRECHGESANG STEM +1D16D..1D172 ; Mc # [6] MUSICAL SYMBOL COMBINING AUGMENTATION DOT..MUSICAL SYMBOL COMBINING FLAG-5 + +# Total code points: 429 + +# ================================================ + +# General_Category=Decimal_Number + +0030..0039 ; Nd # [10] DIGIT ZERO..DIGIT NINE +0660..0669 ; Nd # [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE +06F0..06F9 ; Nd # [10] EXTENDED ARABIC-INDIC DIGIT ZERO..EXTENDED ARABIC-INDIC DIGIT NINE +07C0..07C9 ; Nd # [10] NKO DIGIT ZERO..NKO DIGIT NINE +0966..096F ; Nd # [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE +09E6..09EF ; Nd # [10] BENGALI DIGIT ZERO..BENGALI DIGIT NINE +0A66..0A6F ; Nd # [10] GURMUKHI DIGIT ZERO..GURMUKHI DIGIT NINE +0AE6..0AEF ; Nd # [10] GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE +0B66..0B6F ; Nd # [10] ORIYA DIGIT ZERO..ORIYA DIGIT NINE +0BE6..0BEF ; Nd # [10] TAMIL DIGIT ZERO..TAMIL DIGIT NINE +0C66..0C6F ; Nd # [10] TELUGU DIGIT ZERO..TELUGU DIGIT NINE +0CE6..0CEF ; Nd # [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE +0D66..0D6F ; Nd # [10] MALAYALAM DIGIT ZERO..MALAYALAM DIGIT NINE +0DE6..0DEF ; Nd # [10] SINHALA LITH DIGIT ZERO..SINHALA LITH DIGIT NINE +0E50..0E59 ; Nd # [10] THAI DIGIT ZERO..THAI DIGIT NINE +0ED0..0ED9 ; Nd # [10] LAO DIGIT ZERO..LAO DIGIT NINE +0F20..0F29 ; Nd # [10] TIBETAN DIGIT ZERO..TIBETAN DIGIT NINE +1040..1049 ; Nd # [10] MYANMAR DIGIT ZERO..MYANMAR DIGIT NINE +1090..1099 ; Nd # [10] MYANMAR SHAN DIGIT ZERO..MYANMAR SHAN DIGIT NINE +17E0..17E9 ; Nd # [10] KHMER DIGIT ZERO..KHMER DIGIT NINE +1810..1819 ; Nd # [10] MONGOLIAN DIGIT ZERO..MONGOLIAN DIGIT NINE +1946..194F ; Nd 
# [10] LIMBU DIGIT ZERO..LIMBU DIGIT NINE +19D0..19D9 ; Nd # [10] NEW TAI LUE DIGIT ZERO..NEW TAI LUE DIGIT NINE +1A80..1A89 ; Nd # [10] TAI THAM HORA DIGIT ZERO..TAI THAM HORA DIGIT NINE +1A90..1A99 ; Nd # [10] TAI THAM THAM DIGIT ZERO..TAI THAM THAM DIGIT NINE +1B50..1B59 ; Nd # [10] BALINESE DIGIT ZERO..BALINESE DIGIT NINE +1BB0..1BB9 ; Nd # [10] SUNDANESE DIGIT ZERO..SUNDANESE DIGIT NINE +1C40..1C49 ; Nd # [10] LEPCHA DIGIT ZERO..LEPCHA DIGIT NINE +1C50..1C59 ; Nd # [10] OL CHIKI DIGIT ZERO..OL CHIKI DIGIT NINE +A620..A629 ; Nd # [10] VAI DIGIT ZERO..VAI DIGIT NINE +A8D0..A8D9 ; Nd # [10] SAURASHTRA DIGIT ZERO..SAURASHTRA DIGIT NINE +A900..A909 ; Nd # [10] KAYAH LI DIGIT ZERO..KAYAH LI DIGIT NINE +A9D0..A9D9 ; Nd # [10] JAVANESE DIGIT ZERO..JAVANESE DIGIT NINE +A9F0..A9F9 ; Nd # [10] MYANMAR TAI LAING DIGIT ZERO..MYANMAR TAI LAING DIGIT NINE +AA50..AA59 ; Nd # [10] CHAM DIGIT ZERO..CHAM DIGIT NINE +ABF0..ABF9 ; Nd # [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DIGIT NINE +FF10..FF19 ; Nd # [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE +104A0..104A9 ; Nd # [10] OSMANYA DIGIT ZERO..OSMANYA DIGIT NINE +10D30..10D39 ; Nd # [10] HANIFI ROHINGYA DIGIT ZERO..HANIFI ROHINGYA DIGIT NINE +11066..1106F ; Nd # [10] BRAHMI DIGIT ZERO..BRAHMI DIGIT NINE +110F0..110F9 ; Nd # [10] SORA SOMPENG DIGIT ZERO..SORA SOMPENG DIGIT NINE +11136..1113F ; Nd # [10] CHAKMA DIGIT ZERO..CHAKMA DIGIT NINE +111D0..111D9 ; Nd # [10] SHARADA DIGIT ZERO..SHARADA DIGIT NINE +112F0..112F9 ; Nd # [10] KHUDAWADI DIGIT ZERO..KHUDAWADI DIGIT NINE +11450..11459 ; Nd # [10] NEWA DIGIT ZERO..NEWA DIGIT NINE +114D0..114D9 ; Nd # [10] TIRHUTA DIGIT ZERO..TIRHUTA DIGIT NINE +11650..11659 ; Nd # [10] MODI DIGIT ZERO..MODI DIGIT NINE +116C0..116C9 ; Nd # [10] TAKRI DIGIT ZERO..TAKRI DIGIT NINE +11730..11739 ; Nd # [10] AHOM DIGIT ZERO..AHOM DIGIT NINE +118E0..118E9 ; Nd # [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE +11C50..11C59 ; Nd # [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE +11D50..11D59 ; Nd 
# [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE +11DA0..11DA9 ; Nd # [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE +16A60..16A69 ; Nd # [10] MRO DIGIT ZERO..MRO DIGIT NINE +16B50..16B59 ; Nd # [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE +1D7CE..1D7FF ; Nd # [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE +1E140..1E149 ; Nd # [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE +1E2F0..1E2F9 ; Nd # [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE +1E950..1E959 ; Nd # [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE + +# Total code points: 630 + +# ================================================ + +# General_Category=Letter_Number + +16EE..16F0 ; Nl # [3] RUNIC ARLAUG SYMBOL..RUNIC BELGTHOR SYMBOL +2160..2182 ; Nl # [35] ROMAN NUMERAL ONE..ROMAN NUMERAL TEN THOUSAND +2185..2188 ; Nl # [4] ROMAN NUMERAL SIX LATE FORM..ROMAN NUMERAL ONE HUNDRED THOUSAND +3007 ; Nl # IDEOGRAPHIC NUMBER ZERO +3021..3029 ; Nl # [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE +3038..303A ; Nl # [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY +A6E6..A6EF ; Nl # [10] BAMUM LETTER MO..BAMUM LETTER KOGHOM +10140..10174 ; Nl # [53] GREEK ACROPHONIC ATTIC ONE QUARTER..GREEK ACROPHONIC STRATIAN FIFTY MNAS +10341 ; Nl # GOTHIC LETTER NINETY +1034A ; Nl # GOTHIC LETTER NINE HUNDRED +103D1..103D5 ; Nl # [5] OLD PERSIAN NUMBER ONE..OLD PERSIAN NUMBER HUNDRED +12400..1246E ; Nl # [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM + +# Total code points: 236 + +# ================================================ + +# General_Category=Other_Number + +00B2..00B3 ; No # [2] SUPERSCRIPT TWO..SUPERSCRIPT THREE +00B9 ; No # SUPERSCRIPT ONE +00BC..00BE ; No # [3] VULGAR FRACTION ONE QUARTER..VULGAR FRACTION THREE QUARTERS +09F4..09F9 ; No # [6] BENGALI CURRENCY NUMERATOR ONE..BENGALI CURRENCY DENOMINATOR SIXTEEN +0B72..0B77 ; No # [6] ORIYA FRACTION ONE QUARTER..ORIYA FRACTION THREE SIXTEENTHS +0BF0..0BF2 ; No # [3] TAMIL 
NUMBER TEN..TAMIL NUMBER ONE THOUSAND +0C78..0C7E ; No # [7] TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR..TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR +0D58..0D5E ; No # [7] MALAYALAM FRACTION ONE ONE-HUNDRED-AND-SIXTIETH..MALAYALAM FRACTION ONE FIFTH +0D70..0D78 ; No # [9] MALAYALAM NUMBER TEN..MALAYALAM FRACTION THREE SIXTEENTHS +0F2A..0F33 ; No # [10] TIBETAN DIGIT HALF ONE..TIBETAN DIGIT HALF ZERO +1369..137C ; No # [20] ETHIOPIC DIGIT ONE..ETHIOPIC NUMBER TEN THOUSAND +17F0..17F9 ; No # [10] KHMER SYMBOL LEK ATTAK SON..KHMER SYMBOL LEK ATTAK PRAM-BUON +19DA ; No # NEW TAI LUE THAM DIGIT ONE +2070 ; No # SUPERSCRIPT ZERO +2074..2079 ; No # [6] SUPERSCRIPT FOUR..SUPERSCRIPT NINE +2080..2089 ; No # [10] SUBSCRIPT ZERO..SUBSCRIPT NINE +2150..215F ; No # [16] VULGAR FRACTION ONE SEVENTH..FRACTION NUMERATOR ONE +2189 ; No # VULGAR FRACTION ZERO THIRDS +2460..249B ; No # [60] CIRCLED DIGIT ONE..NUMBER TWENTY FULL STOP +24EA..24FF ; No # [22] CIRCLED DIGIT ZERO..NEGATIVE CIRCLED DIGIT ZERO +2776..2793 ; No # [30] DINGBAT NEGATIVE CIRCLED DIGIT ONE..DINGBAT NEGATIVE CIRCLED SANS-SERIF NUMBER TEN +2CFD ; No # COPTIC FRACTION ONE HALF +3192..3195 ; No # [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK +3220..3229 ; No # [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN +3248..324F ; No # [8] CIRCLED NUMBER TEN ON BLACK SQUARE..CIRCLED NUMBER EIGHTY ON BLACK SQUARE +3251..325F ; No # [15] CIRCLED NUMBER TWENTY ONE..CIRCLED NUMBER THIRTY FIVE +3280..3289 ; No # [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN +32B1..32BF ; No # [15] CIRCLED NUMBER THIRTY SIX..CIRCLED NUMBER FIFTY +A830..A835 ; No # [6] NORTH INDIC FRACTION ONE QUARTER..NORTH INDIC FRACTION THREE SIXTEENTHS +10107..10133 ; No # [45] AEGEAN NUMBER ONE..AEGEAN NUMBER NINETY THOUSAND +10175..10178 ; No # [4] GREEK ONE HALF SIGN..GREEK THREE QUARTERS SIGN +1018A..1018B ; No # [2] GREEK ZERO SIGN..GREEK ONE QUARTER SIGN +102E1..102FB ; No # [27] COPTIC EPACT DIGIT 
ONE..COPTIC EPACT NUMBER NINE HUNDRED +10320..10323 ; No # [4] OLD ITALIC NUMERAL ONE..OLD ITALIC NUMERAL FIFTY +10858..1085F ; No # [8] IMPERIAL ARAMAIC NUMBER ONE..IMPERIAL ARAMAIC NUMBER TEN THOUSAND +10879..1087F ; No # [7] PALMYRENE NUMBER ONE..PALMYRENE NUMBER TWENTY +108A7..108AF ; No # [9] NABATAEAN NUMBER ONE..NABATAEAN NUMBER ONE HUNDRED +108FB..108FF ; No # [5] HATRAN NUMBER ONE..HATRAN NUMBER ONE HUNDRED +10916..1091B ; No # [6] PHOENICIAN NUMBER ONE..PHOENICIAN NUMBER THREE +109BC..109BD ; No # [2] MEROITIC CURSIVE FRACTION ELEVEN TWELFTHS..MEROITIC CURSIVE FRACTION ONE HALF +109C0..109CF ; No # [16] MEROITIC CURSIVE NUMBER ONE..MEROITIC CURSIVE NUMBER SEVENTY +109D2..109FF ; No # [46] MEROITIC CURSIVE NUMBER ONE HUNDRED..MEROITIC CURSIVE FRACTION TEN TWELFTHS +10A40..10A48 ; No # [9] KHAROSHTHI DIGIT ONE..KHAROSHTHI FRACTION ONE HALF +10A7D..10A7E ; No # [2] OLD SOUTH ARABIAN NUMBER ONE..OLD SOUTH ARABIAN NUMBER FIFTY +10A9D..10A9F ; No # [3] OLD NORTH ARABIAN NUMBER ONE..OLD NORTH ARABIAN NUMBER TWENTY +10AEB..10AEF ; No # [5] MANICHAEAN NUMBER ONE..MANICHAEAN NUMBER ONE HUNDRED +10B58..10B5F ; No # [8] INSCRIPTIONAL PARTHIAN NUMBER ONE..INSCRIPTIONAL PARTHIAN NUMBER ONE THOUSAND +10B78..10B7F ; No # [8] INSCRIPTIONAL PAHLAVI NUMBER ONE..INSCRIPTIONAL PAHLAVI NUMBER ONE THOUSAND +10BA9..10BAF ; No # [7] PSALTER PAHLAVI NUMBER ONE..PSALTER PAHLAVI NUMBER ONE HUNDRED +10CFA..10CFF ; No # [6] OLD HUNGARIAN NUMBER ONE..OLD HUNGARIAN NUMBER ONE THOUSAND +10E60..10E7E ; No # [31] RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS +10F1D..10F26 ; No # [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF +10F51..10F54 ; No # [4] SOGDIAN NUMBER ONE..SOGDIAN NUMBER ONE HUNDRED +11052..11065 ; No # [20] BRAHMI NUMBER ONE..BRAHMI NUMBER ONE THOUSAND +111E1..111F4 ; No # [20] SINHALA ARCHAIC DIGIT ONE..SINHALA ARCHAIC NUMBER ONE THOUSAND +1173A..1173B ; No # [2] AHOM NUMBER TEN..AHOM NUMBER TWENTY +118EA..118F2 ; No # [9] WARANG CITI NUMBER TEN..WARANG CITI NUMBER 
NINETY +11C5A..11C6C ; No # [19] BHAIKSUKI NUMBER ONE..BHAIKSUKI HUNDREDS UNIT MARK +11FC0..11FD4 ; No # [21] TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH..TAMIL FRACTION DOWNSCALING FACTOR KIIZH +16B5B..16B61 ; No # [7] PAHAWH HMONG NUMBER TENS..PAHAWH HMONG NUMBER TRILLIONS +16E80..16E96 ; No # [23] MEDEFAIDRIN DIGIT ZERO..MEDEFAIDRIN DIGIT THREE ALTERNATE FORM +1D2E0..1D2F3 ; No # [20] MAYAN NUMERAL ZERO..MAYAN NUMERAL NINETEEN +1D360..1D378 ; No # [25] COUNTING ROD UNIT DIGIT ONE..TALLY MARK FIVE +1E8C7..1E8CF ; No # [9] MENDE KIKAKUI DIGIT ONE..MENDE KIKAKUI DIGIT NINE +1EC71..1ECAB ; No # [59] INDIC SIYAQ NUMBER ONE..INDIC SIYAQ NUMBER PREFIXED NINE +1ECAD..1ECAF ; No # [3] INDIC SIYAQ FRACTION ONE QUARTER..INDIC SIYAQ FRACTION THREE QUARTERS +1ECB1..1ECB4 ; No # [4] INDIC SIYAQ NUMBER ALTERNATE ONE..INDIC SIYAQ ALTERNATE LAKH MARK +1ED01..1ED2D ; No # [45] OTTOMAN SIYAQ NUMBER ONE..OTTOMAN SIYAQ NUMBER NINETY THOUSAND +1ED2F..1ED3D ; No # [15] OTTOMAN SIYAQ ALTERNATE NUMBER TWO..OTTOMAN SIYAQ FRACTION ONE SIXTH +1F100..1F10C ; No # [13] DIGIT ZERO FULL STOP..DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO + +# Total code points: 888 + +# ================================================ + +# General_Category=Space_Separator + +0020 ; Zs # SPACE +00A0 ; Zs # NO-BREAK SPACE +1680 ; Zs # OGHAM SPACE MARK +2000..200A ; Zs # [11] EN QUAD..HAIR SPACE +202F ; Zs # NARROW NO-BREAK SPACE +205F ; Zs # MEDIUM MATHEMATICAL SPACE +3000 ; Zs # IDEOGRAPHIC SPACE + +# Total code points: 17 + +# ================================================ + +# General_Category=Line_Separator + +2028 ; Zl # LINE SEPARATOR + +# Total code points: 1 + +# ================================================ + +# General_Category=Paragraph_Separator + +2029 ; Zp # PARAGRAPH SEPARATOR + +# Total code points: 1 + +# ================================================ + +# General_Category=Control + +0000..001F ; Cc # [32] .. +007F..009F ; Cc # [33] .. 
+ +# Total code points: 65 + +# ================================================ + +# General_Category=Format + +00AD ; Cf # SOFT HYPHEN +0600..0605 ; Cf # [6] ARABIC NUMBER SIGN..ARABIC NUMBER MARK ABOVE +061C ; Cf # ARABIC LETTER MARK +06DD ; Cf # ARABIC END OF AYAH +070F ; Cf # SYRIAC ABBREVIATION MARK +08E2 ; Cf # ARABIC DISPUTED END OF AYAH +180E ; Cf # MONGOLIAN VOWEL SEPARATOR +200B..200F ; Cf # [5] ZERO WIDTH SPACE..RIGHT-TO-LEFT MARK +202A..202E ; Cf # [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE +2060..2064 ; Cf # [5] WORD JOINER..INVISIBLE PLUS +2066..206F ; Cf # [10] LEFT-TO-RIGHT ISOLATE..NOMINAL DIGIT SHAPES +FEFF ; Cf # ZERO WIDTH NO-BREAK SPACE +FFF9..FFFB ; Cf # [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION TERMINATOR +110BD ; Cf # KAITHI NUMBER SIGN +110CD ; Cf # KAITHI NUMBER SIGN ABOVE +13430..13438 ; Cf # [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT +1BCA0..1BCA3 ; Cf # [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP +1D173..1D17A ; Cf # [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE +E0001 ; Cf # LANGUAGE TAG +E0020..E007F ; Cf # [96] TAG SPACE..CANCEL TAG + +# Total code points: 161 + +# ================================================ + +# General_Category=Private_Use + +E000..F8FF ; Co # [6400] .. +F0000..FFFFD ; Co # [65534] .. +100000..10FFFD; Co # [65534] .. + +# Total code points: 137468 + +# ================================================ + +# General_Category=Surrogate + +D800..DFFF ; Cs # [2048] .. 
+ +# Total code points: 2048 + +# ================================================ + +# General_Category=Dash_Punctuation + +002D ; Pd # HYPHEN-MINUS +058A ; Pd # ARMENIAN HYPHEN +05BE ; Pd # HEBREW PUNCTUATION MAQAF +1400 ; Pd # CANADIAN SYLLABICS HYPHEN +1806 ; Pd # MONGOLIAN TODO SOFT HYPHEN +2010..2015 ; Pd # [6] HYPHEN..HORIZONTAL BAR +2E17 ; Pd # DOUBLE OBLIQUE HYPHEN +2E1A ; Pd # HYPHEN WITH DIAERESIS +2E3A..2E3B ; Pd # [2] TWO-EM DASH..THREE-EM DASH +2E40 ; Pd # DOUBLE HYPHEN +301C ; Pd # WAVE DASH +3030 ; Pd # WAVY DASH +30A0 ; Pd # KATAKANA-HIRAGANA DOUBLE HYPHEN +FE31..FE32 ; Pd # [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH +FE58 ; Pd # SMALL EM DASH +FE63 ; Pd # SMALL HYPHEN-MINUS +FF0D ; Pd # FULLWIDTH HYPHEN-MINUS + +# Total code points: 24 + +# ================================================ + +# General_Category=Open_Punctuation + +0028 ; Ps # LEFT PARENTHESIS +005B ; Ps # LEFT SQUARE BRACKET +007B ; Ps # LEFT CURLY BRACKET +0F3A ; Ps # TIBETAN MARK GUG RTAGS GYON +0F3C ; Ps # TIBETAN MARK ANG KHANG GYON +169B ; Ps # OGHAM FEATHER MARK +201A ; Ps # SINGLE LOW-9 QUOTATION MARK +201E ; Ps # DOUBLE LOW-9 QUOTATION MARK +2045 ; Ps # LEFT SQUARE BRACKET WITH QUILL +207D ; Ps # SUPERSCRIPT LEFT PARENTHESIS +208D ; Ps # SUBSCRIPT LEFT PARENTHESIS +2308 ; Ps # LEFT CEILING +230A ; Ps # LEFT FLOOR +2329 ; Ps # LEFT-POINTING ANGLE BRACKET +2768 ; Ps # MEDIUM LEFT PARENTHESIS ORNAMENT +276A ; Ps # MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT +276C ; Ps # MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT +276E ; Ps # HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT +2770 ; Ps # HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT +2772 ; Ps # LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT +2774 ; Ps # MEDIUM LEFT CURLY BRACKET ORNAMENT +27C5 ; Ps # LEFT S-SHAPED BAG DELIMITER +27E6 ; Ps # MATHEMATICAL LEFT WHITE SQUARE BRACKET +27E8 ; Ps # MATHEMATICAL LEFT ANGLE BRACKET +27EA ; Ps # MATHEMATICAL LEFT DOUBLE ANGLE BRACKET +27EC ; Ps # 
MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET +27EE ; Ps # MATHEMATICAL LEFT FLATTENED PARENTHESIS +2983 ; Ps # LEFT WHITE CURLY BRACKET +2985 ; Ps # LEFT WHITE PARENTHESIS +2987 ; Ps # Z NOTATION LEFT IMAGE BRACKET +2989 ; Ps # Z NOTATION LEFT BINDING BRACKET +298B ; Ps # LEFT SQUARE BRACKET WITH UNDERBAR +298D ; Ps # LEFT SQUARE BRACKET WITH TICK IN TOP CORNER +298F ; Ps # LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER +2991 ; Ps # LEFT ANGLE BRACKET WITH DOT +2993 ; Ps # LEFT ARC LESS-THAN BRACKET +2995 ; Ps # DOUBLE LEFT ARC GREATER-THAN BRACKET +2997 ; Ps # LEFT BLACK TORTOISE SHELL BRACKET +29D8 ; Ps # LEFT WIGGLY FENCE +29DA ; Ps # LEFT DOUBLE WIGGLY FENCE +29FC ; Ps # LEFT-POINTING CURVED ANGLE BRACKET +2E22 ; Ps # TOP LEFT HALF BRACKET +2E24 ; Ps # BOTTOM LEFT HALF BRACKET +2E26 ; Ps # LEFT SIDEWAYS U BRACKET +2E28 ; Ps # LEFT DOUBLE PARENTHESIS +2E42 ; Ps # DOUBLE LOW-REVERSED-9 QUOTATION MARK +3008 ; Ps # LEFT ANGLE BRACKET +300A ; Ps # LEFT DOUBLE ANGLE BRACKET +300C ; Ps # LEFT CORNER BRACKET +300E ; Ps # LEFT WHITE CORNER BRACKET +3010 ; Ps # LEFT BLACK LENTICULAR BRACKET +3014 ; Ps # LEFT TORTOISE SHELL BRACKET +3016 ; Ps # LEFT WHITE LENTICULAR BRACKET +3018 ; Ps # LEFT WHITE TORTOISE SHELL BRACKET +301A ; Ps # LEFT WHITE SQUARE BRACKET +301D ; Ps # REVERSED DOUBLE PRIME QUOTATION MARK +FD3F ; Ps # ORNATE RIGHT PARENTHESIS +FE17 ; Ps # PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET +FE35 ; Ps # PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS +FE37 ; Ps # PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET +FE39 ; Ps # PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET +FE3B ; Ps # PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET +FE3D ; Ps # PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET +FE3F ; Ps # PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET +FE41 ; Ps # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET +FE43 ; Ps # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET +FE47 ; Ps # PRESENTATION 
FORM FOR VERTICAL LEFT SQUARE BRACKET +FE59 ; Ps # SMALL LEFT PARENTHESIS +FE5B ; Ps # SMALL LEFT CURLY BRACKET +FE5D ; Ps # SMALL LEFT TORTOISE SHELL BRACKET +FF08 ; Ps # FULLWIDTH LEFT PARENTHESIS +FF3B ; Ps # FULLWIDTH LEFT SQUARE BRACKET +FF5B ; Ps # FULLWIDTH LEFT CURLY BRACKET +FF5F ; Ps # FULLWIDTH LEFT WHITE PARENTHESIS +FF62 ; Ps # HALFWIDTH LEFT CORNER BRACKET + +# Total code points: 75 + +# ================================================ + +# General_Category=Close_Punctuation + +0029 ; Pe # RIGHT PARENTHESIS +005D ; Pe # RIGHT SQUARE BRACKET +007D ; Pe # RIGHT CURLY BRACKET +0F3B ; Pe # TIBETAN MARK GUG RTAGS GYAS +0F3D ; Pe # TIBETAN MARK ANG KHANG GYAS +169C ; Pe # OGHAM REVERSED FEATHER MARK +2046 ; Pe # RIGHT SQUARE BRACKET WITH QUILL +207E ; Pe # SUPERSCRIPT RIGHT PARENTHESIS +208E ; Pe # SUBSCRIPT RIGHT PARENTHESIS +2309 ; Pe # RIGHT CEILING +230B ; Pe # RIGHT FLOOR +232A ; Pe # RIGHT-POINTING ANGLE BRACKET +2769 ; Pe # MEDIUM RIGHT PARENTHESIS ORNAMENT +276B ; Pe # MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT +276D ; Pe # MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT +276F ; Pe # HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT +2771 ; Pe # HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT +2773 ; Pe # LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT +2775 ; Pe # MEDIUM RIGHT CURLY BRACKET ORNAMENT +27C6 ; Pe # RIGHT S-SHAPED BAG DELIMITER +27E7 ; Pe # MATHEMATICAL RIGHT WHITE SQUARE BRACKET +27E9 ; Pe # MATHEMATICAL RIGHT ANGLE BRACKET +27EB ; Pe # MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET +27ED ; Pe # MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET +27EF ; Pe # MATHEMATICAL RIGHT FLATTENED PARENTHESIS +2984 ; Pe # RIGHT WHITE CURLY BRACKET +2986 ; Pe # RIGHT WHITE PARENTHESIS +2988 ; Pe # Z NOTATION RIGHT IMAGE BRACKET +298A ; Pe # Z NOTATION RIGHT BINDING BRACKET +298C ; Pe # RIGHT SQUARE BRACKET WITH UNDERBAR +298E ; Pe # RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER +2990 ; Pe # RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER +2992 ; Pe # RIGHT ANGLE 
BRACKET WITH DOT +2994 ; Pe # RIGHT ARC GREATER-THAN BRACKET +2996 ; Pe # DOUBLE RIGHT ARC LESS-THAN BRACKET +2998 ; Pe # RIGHT BLACK TORTOISE SHELL BRACKET +29D9 ; Pe # RIGHT WIGGLY FENCE +29DB ; Pe # RIGHT DOUBLE WIGGLY FENCE +29FD ; Pe # RIGHT-POINTING CURVED ANGLE BRACKET +2E23 ; Pe # TOP RIGHT HALF BRACKET +2E25 ; Pe # BOTTOM RIGHT HALF BRACKET +2E27 ; Pe # RIGHT SIDEWAYS U BRACKET +2E29 ; Pe # RIGHT DOUBLE PARENTHESIS +3009 ; Pe # RIGHT ANGLE BRACKET +300B ; Pe # RIGHT DOUBLE ANGLE BRACKET +300D ; Pe # RIGHT CORNER BRACKET +300F ; Pe # RIGHT WHITE CORNER BRACKET +3011 ; Pe # RIGHT BLACK LENTICULAR BRACKET +3015 ; Pe # RIGHT TORTOISE SHELL BRACKET +3017 ; Pe # RIGHT WHITE LENTICULAR BRACKET +3019 ; Pe # RIGHT WHITE TORTOISE SHELL BRACKET +301B ; Pe # RIGHT WHITE SQUARE BRACKET +301E..301F ; Pe # [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK +FD3E ; Pe # ORNATE LEFT PARENTHESIS +FE18 ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET +FE36 ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS +FE38 ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET +FE3A ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET +FE3C ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET +FE3E ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET +FE40 ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET +FE42 ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET +FE44 ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET +FE48 ; Pe # PRESENTATION FORM FOR VERTICAL RIGHT SQUARE BRACKET +FE5A ; Pe # SMALL RIGHT PARENTHESIS +FE5C ; Pe # SMALL RIGHT CURLY BRACKET +FE5E ; Pe # SMALL RIGHT TORTOISE SHELL BRACKET +FF09 ; Pe # FULLWIDTH RIGHT PARENTHESIS +FF3D ; Pe # FULLWIDTH RIGHT SQUARE BRACKET +FF5D ; Pe # FULLWIDTH RIGHT CURLY BRACKET +FF60 ; Pe # FULLWIDTH RIGHT WHITE PARENTHESIS +FF63 ; Pe # HALFWIDTH RIGHT CORNER BRACKET + +# Total code points: 73 + +# 
================================================ + +# General_Category=Connector_Punctuation + +005F ; Pc # LOW LINE +203F..2040 ; Pc # [2] UNDERTIE..CHARACTER TIE +2054 ; Pc # INVERTED UNDERTIE +FE33..FE34 ; Pc # [2] PRESENTATION FORM FOR VERTICAL LOW LINE..PRESENTATION FORM FOR VERTICAL WAVY LOW LINE +FE4D..FE4F ; Pc # [3] DASHED LOW LINE..WAVY LOW LINE +FF3F ; Pc # FULLWIDTH LOW LINE + +# Total code points: 10 + +# ================================================ + +# General_Category=Other_Punctuation + +0021..0023 ; Po # [3] EXCLAMATION MARK..NUMBER SIGN +0025..0027 ; Po # [3] PERCENT SIGN..APOSTROPHE +002A ; Po # ASTERISK +002C ; Po # COMMA +002E..002F ; Po # [2] FULL STOP..SOLIDUS +003A..003B ; Po # [2] COLON..SEMICOLON +003F..0040 ; Po # [2] QUESTION MARK..COMMERCIAL AT +005C ; Po # REVERSE SOLIDUS +00A1 ; Po # INVERTED EXCLAMATION MARK +00A7 ; Po # SECTION SIGN +00B6..00B7 ; Po # [2] PILCROW SIGN..MIDDLE DOT +00BF ; Po # INVERTED QUESTION MARK +037E ; Po # GREEK QUESTION MARK +0387 ; Po # GREEK ANO TELEIA +055A..055F ; Po # [6] ARMENIAN APOSTROPHE..ARMENIAN ABBREVIATION MARK +0589 ; Po # ARMENIAN FULL STOP +05C0 ; Po # HEBREW PUNCTUATION PASEQ +05C3 ; Po # HEBREW PUNCTUATION SOF PASUQ +05C6 ; Po # HEBREW PUNCTUATION NUN HAFUKHA +05F3..05F4 ; Po # [2] HEBREW PUNCTUATION GERESH..HEBREW PUNCTUATION GERSHAYIM +0609..060A ; Po # [2] ARABIC-INDIC PER MILLE SIGN..ARABIC-INDIC PER TEN THOUSAND SIGN +060C..060D ; Po # [2] ARABIC COMMA..ARABIC DATE SEPARATOR +061B ; Po # ARABIC SEMICOLON +061E..061F ; Po # [2] ARABIC TRIPLE DOT PUNCTUATION MARK..ARABIC QUESTION MARK +066A..066D ; Po # [4] ARABIC PERCENT SIGN..ARABIC FIVE POINTED STAR +06D4 ; Po # ARABIC FULL STOP +0700..070D ; Po # [14] SYRIAC END OF PARAGRAPH..SYRIAC HARKLEAN ASTERISCUS +07F7..07F9 ; Po # [3] NKO SYMBOL GBAKURUNEN..NKO EXCLAMATION MARK +0830..083E ; Po # [15] SAMARITAN PUNCTUATION NEQUDAA..SAMARITAN PUNCTUATION ANNAAU +085E ; Po # MANDAIC PUNCTUATION +0964..0965 ; Po # [2] DEVANAGARI 
DANDA..DEVANAGARI DOUBLE DANDA +0970 ; Po # DEVANAGARI ABBREVIATION SIGN +09FD ; Po # BENGALI ABBREVIATION SIGN +0A76 ; Po # GURMUKHI ABBREVIATION SIGN +0AF0 ; Po # GUJARATI ABBREVIATION SIGN +0C77 ; Po # TELUGU SIGN SIDDHAM +0C84 ; Po # KANNADA SIGN SIDDHAM +0DF4 ; Po # SINHALA PUNCTUATION KUNDDALIYA +0E4F ; Po # THAI CHARACTER FONGMAN +0E5A..0E5B ; Po # [2] THAI CHARACTER ANGKHANKHU..THAI CHARACTER KHOMUT +0F04..0F12 ; Po # [15] TIBETAN MARK INITIAL YIG MGO MDUN MA..TIBETAN MARK RGYA GRAM SHAD +0F14 ; Po # TIBETAN MARK GTER TSHEG +0F85 ; Po # TIBETAN MARK PALUTA +0FD0..0FD4 ; Po # [5] TIBETAN MARK BSKA- SHOG GI MGO RGYAN..TIBETAN MARK CLOSING BRDA RNYING YIG MGO SGAB MA +0FD9..0FDA ; Po # [2] TIBETAN MARK LEADING MCHAN RTAGS..TIBETAN MARK TRAILING MCHAN RTAGS +104A..104F ; Po # [6] MYANMAR SIGN LITTLE SECTION..MYANMAR SYMBOL GENITIVE +10FB ; Po # GEORGIAN PARAGRAPH SEPARATOR +1360..1368 ; Po # [9] ETHIOPIC SECTION MARK..ETHIOPIC PARAGRAPH SEPARATOR +166E ; Po # CANADIAN SYLLABICS FULL STOP +16EB..16ED ; Po # [3] RUNIC SINGLE PUNCTUATION..RUNIC CROSS PUNCTUATION +1735..1736 ; Po # [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION +17D4..17D6 ; Po # [3] KHMER SIGN KHAN..KHMER SIGN CAMNUC PII KUUH +17D8..17DA ; Po # [3] KHMER SIGN BEYYAL..KHMER SIGN KOOMUUT +1800..1805 ; Po # [6] MONGOLIAN BIRGA..MONGOLIAN FOUR DOTS +1807..180A ; Po # [4] MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER..MONGOLIAN NIRUGU +1944..1945 ; Po # [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK +1A1E..1A1F ; Po # [2] BUGINESE PALLAWA..BUGINESE END OF SECTION +1AA0..1AA6 ; Po # [7] TAI THAM SIGN WIANG..TAI THAM SIGN REVERSED ROTATED RANA +1AA8..1AAD ; Po # [6] TAI THAM SIGN KAAN..TAI THAM SIGN CAANG +1B5A..1B60 ; Po # [7] BALINESE PANTI..BALINESE PAMENENG +1BFC..1BFF ; Po # [4] BATAK SYMBOL BINDU NA METEK..BATAK SYMBOL BINDU PANGOLAT +1C3B..1C3F ; Po # [5] LEPCHA PUNCTUATION TA-ROL..LEPCHA PUNCTUATION TSHOOK +1C7E..1C7F ; Po # [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION 
DOUBLE MUCAAD +1CC0..1CC7 ; Po # [8] SUNDANESE PUNCTUATION BINDU SURYA..SUNDANESE PUNCTUATION BINDU BA SATANGA +1CD3 ; Po # VEDIC SIGN NIHSHVASA +2016..2017 ; Po # [2] DOUBLE VERTICAL LINE..DOUBLE LOW LINE +2020..2027 ; Po # [8] DAGGER..HYPHENATION POINT +2030..2038 ; Po # [9] PER MILLE SIGN..CARET +203B..203E ; Po # [4] REFERENCE MARK..OVERLINE +2041..2043 ; Po # [3] CARET INSERTION POINT..HYPHEN BULLET +2047..2051 ; Po # [11] DOUBLE QUESTION MARK..TWO ASTERISKS ALIGNED VERTICALLY +2053 ; Po # SWUNG DASH +2055..205E ; Po # [10] FLOWER PUNCTUATION MARK..VERTICAL FOUR DOTS +2CF9..2CFC ; Po # [4] COPTIC OLD NUBIAN FULL STOP..COPTIC OLD NUBIAN VERSE DIVIDER +2CFE..2CFF ; Po # [2] COPTIC FULL STOP..COPTIC MORPHOLOGICAL DIVIDER +2D70 ; Po # TIFINAGH SEPARATOR MARK +2E00..2E01 ; Po # [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER +2E06..2E08 ; Po # [3] RAISED INTERPOLATION MARKER..DOTTED TRANSPOSITION MARKER +2E0B ; Po # RAISED SQUARE +2E0E..2E16 ; Po # [9] EDITORIAL CORONIS..DOTTED RIGHT-POINTING ANGLE +2E18..2E19 ; Po # [2] INVERTED INTERROBANG..PALM BRANCH +2E1B ; Po # TILDE WITH RING ABOVE +2E1E..2E1F ; Po # [2] TILDE WITH DOT ABOVE..TILDE WITH DOT BELOW +2E2A..2E2E ; Po # [5] TWO DOTS OVER ONE DOT PUNCTUATION..REVERSED QUESTION MARK +2E30..2E39 ; Po # [10] RING POINT..TOP HALF SECTION SIGN +2E3C..2E3F ; Po # [4] STENOGRAPHIC FULL STOP..CAPITULUM +2E41 ; Po # REVERSED COMMA +2E43..2E4F ; Po # [13] DASH WITH LEFT UPTURN..CORNISH VERSE DIVIDER +3001..3003 ; Po # [3] IDEOGRAPHIC COMMA..DITTO MARK +303D ; Po # PART ALTERNATION MARK +30FB ; Po # KATAKANA MIDDLE DOT +A4FE..A4FF ; Po # [2] LISU PUNCTUATION COMMA..LISU PUNCTUATION FULL STOP +A60D..A60F ; Po # [3] VAI COMMA..VAI QUESTION MARK +A673 ; Po # SLAVONIC ASTERISK +A67E ; Po # CYRILLIC KAVYKA +A6F2..A6F7 ; Po # [6] BAMUM NJAEMLI..BAMUM QUESTION MARK +A874..A877 ; Po # [4] PHAGS-PA SINGLE HEAD MARK..PHAGS-PA MARK DOUBLE SHAD +A8CE..A8CF ; Po # [2] SAURASHTRA DANDA..SAURASHTRA DOUBLE DANDA 
+A8F8..A8FA ; Po # [3] DEVANAGARI SIGN PUSHPIKA..DEVANAGARI CARET +A8FC ; Po # DEVANAGARI SIGN SIDDHAM +A92E..A92F ; Po # [2] KAYAH LI SIGN CWI..KAYAH LI SIGN SHYA +A95F ; Po # REJANG SECTION MARK +A9C1..A9CD ; Po # [13] JAVANESE LEFT RERENGGAN..JAVANESE TURNED PADA PISELEH +A9DE..A9DF ; Po # [2] JAVANESE PADA TIRTA TUMETES..JAVANESE PADA ISEN-ISEN +AA5C..AA5F ; Po # [4] CHAM PUNCTUATION SPIRAL..CHAM PUNCTUATION TRIPLE DANDA +AADE..AADF ; Po # [2] TAI VIET SYMBOL HO HOI..TAI VIET SYMBOL KOI KOI +AAF0..AAF1 ; Po # [2] MEETEI MAYEK CHEIKHAN..MEETEI MAYEK AHANG KHUDAM +ABEB ; Po # MEETEI MAYEK CHEIKHEI +FE10..FE16 ; Po # [7] PRESENTATION FORM FOR VERTICAL COMMA..PRESENTATION FORM FOR VERTICAL QUESTION MARK +FE19 ; Po # PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS +FE30 ; Po # PRESENTATION FORM FOR VERTICAL TWO DOT LEADER +FE45..FE46 ; Po # [2] SESAME DOT..WHITE SESAME DOT +FE49..FE4C ; Po # [4] DASHED OVERLINE..DOUBLE WAVY OVERLINE +FE50..FE52 ; Po # [3] SMALL COMMA..SMALL FULL STOP +FE54..FE57 ; Po # [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK +FE5F..FE61 ; Po # [3] SMALL NUMBER SIGN..SMALL ASTERISK +FE68 ; Po # SMALL REVERSE SOLIDUS +FE6A..FE6B ; Po # [2] SMALL PERCENT SIGN..SMALL COMMERCIAL AT +FF01..FF03 ; Po # [3] FULLWIDTH EXCLAMATION MARK..FULLWIDTH NUMBER SIGN +FF05..FF07 ; Po # [3] FULLWIDTH PERCENT SIGN..FULLWIDTH APOSTROPHE +FF0A ; Po # FULLWIDTH ASTERISK +FF0C ; Po # FULLWIDTH COMMA +FF0E..FF0F ; Po # [2] FULLWIDTH FULL STOP..FULLWIDTH SOLIDUS +FF1A..FF1B ; Po # [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON +FF1F..FF20 ; Po # [2] FULLWIDTH QUESTION MARK..FULLWIDTH COMMERCIAL AT +FF3C ; Po # FULLWIDTH REVERSE SOLIDUS +FF61 ; Po # HALFWIDTH IDEOGRAPHIC FULL STOP +FF64..FF65 ; Po # [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDLE DOT +10100..10102 ; Po # [3] AEGEAN WORD SEPARATOR LINE..AEGEAN CHECK MARK +1039F ; Po # UGARITIC WORD DIVIDER +103D0 ; Po # OLD PERSIAN WORD DIVIDER +1056F ; Po # CAUCASIAN ALBANIAN CITATION MARK +10857 ; Po # IMPERIAL 
ARAMAIC SECTION SIGN +1091F ; Po # PHOENICIAN WORD SEPARATOR +1093F ; Po # LYDIAN TRIANGULAR MARK +10A50..10A58 ; Po # [9] KHAROSHTHI PUNCTUATION DOT..KHAROSHTHI PUNCTUATION LINES +10A7F ; Po # OLD SOUTH ARABIAN NUMERIC INDICATOR +10AF0..10AF6 ; Po # [7] MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION LINE FILLER +10B39..10B3F ; Po # [7] AVESTAN ABBREVIATION MARK..LARGE ONE RING OVER TWO RINGS PUNCTUATION +10B99..10B9C ; Po # [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT +10F55..10F59 ; Po # [5] SOGDIAN PUNCTUATION TWO VERTICAL BARS..SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT +11047..1104D ; Po # [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS +110BB..110BC ; Po # [2] KAITHI ABBREVIATION SIGN..KAITHI ENUMERATION SIGN +110BE..110C1 ; Po # [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA +11140..11143 ; Po # [4] CHAKMA SECTION MARK..CHAKMA QUESTION MARK +11174..11175 ; Po # [2] MAHAJANI ABBREVIATION SIGN..MAHAJANI SECTION MARK +111C5..111C8 ; Po # [4] SHARADA DANDA..SHARADA SEPARATOR +111CD ; Po # SHARADA SUTRA MARK +111DB ; Po # SHARADA SIGN SIDDHAM +111DD..111DF ; Po # [3] SHARADA CONTINUATION SIGN..SHARADA SECTION MARK-2 +11238..1123D ; Po # [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN +112A9 ; Po # MULTANI SECTION MARK +1144B..1144F ; Po # [5] NEWA DANDA..NEWA ABBREVIATION SIGN +1145B ; Po # NEWA PLACEHOLDER MARK +1145D ; Po # NEWA INSERTION SIGN +114C6 ; Po # TIRHUTA ABBREVIATION SIGN +115C1..115D7 ; Po # [23] SIDDHAM SIGN SIDDHAM..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES +11641..11643 ; Po # [3] MODI DANDA..MODI ABBREVIATION SIGN +11660..1166C ; Po # [13] MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT +1173C..1173E ; Po # [3] AHOM SIGN SMALL SECTION..AHOM SIGN RULAI +1183B ; Po # DOGRA ABBREVIATION SIGN +119E2 ; Po # NANDINAGARI SIGN SIDDHAM +11A3F..11A46 ; Po # [8] ZANABAZAR SQUARE INITIAL HEAD MARK..ZANABAZAR SQUARE CLOSING DOUBLE-LINED HEAD MARK +11A9A..11A9C ; Po # [3] SOYOMBO MARK TSHEG..SOYOMBO 
MARK DOUBLE SHAD +11A9E..11AA2 ; Po # [5] SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME..SOYOMBO TERMINAL MARK-2 +11C41..11C45 ; Po # [5] BHAIKSUKI DANDA..BHAIKSUKI GAP FILLER-2 +11C70..11C71 ; Po # [2] MARCHEN HEAD MARK..MARCHEN MARK SHAD +11EF7..11EF8 ; Po # [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION +11FFF ; Po # TAMIL PUNCTUATION END OF TEXT +12470..12474 ; Po # [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON +16A6E..16A6F ; Po # [2] MRO DANDA..MRO DOUBLE DANDA +16AF5 ; Po # BASSA VAH FULL STOP +16B37..16B3B ; Po # [5] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS FEEM +16B44 ; Po # PAHAWH HMONG SIGN XAUS +16E97..16E9A ; Po # [4] MEDEFAIDRIN COMMA..MEDEFAIDRIN EXCLAMATION OH +16FE2 ; Po # OLD CHINESE HOOK MARK +1BC9F ; Po # DUPLOYAN PUNCTUATION CHINOOK FULL STOP +1DA87..1DA8B ; Po # [5] SIGNWRITING COMMA..SIGNWRITING PARENTHESIS +1E95E..1E95F ; Po # [2] ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK + +# Total code points: 588 + +# ================================================ + +# General_Category=Math_Symbol + +002B ; Sm # PLUS SIGN +003C..003E ; Sm # [3] LESS-THAN SIGN..GREATER-THAN SIGN +007C ; Sm # VERTICAL LINE +007E ; Sm # TILDE +00AC ; Sm # NOT SIGN +00B1 ; Sm # PLUS-MINUS SIGN +00D7 ; Sm # MULTIPLICATION SIGN +00F7 ; Sm # DIVISION SIGN +03F6 ; Sm # GREEK REVERSED LUNATE EPSILON SYMBOL +0606..0608 ; Sm # [3] ARABIC-INDIC CUBE ROOT..ARABIC RAY +2044 ; Sm # FRACTION SLASH +2052 ; Sm # COMMERCIAL MINUS SIGN +207A..207C ; Sm # [3] SUPERSCRIPT PLUS SIGN..SUPERSCRIPT EQUALS SIGN +208A..208C ; Sm # [3] SUBSCRIPT PLUS SIGN..SUBSCRIPT EQUALS SIGN +2118 ; Sm # SCRIPT CAPITAL P +2140..2144 ; Sm # [5] DOUBLE-STRUCK N-ARY SUMMATION..TURNED SANS-SERIF CAPITAL Y +214B ; Sm # TURNED AMPERSAND +2190..2194 ; Sm # [5] LEFTWARDS ARROW..LEFT RIGHT ARROW +219A..219B ; Sm # [2] LEFTWARDS ARROW WITH STROKE..RIGHTWARDS ARROW WITH STROKE +21A0 ; Sm # RIGHTWARDS TWO HEADED ARROW +21A3 ; Sm # 
RIGHTWARDS ARROW WITH TAIL +21A6 ; Sm # RIGHTWARDS ARROW FROM BAR +21AE ; Sm # LEFT RIGHT ARROW WITH STROKE +21CE..21CF ; Sm # [2] LEFT RIGHT DOUBLE ARROW WITH STROKE..RIGHTWARDS DOUBLE ARROW WITH STROKE +21D2 ; Sm # RIGHTWARDS DOUBLE ARROW +21D4 ; Sm # LEFT RIGHT DOUBLE ARROW +21F4..22FF ; Sm # [268] RIGHT ARROW WITH SMALL CIRCLE..Z NOTATION BAG MEMBERSHIP +2320..2321 ; Sm # [2] TOP HALF INTEGRAL..BOTTOM HALF INTEGRAL +237C ; Sm # RIGHT ANGLE WITH DOWNWARDS ZIGZAG ARROW +239B..23B3 ; Sm # [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM +23DC..23E1 ; Sm # [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET +25B7 ; Sm # WHITE RIGHT-POINTING TRIANGLE +25C1 ; Sm # WHITE LEFT-POINTING TRIANGLE +25F8..25FF ; Sm # [8] UPPER LEFT TRIANGLE..LOWER RIGHT TRIANGLE +266F ; Sm # MUSIC SHARP SIGN +27C0..27C4 ; Sm # [5] THREE DIMENSIONAL ANGLE..OPEN SUPERSET +27C7..27E5 ; Sm # [31] OR WITH DOT INSIDE..WHITE SQUARE WITH RIGHTWARDS TICK +27F0..27FF ; Sm # [16] UPWARDS QUADRUPLE ARROW..LONG RIGHTWARDS SQUIGGLE ARROW +2900..2982 ; Sm # [131] RIGHTWARDS TWO-HEADED ARROW WITH VERTICAL STROKE..Z NOTATION TYPE COLON +2999..29D7 ; Sm # [63] DOTTED FENCE..BLACK HOURGLASS +29DC..29FB ; Sm # [32] INCOMPLETE INFINITY..TRIPLE PLUS +29FE..2AFF ; Sm # [258] TINY..N-ARY WHITE VERTICAL BAR +2B30..2B44 ; Sm # [21] LEFT ARROW WITH SMALL CIRCLE..RIGHTWARDS ARROW THROUGH SUPERSET +2B47..2B4C ; Sm # [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR +FB29 ; Sm # HEBREW LETTER ALTERNATIVE PLUS SIGN +FE62 ; Sm # SMALL PLUS SIGN +FE64..FE66 ; Sm # [3] SMALL LESS-THAN SIGN..SMALL EQUALS SIGN +FF0B ; Sm # FULLWIDTH PLUS SIGN +FF1C..FF1E ; Sm # [3] FULLWIDTH LESS-THAN SIGN..FULLWIDTH GREATER-THAN SIGN +FF5C ; Sm # FULLWIDTH VERTICAL LINE +FF5E ; Sm # FULLWIDTH TILDE +FFE2 ; Sm # FULLWIDTH NOT SIGN +FFE9..FFEC ; Sm # [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS ARROW +1D6C1 ; Sm # MATHEMATICAL BOLD NABLA +1D6DB ; Sm # MATHEMATICAL BOLD PARTIAL DIFFERENTIAL 
+1D6FB ; Sm # MATHEMATICAL ITALIC NABLA +1D715 ; Sm # MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL +1D735 ; Sm # MATHEMATICAL BOLD ITALIC NABLA +1D74F ; Sm # MATHEMATICAL BOLD ITALIC PARTIAL DIFFERENTIAL +1D76F ; Sm # MATHEMATICAL SANS-SERIF BOLD NABLA +1D789 ; Sm # MATHEMATICAL SANS-SERIF BOLD PARTIAL DIFFERENTIAL +1D7A9 ; Sm # MATHEMATICAL SANS-SERIF BOLD ITALIC NABLA +1D7C3 ; Sm # MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL +1EEF0..1EEF1 ; Sm # [2] ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL..ARABIC MATHEMATICAL OPERATOR HAH WITH DAL + +# Total code points: 948 + +# ================================================ + +# General_Category=Currency_Symbol + +0024 ; Sc # DOLLAR SIGN +00A2..00A5 ; Sc # [4] CENT SIGN..YEN SIGN +058F ; Sc # ARMENIAN DRAM SIGN +060B ; Sc # AFGHANI SIGN +07FE..07FF ; Sc # [2] NKO DOROME SIGN..NKO TAMAN SIGN +09F2..09F3 ; Sc # [2] BENGALI RUPEE MARK..BENGALI RUPEE SIGN +09FB ; Sc # BENGALI GANDA MARK +0AF1 ; Sc # GUJARATI RUPEE SIGN +0BF9 ; Sc # TAMIL RUPEE SIGN +0E3F ; Sc # THAI CURRENCY SYMBOL BAHT +17DB ; Sc # KHMER CURRENCY SYMBOL RIEL +20A0..20BF ; Sc # [32] EURO-CURRENCY SIGN..BITCOIN SIGN +A838 ; Sc # NORTH INDIC RUPEE MARK +FDFC ; Sc # RIAL SIGN +FE69 ; Sc # SMALL DOLLAR SIGN +FF04 ; Sc # FULLWIDTH DOLLAR SIGN +FFE0..FFE1 ; Sc # [2] FULLWIDTH CENT SIGN..FULLWIDTH POUND SIGN +FFE5..FFE6 ; Sc # [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN +11FDD..11FE0 ; Sc # [4] TAMIL SIGN KAACU..TAMIL SIGN VARAAKAN +1E2FF ; Sc # WANCHO NGUN SIGN +1ECB0 ; Sc # INDIC SIYAQ RUPEE MARK + +# Total code points: 62 + +# ================================================ + +# General_Category=Modifier_Symbol + +005E ; Sk # CIRCUMFLEX ACCENT +0060 ; Sk # GRAVE ACCENT +00A8 ; Sk # DIAERESIS +00AF ; Sk # MACRON +00B4 ; Sk # ACUTE ACCENT +00B8 ; Sk # CEDILLA +02C2..02C5 ; Sk # [4] MODIFIER LETTER LEFT ARROWHEAD..MODIFIER LETTER DOWN ARROWHEAD +02D2..02DF ; Sk # [14] MODIFIER LETTER CENTRED RIGHT HALF RING..MODIFIER LETTER CROSS ACCENT 
+02E5..02EB ; Sk # [7] MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER YANG DEPARTING TONE MARK +02ED ; Sk # MODIFIER LETTER UNASPIRATED +02EF..02FF ; Sk # [17] MODIFIER LETTER LOW DOWN ARROWHEAD..MODIFIER LETTER LOW LEFT ARROW +0375 ; Sk # GREEK LOWER NUMERAL SIGN +0384..0385 ; Sk # [2] GREEK TONOS..GREEK DIALYTIKA TONOS +1FBD ; Sk # GREEK KORONIS +1FBF..1FC1 ; Sk # [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI +1FCD..1FCF ; Sk # [3] GREEK PSILI AND VARIA..GREEK PSILI AND PERISPOMENI +1FDD..1FDF ; Sk # [3] GREEK DASIA AND VARIA..GREEK DASIA AND PERISPOMENI +1FED..1FEF ; Sk # [3] GREEK DIALYTIKA AND VARIA..GREEK VARIA +1FFD..1FFE ; Sk # [2] GREEK OXIA..GREEK DASIA +309B..309C ; Sk # [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK +A700..A716 ; Sk # [23] MODIFIER LETTER CHINESE TONE YIN PING..MODIFIER LETTER EXTRA-LOW LEFT-STEM TONE BAR +A720..A721 ; Sk # [2] MODIFIER LETTER STRESS AND HIGH TONE..MODIFIER LETTER STRESS AND LOW TONE +A789..A78A ; Sk # [2] MODIFIER LETTER COLON..MODIFIER LETTER SHORT EQUALS SIGN +AB5B ; Sk # MODIFIER BREVE WITH INVERTED BREVE +FBB2..FBC1 ; Sk # [16] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL SMALL TAH BELOW +FF3E ; Sk # FULLWIDTH CIRCUMFLEX ACCENT +FF40 ; Sk # FULLWIDTH GRAVE ACCENT +FFE3 ; Sk # FULLWIDTH MACRON +1F3FB..1F3FF ; Sk # [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6 + +# Total code points: 121 + +# ================================================ + +# General_Category=Other_Symbol + +00A6 ; So # BROKEN BAR +00A9 ; So # COPYRIGHT SIGN +00AE ; So # REGISTERED SIGN +00B0 ; So # DEGREE SIGN +0482 ; So # CYRILLIC THOUSANDS SIGN +058D..058E ; So # [2] RIGHT-FACING ARMENIAN ETERNITY SIGN..LEFT-FACING ARMENIAN ETERNITY SIGN +060E..060F ; So # [2] ARABIC POETIC VERSE SIGN..ARABIC SIGN MISRA +06DE ; So # ARABIC START OF RUB EL HIZB +06E9 ; So # ARABIC PLACE OF SAJDAH +06FD..06FE ; So # [2] ARABIC SIGN SINDHI AMPERSAND..ARABIC SIGN SINDHI POSTPOSITION MEN +07F6 ; So # 
NKO SYMBOL OO DENNEN +09FA ; So # BENGALI ISSHAR +0B70 ; So # ORIYA ISSHAR +0BF3..0BF8 ; So # [6] TAMIL DAY SIGN..TAMIL AS ABOVE SIGN +0BFA ; So # TAMIL NUMBER SIGN +0C7F ; So # TELUGU SIGN TUUMU +0D4F ; So # MALAYALAM SIGN PARA +0D79 ; So # MALAYALAM DATE MARK +0F01..0F03 ; So # [3] TIBETAN MARK GTER YIG MGO TRUNCATED A..TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA +0F13 ; So # TIBETAN MARK CARET -DZUD RTAGS ME LONG CAN +0F15..0F17 ; So # [3] TIBETAN LOGOTYPE SIGN CHAD RTAGS..TIBETAN ASTROLOGICAL SIGN SGRA GCAN -CHAR RTAGS +0F1A..0F1F ; So # [6] TIBETAN SIGN RDEL DKAR GCIG..TIBETAN SIGN RDEL DKAR RDEL NAG +0F34 ; So # TIBETAN MARK BSDUS RTAGS +0F36 ; So # TIBETAN MARK CARET -DZUD RTAGS BZHI MIG CAN +0F38 ; So # TIBETAN MARK CHE MGO +0FBE..0FC5 ; So # [8] TIBETAN KU RU KHA..TIBETAN SYMBOL RDO RJE +0FC7..0FCC ; So # [6] TIBETAN SYMBOL RDO RJE RGYA GRAM..TIBETAN SYMBOL NOR BU BZHI -KHYIL +0FCE..0FCF ; So # [2] TIBETAN SIGN RDEL NAG RDEL DKAR..TIBETAN SIGN RDEL NAG GSUM +0FD5..0FD8 ; So # [4] RIGHT-FACING SVASTI SIGN..LEFT-FACING SVASTI SIGN WITH DOTS +109E..109F ; So # [2] MYANMAR SYMBOL SHAN ONE..MYANMAR SYMBOL SHAN EXCLAMATION +1390..1399 ; So # [10] ETHIOPIC TONAL MARK YIZET..ETHIOPIC TONAL MARK KURT +166D ; So # CANADIAN SYLLABICS CHI SIGN +1940 ; So # LIMBU SIGN LOO +19DE..19FF ; So # [34] NEW TAI LUE SIGN LAE..KHMER SYMBOL DAP-PRAM ROC +1B61..1B6A ; So # [10] BALINESE MUSICAL SYMBOL DONG..BALINESE MUSICAL SYMBOL DANG GEDE +1B74..1B7C ; So # [9] BALINESE MUSICAL SYMBOL RIGHT-HAND OPEN DUG..BALINESE MUSICAL SYMBOL LEFT-HAND OPEN PING +2100..2101 ; So # [2] ACCOUNT OF..ADDRESSED TO THE SUBJECT +2103..2106 ; So # [4] DEGREE CELSIUS..CADA UNA +2108..2109 ; So # [2] SCRUPLE..DEGREE FAHRENHEIT +2114 ; So # L B BAR SYMBOL +2116..2117 ; So # [2] NUMERO SIGN..SOUND RECORDING COPYRIGHT +211E..2123 ; So # [6] PRESCRIPTION TAKE..VERSICLE +2125 ; So # OUNCE SIGN +2127 ; So # INVERTED OHM SIGN +2129 ; So # TURNED GREEK SMALL LETTER IOTA +212E ; So # ESTIMATED SYMBOL 
+213A..213B ; So # [2] ROTATED CAPITAL Q..FACSIMILE SIGN +214A ; So # PROPERTY LINE +214C..214D ; So # [2] PER SIGN..AKTIESELSKAB +214F ; So # SYMBOL FOR SAMARITAN SOURCE +218A..218B ; So # [2] TURNED DIGIT TWO..TURNED DIGIT THREE +2195..2199 ; So # [5] UP DOWN ARROW..SOUTH WEST ARROW +219C..219F ; So # [4] LEFTWARDS WAVE ARROW..UPWARDS TWO HEADED ARROW +21A1..21A2 ; So # [2] DOWNWARDS TWO HEADED ARROW..LEFTWARDS ARROW WITH TAIL +21A4..21A5 ; So # [2] LEFTWARDS ARROW FROM BAR..UPWARDS ARROW FROM BAR +21A7..21AD ; So # [7] DOWNWARDS ARROW FROM BAR..LEFT RIGHT WAVE ARROW +21AF..21CD ; So # [31] DOWNWARDS ZIGZAG ARROW..LEFTWARDS DOUBLE ARROW WITH STROKE +21D0..21D1 ; So # [2] LEFTWARDS DOUBLE ARROW..UPWARDS DOUBLE ARROW +21D3 ; So # DOWNWARDS DOUBLE ARROW +21D5..21F3 ; So # [31] UP DOWN DOUBLE ARROW..UP DOWN WHITE ARROW +2300..2307 ; So # [8] DIAMETER SIGN..WAVY LINE +230C..231F ; So # [20] BOTTOM RIGHT CROP..BOTTOM RIGHT CORNER +2322..2328 ; So # [7] FROWN..KEYBOARD +232B..237B ; So # [81] ERASE TO THE LEFT..NOT CHECK MARK +237D..239A ; So # [30] SHOULDERED OPEN BOX..CLEAR SCREEN SYMBOL +23B4..23DB ; So # [40] TOP SQUARE BRACKET..FUSE +23E2..2426 ; So # [69] WHITE TRAPEZIUM..SYMBOL FOR SUBSTITUTE FORM TWO +2440..244A ; So # [11] OCR HOOK..OCR DOUBLE BACKSLASH +249C..24E9 ; So # [78] PARENTHESIZED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z +2500..25B6 ; So # [183] BOX DRAWINGS LIGHT HORIZONTAL..BLACK RIGHT-POINTING TRIANGLE +25B8..25C0 ; So # [9] BLACK RIGHT-POINTING SMALL TRIANGLE..BLACK LEFT-POINTING TRIANGLE +25C2..25F7 ; So # [54] BLACK LEFT-POINTING SMALL TRIANGLE..WHITE CIRCLE WITH UPPER RIGHT QUADRANT +2600..266E ; So # [111] BLACK SUN WITH RAYS..MUSIC NATURAL SIGN +2670..2767 ; So # [248] WEST SYRIAC CROSS..ROTATED FLORAL HEART BULLET +2794..27BF ; So # [44] HEAVY WIDE-HEADED RIGHTWARDS ARROW..DOUBLE CURLY LOOP +2800..28FF ; So # [256] BRAILLE PATTERN BLANK..BRAILLE PATTERN DOTS-12345678 +2B00..2B2F ; So # [48] NORTH EAST WHITE ARROW..WHITE VERTICAL 
ELLIPSE +2B45..2B46 ; So # [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW +2B4D..2B73 ; So # [39] DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW..DOWNWARDS TRIANGLE-HEADED ARROW TO BAR +2B76..2B95 ; So # [32] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..RIGHTWARDS BLACK ARROW +2B98..2BFF ; So # [104] THREE-D TOP-LIGHTED LEFTWARDS EQUILATERAL ARROWHEAD..HELLSCHREIBER PAUSE SYMBOL +2CE5..2CEA ; So # [6] COPTIC SYMBOL MI RO..COPTIC SYMBOL SHIMA SIMA +2E80..2E99 ; So # [26] CJK RADICAL REPEAT..CJK RADICAL RAP +2E9B..2EF3 ; So # [89] CJK RADICAL CHOKE..CJK RADICAL C-SIMPLIFIED TURTLE +2F00..2FD5 ; So # [214] KANGXI RADICAL ONE..KANGXI RADICAL FLUTE +2FF0..2FFB ; So # [12] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID +3004 ; So # JAPANESE INDUSTRIAL STANDARD SYMBOL +3012..3013 ; So # [2] POSTAL MARK..GETA MARK +3020 ; So # POSTAL MARK FACE +3036..3037 ; So # [2] CIRCLED POSTAL MARK..IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL +303E..303F ; So # [2] IDEOGRAPHIC VARIATION INDICATOR..IDEOGRAPHIC HALF FILL SPACE +3190..3191 ; So # [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK +3196..319F ; So # [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK +31C0..31E3 ; So # [36] CJK STROKE T..CJK STROKE Q +3200..321E ; So # [31] PARENTHESIZED HANGUL KIYEOK..PARENTHESIZED KOREAN CHARACTER O HU +322A..3247 ; So # [30] PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO +3250 ; So # PARTNERSHIP SIGN +3260..327F ; So # [32] CIRCLED HANGUL KIYEOK..KOREAN STANDARD SYMBOL +328A..32B0 ; So # [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT +32C0..33FF ; So # [320] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..SQUARE GAL +4DC0..4DFF ; So # [64] HEXAGRAM FOR THE CREATIVE HEAVEN..HEXAGRAM FOR BEFORE COMPLETION +A490..A4C6 ; So # [55] YI RADICAL QOT..YI RADICAL KE +A828..A82B ; So # [4] SYLOTI NAGRI POETRY MARK-1..SYLOTI NAGRI POETRY MARK-4 +A836..A837 ; So # [2] NORTH INDIC QUARTER MARK..NORTH 
INDIC PLACEHOLDER MARK +A839 ; So # NORTH INDIC QUANTITY MARK +AA77..AA79 ; So # [3] MYANMAR SYMBOL AITON EXCLAMATION..MYANMAR SYMBOL AITON TWO +FDFD ; So # ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM +FFE4 ; So # FULLWIDTH BROKEN BAR +FFE8 ; So # HALFWIDTH FORMS LIGHT VERTICAL +FFED..FFEE ; So # [2] HALFWIDTH BLACK SQUARE..HALFWIDTH WHITE CIRCLE +FFFC..FFFD ; So # [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARACTER +10137..1013F ; So # [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT +10179..10189 ; So # [17] GREEK YEAR SIGN..GREEK TRYBLION BASE SIGN +1018C..1018E ; So # [3] GREEK SINUSOID SIGN..NOMISMA SIGN +10190..1019B ; So # [12] ROMAN SEXTANS SIGN..ROMAN CENTURIAL SIGN +101A0 ; So # GREEK SYMBOL TAU RHO +101D0..101FC ; So # [45] PHAISTOS DISC SIGN PEDESTRIAN..PHAISTOS DISC SIGN WAVY BAND +10877..10878 ; So # [2] PALMYRENE LEFT-POINTING FLEURON..PALMYRENE RIGHT-POINTING FLEURON +10AC8 ; So # MANICHAEAN SIGN UD +1173F ; So # AHOM SYMBOL VI +11FD5..11FDC ; So # [8] TAMIL SIGN NEL..TAMIL SIGN MUKKURUNI +11FE1..11FF1 ; So # [17] TAMIL SIGN PAARAM..TAMIL SIGN VAKAIYARAA +16B3C..16B3F ; So # [4] PAHAWH HMONG SIGN XYEEM NTXIV..PAHAWH HMONG SIGN XYEEM FAIB +16B45 ; So # PAHAWH HMONG SIGN CIM TSOV ROG +1BC9C ; So # DUPLOYAN SIGN O WITH CROSS +1D000..1D0F5 ; So # [246] BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MUSICAL SYMBOL GORGON NEO KATO +1D100..1D126 ; So # [39] MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBOL DRUM CLEF-2 +1D129..1D164 ; So # [60] MUSICAL SYMBOL MULTIPLE MEASURE REST..MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE +1D16A..1D16C ; So # [3] MUSICAL SYMBOL FINGERED TREMOLO-1..MUSICAL SYMBOL FINGERED TREMOLO-3 +1D183..1D184 ; So # [2] MUSICAL SYMBOL ARPEGGIATO UP..MUSICAL SYMBOL ARPEGGIATO DOWN +1D18C..1D1A9 ; So # [30] MUSICAL SYMBOL RINFORZANDO..MUSICAL SYMBOL DEGREE SLASH +1D1AE..1D1E8 ; So # [59] MUSICAL SYMBOL PEDAL MARK..MUSICAL SYMBOL KIEVAN FLAT SIGN +1D200..1D241 ; So # [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION 
SYMBOL-54 +1D245 ; So # GREEK MUSICAL LEIMMA +1D300..1D356 ; So # [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING +1D800..1D9FF ; So # [512] SIGNWRITING HAND-FIST INDEX..SIGNWRITING HEAD +1DA37..1DA3A ; So # [4] SIGNWRITING AIR BLOW SMALL ROTATIONS..SIGNWRITING BREATH EXHALE +1DA6D..1DA74 ; So # [8] SIGNWRITING SHOULDER HIP SPINE..SIGNWRITING TORSO-FLOORPLANE TWISTING +1DA76..1DA83 ; So # [14] SIGNWRITING LIMB COMBINATION..SIGNWRITING LOCATION DEPTH +1DA85..1DA86 ; So # [2] SIGNWRITING LOCATION TORSO..SIGNWRITING LOCATION LIMBS DIGITS +1E14F ; So # NYIAKENG PUACHUE HMONG CIRCLED CA +1ECAC ; So # INDIC SIYAQ PLACEHOLDER +1ED2E ; So # OTTOMAN SIYAQ MARRATAN +1F000..1F02B ; So # [44] MAHJONG TILE EAST WIND..MAHJONG TILE BACK +1F030..1F093 ; So # [100] DOMINO TILE HORIZONTAL BACK..DOMINO TILE VERTICAL-06-06 +1F0A0..1F0AE ; So # [15] PLAYING CARD BACK..PLAYING CARD KING OF SPADES +1F0B1..1F0BF ; So # [15] PLAYING CARD ACE OF HEARTS..PLAYING CARD RED JOKER +1F0C1..1F0CF ; So # [15] PLAYING CARD ACE OF DIAMONDS..PLAYING CARD BLACK JOKER +1F0D1..1F0F5 ; So # [37] PLAYING CARD ACE OF CLUBS..PLAYING CARD TRUMP-21 +1F110..1F16C ; So # [93] PARENTHESIZED LATIN CAPITAL LETTER A..RAISED MR SIGN +1F170..1F1AC ; So # [61] NEGATIVE SQUARED LATIN CAPITAL LETTER A..SQUARED VOD +1F1E6..1F202 ; So # [29] REGIONAL INDICATOR SYMBOL LETTER A..SQUARED KATAKANA SA +1F210..1F23B ; So # [44] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-914D +1F240..1F248 ; So # [9] TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557 +1F250..1F251 ; So # [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT +1F260..1F265 ; So # [6] ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI +1F300..1F3FA ; So # [251] CYCLONE..AMPHORA +1F400..1F6D5 ; So # [726] RAT..HINDU TEMPLE +1F6E0..1F6EC ; So # [13] HAMMER AND WRENCH..AIRPLANE ARRIVING +1F6F0..1F6FA ; So # [11] SATELLITE..AUTO RICKSHAW +1F700..1F773 ; So # [116] ALCHEMICAL SYMBOL FOR 
QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE +1F780..1F7D8 ; So # [89] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..NEGATIVE CIRCLED SQUARE +1F7E0..1F7EB ; So # [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE +1F800..1F80B ; So # [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD +1F810..1F847 ; So # [56] LEFTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD..DOWNWARDS HEAVY ARROW +1F850..1F859 ; So # [10] LEFTWARDS SANS-SERIF ARROW..UP DOWN SANS-SERIF ARROW +1F860..1F887 ; So # [40] WIDE-HEADED LEFTWARDS LIGHT BARB ARROW..WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW +1F890..1F8AD ; So # [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS +1F900..1F90B ; So # [12] CIRCLED CROSS FORMEE WITH FOUR DOTS..DOWNWARD FACING NOTCHED HOOK WITH DOT +1F90D..1F971 ; So # [101] WHITE HEART..YAWNING FACE +1F973..1F976 ; So # [4] FACE WITH PARTY HORN AND PARTY HAT..FREEZING FACE +1F97A..1F9A2 ; So # [41] FACE WITH PLEADING EYES..SWAN +1F9A5..1F9AA ; So # [6] SLOTH..OYSTER +1F9AE..1F9CA ; So # [29] GUIDE DOG..ICE CUBE +1F9CD..1FA53 ; So # [135] STANDING PERSON..BLACK CHESS KNIGHT-BISHOP +1FA60..1FA6D ; So # [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER +1FA70..1FA73 ; So # [4] BALLET SHOES..SHORTS +1FA78..1FA7A ; So # [3] DROP OF BLOOD..STETHOSCOPE +1FA80..1FA82 ; So # [3] YO-YO..PARACHUTE +1FA90..1FA95 ; So # [6] RINGED PLANET..BANJO + +# Total code points: 6161 + +# ================================================ + +# General_Category=Initial_Punctuation + +00AB ; Pi # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +2018 ; Pi # LEFT SINGLE QUOTATION MARK +201B..201C ; Pi # [2] SINGLE HIGH-REVERSED-9 QUOTATION MARK..LEFT DOUBLE QUOTATION MARK +201F ; Pi # DOUBLE HIGH-REVERSED-9 QUOTATION MARK +2039 ; Pi # SINGLE LEFT-POINTING ANGLE QUOTATION MARK +2E02 ; Pi # LEFT SUBSTITUTION BRACKET +2E04 ; Pi # LEFT DOTTED SUBSTITUTION BRACKET +2E09 ; Pi # LEFT TRANSPOSITION BRACKET +2E0C ; Pi # LEFT RAISED OMISSION BRACKET +2E1C ; Pi # 
LEFT LOW PARAPHRASE BRACKET +2E20 ; Pi # LEFT VERTICAL BAR WITH QUILL + +# Total code points: 12 + +# ================================================ + +# General_Category=Final_Punctuation + +00BB ; Pf # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +2019 ; Pf # RIGHT SINGLE QUOTATION MARK +201D ; Pf # RIGHT DOUBLE QUOTATION MARK +203A ; Pf # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK +2E03 ; Pf # RIGHT SUBSTITUTION BRACKET +2E05 ; Pf # RIGHT DOTTED SUBSTITUTION BRACKET +2E0A ; Pf # RIGHT TRANSPOSITION BRACKET +2E0D ; Pf # RIGHT RAISED OMISSION BRACKET +2E1D ; Pf # RIGHT LOW PARAPHRASE BRACKET +2E21 ; Pf # RIGHT VERTICAL BAR WITH QUILL + +# Total code points: 10 + +# EOF diff --git a/test/LICENSE b/test/LICENSE new file mode 100644 index 0000000..69da849 --- /dev/null +++ b/test/LICENSE @@ -0,0 +1,64 @@ +The CommonMark spec (spec.txt) and DTD (CommonMark.dtd) are + +Copyright (C) 2014-16 John MacFarlane + +Released under the Creative Commons CC-BY-SA 4.0 license: +<http://creativecommons.org/licenses/by-sa/4.0/>. + +--- + +The test software in test/ and the programs in tools/ are + +Copyright (c) 2014, John MacFarlane + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--- + +The normalization code in runtests.py was derived from the +markdowntest project, Copyright 2013 Karl Dubost: + +The MIT License (MIT) + +Copyright (c) 2013 Karl Dubost + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/test/cmark.py b/test/cmark.py new file mode 100755 index 0000000..1110860 --- /dev/null +++ b/test/cmark.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from ctypes import CDLL, c_char_p, c_long +from subprocess import * +import platform +import os + +def pipe_through_prog(prog, text): + p1 = Popen(prog.split(), stdout=PIPE, stdin=PIPE, stderr=PIPE) + [result, err] = p1.communicate(input=text.encode('utf-8')) + return [p1.returncode, result.decode('utf-8'), err] + +def use_library(lib, text): + textbytes = text.encode('utf-8') + textlen = len(textbytes) + return [0, lib(textbytes, textlen, 0).decode('utf-8'), ''] + +class CMark: + def __init__(self, prog=None, library_dir=None): + self.prog = prog + if prog: + self.to_html = lambda x: pipe_through_prog(prog, x) + else: + sysname = platform.system() + if sysname == 'Darwin': + libname = "libcmark.dylib" + elif sysname == 'Windows': + libname = "cmark.dll" + else: + libname = "libcmark.so" + if library_dir: + libpath = os.path.join(library_dir, libname) + else: + libpath = os.path.join("build", "src", libname) + cmark = CDLL(libpath) + markdown = cmark.cmark_markdown_to_html + markdown.restype = c_char_p + markdown.argtypes = [c_char_p, c_long] + self.to_html = lambda x: use_library(markdown, x) diff --git a/test/coverage.txt b/test/coverage.txt new file mode 100644 index 0000000..cffdf63 --- /dev/null +++ b/test/coverage.txt @@ -0,0 +1,464 @@ + +# Coverage + +This file is just a collection of unit tests not covered elsewhere. + +Most notably regression tests, tests improving code coverage and other useful +things may drop here. + +(However any tests requiring any additional command line option, like enabling +an extension, must be included in their respective files.) + + +## GitHub Issues + +### [Issue 2](https://github.com/mity/md4c/issues/2) + +Raw HTML block: + +```````````````````````````````` example + +. 
+ +```````````````````````````````` + +Inline: + +```````````````````````````````` example +foo bar +. +

        foo bar

        +```````````````````````````````` + +Inline with a line break: + +```````````````````````````````` example +foo bar +. +

        foo bar

        +```````````````````````````````` + + +### [Issue 4](https://github.com/mity/md4c/issues/4) + +```````````````````````````````` example +![alt text with *entity* ©](img.png 'title') +. +

        alt text with entity ©

        +```````````````````````````````` + + +### [Issue 9](https://github.com/mity/md4c/issues/9) + +```````````````````````````````` example +> [foo +> bar]: /url +> +> [foo bar] +. +
        +

        foo +bar

        +
        +```````````````````````````````` + + +### [Issue 10](https://github.com/mity/md4c/issues/10) + +```````````````````````````````` example +[x]: +x +- +
      • +
      +```````````````````````````````` + + +### [Issue 11](https://github.com/mity/md4c/issues/11) + +```````````````````````````````` example +x [link](/url "foo – bar") x +. +

      x link x

      +```````````````````````````````` + + +### [Issue 14](https://github.com/mity/md4c/issues/14) + +```````````````````````````````` example +a***b* c* +. +

      a*b c

      +```````````````````````````````` + + +### [Issue 15](https://github.com/mity/md4c/issues/15) + +```````````````````````````````` example +***b* c* +. +

      *b c

      +```````````````````````````````` + + +### [Issue 21](https://github.com/mity/md4c/issues/21) + +```````````````````````````````` example +a*b**c* +. +

      ab**c

      +```````````````````````````````` + + +### [Issue 33](https://github.com/mity/md4c/issues/33) + +```````````````````````````````` example +```&&&&&&&& +. +
      +```````````````````````````````` + + +### [Issue 36](https://github.com/mity/md4c/issues/36) + +```````````````````````````````` example +__x_ _x___ +. +

      x x_

      +```````````````````````````````` + + +### [Issue 39](https://github.com/mity/md4c/issues/39) + +```````````````````````````````` example +[\\]: x +. +```````````````````````````````` + + +### [Issue 40](https://github.com/mity/md4c/issues/40) + +```````````````````````````````` example +[x](url +'title' +)x +. +

      xx

      +```````````````````````````````` + + +### [Issue 65](https://github.com/mity/md4c/issues/65) + +```````````````````````````````` example +` +. +

      `

      +```````````````````````````````` + + +### [Issue 74](https://github.com/mity/md4c/issues/74) + +```````````````````````````````` example +[f]: +- + xx +- +. +
      xx
      +
      +
        +
      • +
      +```````````````````````````````` + + +### [Issue 78](https://github.com/mity/md4c/issues/78) + +```````````````````````````````` example +[SS ẞ]: /url +[ẞ SS] +. +

      ẞ SS

      +```````````````````````````````` + + +### [Issue 83](https://github.com/mity/md4c/issues/83) + +```````````````````````````````` example +foo +> +. +

      foo

      +
      +
      + +```````````````````````````````` + + +### [Issue 95](https://github.com/mity/md4c/issues/95) + +```````````````````````````````` example +. foo +. +

      . foo

      +```````````````````````````````` + + +### [Issue 96](https://github.com/mity/md4c/issues/96) + +```````````````````````````````` example +[ab]: /foo +[a] [ab] [abc] +. +

      [a] ab [abc]

      +```````````````````````````````` + +```````````````````````````````` example +[a b]: /foo +[a b] +. +

      a b

      +```````````````````````````````` + + +### [Issue 97](https://github.com/mity/md4c/issues/97) + +```````````````````````````````` example +*a **b c* d** +. +

      a b c d

      + +```````````````````````````````` + + +### [Issue 100](https://github.com/mity/md4c/issues/100) + +```````````````````````````````` example + +. +

      foo@123456789012345678901234567890123456789012345678901234567890123.123456789012345678901234567890123456789012345678901234567890123

      +```````````````````````````````` + +```````````````````````````````` example + +. +

      <foo@123456789012345678901234567890123456789012345678901234567890123x.123456789012345678901234567890123456789012345678901234567890123>

      +```````````````````````````````` +(Note the `x` here which turns it over the max. allowed length limit.) + + +### [Issue 107](https://github.com/mity/md4c/issues/107) + +```````````````````````````````` example +***foo *bar baz*** +. +

      *foo bar baz

      + +```````````````````````````````` + + +## Code coverage + +### `md_is_unicode_whitespace__()` + +Unicode whitespace (here U+2000) forms a word boundary so these cannot be +resolved as emphasis span because there is no closer mark. + +```````````````````````````````` example +*foo *bar +. +

      *foo *bar

      +```````````````````````````````` + + +### `md_is_unicode_punct__()` + +Ditto for Unicode punctuation (here U+00A1). + +```````````````````````````````` example +*foo¡*bar +. +

      *foo¡*bar

      +```````````````````````````````` + + +### `md_get_unicode_fold_info()` + +```````````````````````````````` example +[Příliš žluťoučký kůň úpěl ďábelské ódy.] + +[PŘÍLIŠ ŽLUŤOUČKÝ KŮŇ ÚPĚL ĎÁBELSKÉ ÓDY.]: /url +. +

      Příliš žluťoučký kůň úpěl ďábelské ódy.

      +```````````````````````````````` + + +### `md_decode_utf8__()` and `md_decode_utf8_before__()` + +```````````````````````````````` example +á*Á (U+00E1, i.e. two byte UTF-8 sequence) + *  (U+2000, i.e. three byte UTF-8 sequence) +. +

      á*Á (U+00E1, i.e. two byte UTF-8 sequence) + * (U+2000, i.e. three byte UTF-8 sequence)

      +```````````````````````````````` + + +### `md_is_link_destination_A()` + +```````````````````````````````` example +[link]() +. +

      link

      +```````````````````````````````` + + +### `md_link_label_eq()` + +```````````````````````````````` example +[foo bar] + +[foo bar]: /url +. +

      foo bar

      +```````````````````````````````` + + +### `md_is_inline_link_spec()` + +```````````````````````````````` example +> [link](/url 'foo +> bar') +. +
      +

      link

      +
      +```````````````````````````````` + + +### `md_build_ref_def_hashtable()` + +All link labels in the following example have the same FNV1a hash (after +normalization of the label, which means after converting to a vector of Unicode +codepoints and lowercase folding). + +So the example triggers quite complex code paths which are not otherwise easily +tested. + +```````````````````````````````` example +[foo]: /foo +[qnptgbh]: /qnptgbh +[abgbrwcv]: /abgbrwcv +[abgbrwcv]: /abgbrwcv2 +[abgbrwcv]: /abgbrwcv3 +[abgbrwcv]: /abgbrwcv4 +[alqadfgn]: /alqadfgn + +[foo] +[qnptgbh] +[abgbrwcv] +[alqadfgn] +[axgydtdu] +. +

      foo +qnptgbh +abgbrwcv +alqadfgn +[axgydtdu]

      +```````````````````````````````` + +For the sake of completeness, the following C program was used to find the hash +collisions by brute force: + +~~~ + +#include +#include + + +static unsigned etalon; + + + +#define MD_FNV1A_BASE 2166136261 +#define MD_FNV1A_PRIME 16777619 + +static inline unsigned +fnv1a(unsigned base, const void* data, size_t n) +{ + const unsigned char* buf = (const unsigned char*) data; + unsigned hash = base; + size_t i; + + for(i = 0; i < n; i++) { + hash ^= buf[i]; + hash *= MD_FNV1A_PRIME; + } + + return hash; +} + + +static unsigned +unicode_hash(const char* data, size_t n) +{ + unsigned value; + unsigned hash = MD_FNV1A_BASE; + int i; + + for(i = 0; i < n; i++) { + value = data[i]; + hash = fnv1a(hash, &value, sizeof(unsigned)); + } + + return hash; +} + + +static void +recurse(char* buffer, size_t off, size_t len) +{ + int ch; + + if(off < len - 1) { + for(ch = 'a'; ch <= 'z'; ch++) { + buffer[off] = ch; + recurse(buffer, off+1, len); + } + } else { + for(ch = 'a'; ch <= 'z'; ch++) { + buffer[off] = ch; + if(unicode_hash(buffer, len) == etalon) { + printf("Dup: %.*s\n", (int)len, buffer); + } + } + } +} + +int +main(int argc, char** argv) +{ + char buffer[32]; + int len; + + if(argc < 2) + etalon = unicode_hash("foo", 3); + else + etalon = unicode_hash(argv[1], strlen(argv[1])); + + for(len = 1; len <= sizeof(buffer); len++) + recurse(buffer, 0, len); + + return 0; +} +~~~ diff --git a/test/fuzz-input/commonmark.md b/test/fuzz-input/commonmark.md new file mode 100644 index 0000000..dc3446e --- /dev/null +++ b/test/fuzz-input/commonmark.md @@ -0,0 +1,41 @@ + +# h1 +## h2 +### h3 +#### h4 +##### h5 +###### h6 + +h1 +== + +h2 +-- + +-------------------- + + indented code + +``` +fenced code +``` + + + +> quote + +* list item +1. 
list item + +[ref]: /url + +paragraph +© Ӓ ꯍ +`code` +*emph* **strong** ***strong emph*** +_emph_ __strong__ ___strong emph___ +[ref] [ref][] [link](/url) +![ref] ![ref][] ![img](/url) + +www.example.com doe@example.com +\\ \* \. \` \ diff --git a/test/fuzz-input/gfm.md b/test/fuzz-input/gfm.md new file mode 100644 index 0000000..fd51485 --- /dev/null +++ b/test/fuzz-input/gfm.md @@ -0,0 +1,8 @@ +* [ ] unchecked +* [x] checked + + A | B | C +---|--:|:-: +aaa|bbb|ccc + +~del~ ~~del~~ diff --git a/test/fuzz-input/latex-math.md b/test/fuzz-input/latex-math.md new file mode 100644 index 0000000..d17af34 --- /dev/null +++ b/test/fuzz-input/latex-math.md @@ -0,0 +1 @@ +$a^2+b^2=c^2$ $$a^2+b^2=c^2$$ diff --git a/test/fuzz-input/wiki.md b/test/fuzz-input/wiki.md new file mode 100644 index 0000000..a423974 --- /dev/null +++ b/test/fuzz-input/wiki.md @@ -0,0 +1 @@ +[[wiki]] [[wiki|label]] diff --git a/test/latex-math.txt b/test/latex-math.txt new file mode 100644 index 0000000..2a5774c --- /dev/null +++ b/test/latex-math.txt @@ -0,0 +1,39 @@ + +# LaTeX Math + +With the flag `MD_FLAG_LATEXMATHSPANS`, MD4C enables extension for recognition +of LaTeX style math spans. + +A math span is any text wrapped in dollars or double dollars (`$...$` or +`$$...$$`). + +```````````````````````````````` example +$a+b=c$ Hello, world! +. +

      a+b=c Hello, world!

      +```````````````````````````````` + +If the double dollar sign is used, the math span is a display math span. + +```````````````````````````````` example +This is a display equation: $$\int_a^b x dx$$. +. +

      This is a display equation: \int_a^b x dx.

      +```````````````````````````````` + +Math spans may span multiple lines as they are normal spans: + +```````````````````````````````` example +$$ +\int_a^b +f(x) dx +$$ +. +

      \int_a^b f(x) dx

      +```````````````````````````````` + +Note though that many (simple) renderers may output the math spans just as a +verbatim text. (This includes the HTML renderer used by the `md2html` utility.) + +Only advanced renderers which implement LaTeX math syntax can be expected to +provide better results. diff --git a/test/normalize.py b/test/normalize.py new file mode 100755 index 0000000..6073bf0 --- /dev/null +++ b/test/normalize.py @@ -0,0 +1,194 @@ +# -*- coding: utf-8 -*- +from html.parser import HTMLParser +import urllib + +try: + from html.parser import HTMLParseError +except ImportError: + # HTMLParseError was removed in Python 3.5. It could never be + # thrown, so we define a placeholder instead. + class HTMLParseError(Exception): + pass + +from html.entities import name2codepoint +import sys +import re +import cgi + +# Normalization code, adapted from +# https://github.com/karlcow/markdown-testsuite/ +significant_attrs = ["alt", "href", "src", "title"] +whitespace_re = re.compile('\s+') +class MyHTMLParser(HTMLParser): + def __init__(self): + HTMLParser.__init__(self) + self.convert_charrefs = False + self.last = "starttag" + self.in_pre = False + self.output = "" + self.last_tag = "" + def handle_data(self, data): + after_tag = self.last == "endtag" or self.last == "starttag" + after_block_tag = after_tag and self.is_block_tag(self.last_tag) + if after_tag and self.last_tag == "br": + data = data.lstrip('\n') + if not self.in_pre: + data = whitespace_re.sub(' ', data) + if after_block_tag and not self.in_pre: + if self.last == "starttag": + data = data.lstrip() + elif self.last == "endtag": + data = data.strip() + self.output += data + self.last = "data" + def handle_endtag(self, tag): + if tag == "pre": + self.in_pre = False + elif self.is_block_tag(tag): + self.output = self.output.rstrip() + self.output += "" + self.last_tag = tag + self.last = "endtag" + def handle_starttag(self, tag, attrs): + if tag == "pre": + self.in_pre = True + if 
self.is_block_tag(tag): + self.output = self.output.rstrip() + self.output += "<" + tag + # For now we don't strip out 'extra' attributes, because of + # raw HTML test cases. + # attrs = filter(lambda attr: attr[0] in significant_attrs, attrs) + if attrs: + attrs.sort() + for (k,v) in attrs: + self.output += " " + k + if v in ['href','src']: + self.output += ("=" + '"' + + urllib.quote(urllib.unquote(v), safe='/') + '"') + elif v != None: + self.output += ("=" + '"' + cgi.escape(v,quote=True) + '"') + self.output += ">" + self.last_tag = tag + self.last = "starttag" + def handle_startendtag(self, tag, attrs): + """Ignore closing tag for self-closing """ + self.handle_starttag(tag, attrs) + self.last_tag = tag + self.last = "endtag" + def handle_comment(self, data): + self.output += '' + self.last = "comment" + def handle_decl(self, data): + self.output += '' + self.last = "decl" + def unknown_decl(self, data): + self.output += '' + self.last = "decl" + def handle_pi(self,data): + self.output += '' + self.last = "pi" + def handle_entityref(self, name): + try: + c = chr(name2codepoint[name]) + except KeyError: + c = None + self.output_char(c, '&' + name + ';') + self.last = "ref" + def handle_charref(self, name): + try: + if name.startswith("x"): + c = chr(int(name[1:], 16)) + else: + c = chr(int(name)) + except ValueError: + c = None + self.output_char(c, '&' + name + ';') + self.last = "ref" + # Helpers. 
+ def output_char(self, c, fallback): + if c == '<': + self.output += "<" + elif c == '>': + self.output += ">" + elif c == '&': + self.output += "&" + elif c == '"': + self.output += """ + elif c == None: + self.output += fallback + else: + self.output += c + + def is_block_tag(self,tag): + return (tag in ['article', 'header', 'aside', 'hgroup', 'blockquote', + 'hr', 'iframe', 'body', 'li', 'map', 'button', 'object', 'canvas', + 'ol', 'caption', 'output', 'col', 'p', 'colgroup', 'pre', 'dd', + 'progress', 'div', 'section', 'dl', 'table', 'td', 'dt', + 'tbody', 'embed', 'textarea', 'fieldset', 'tfoot', 'figcaption', + 'th', 'figure', 'thead', 'footer', 'tr', 'form', 'ul', + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'video', 'script', 'style']) + +def normalize_html(html): + r""" + Return normalized form of HTML which ignores insignificant output + differences: + + Multiple inner whitespaces are collapsed to a single space (except + in pre tags): + + >>> normalize_html("

      a \t b

      ") + '

      a b

      ' + + >>> normalize_html("

      a \t\nb

      ") + '

      a b

      ' + + * Whitespace surrounding block-level tags is removed. + + >>> normalize_html("

      a b

      ") + '

      a b

      ' + + >>> normalize_html("

      a b

      ") + '

      a b

      ' + + >>> normalize_html("

      a b

      ") + '

      a b

      ' + + >>> normalize_html("\n\t

      \n\t\ta b\t\t

      \n\t") + '

      a b

      ' + + >>> normalize_html("a b ") + 'a b ' + + * Self-closing tags are converted to open tags. + + >>> normalize_html("
      ") + '
      ' + + * Attributes are sorted and lowercased. + + >>> normalize_html('x') + 'x' + + * References are converted to unicode, except that '<', '>', '&', and + '"' are rendered using entities. + + >>> normalize_html("∀&><"") + '\u2200&><"' + + """ + html_chunk_re = re.compile("(\|\<[^>]*\>|[^<]+)") + try: + parser = MyHTMLParser() + # We work around HTMLParser's limitations parsing CDATA + # by breaking the input into chunks and passing CDATA chunks + # through verbatim. + for chunk in re.finditer(html_chunk_re, html): + if chunk.group(0)[:8] == "a a ){65000}b( a a){65000}")), + "many emph closers with no openers": + (("a_ " * 65000), + re.compile("(a[_] ){64999}a_")), + "many emph openers with no closers": + (("_a " * 65000), + re.compile("(_a ){64999}_a")), + "many 3-emph openers with no closers": + (("a***" * 65000), + re.compile("(aa){32500}")), + "many link closers with no openers": + (("a]" * 65000), + re.compile("(a\]){65000}")), + "many link openers with no closers": + (("[a" * 65000), + re.compile("(\[a){65000}")), + "mismatched openers and closers": + (("*a_ " * 50000), + re.compile("([*]a[_] ){49999}[*]a_")), + "openers and closers multiple of 3": + (("a**b" + ("c* " * 50000)), + re.compile("a[*][*]b(c[*] ){49999}c[*]")), + "link openers and emph closers": + (("[ a_" * 50000), + re.compile("(\[ a_){50000}")), + "hard link/emph case": + ("**x [a*b**c*](d)", + re.compile("\\*\\*x ab\\*\\*c")), + "nested brackets": + (("[" * 50000) + "a" + ("]" * 50000), + re.compile("\[{50000}a\]{50000}")), + "nested block quotes": + ((("> " * 50000) + "a"), + re.compile("(
      \r?\n){50000}")), + "U+0000 in input": + ("abc\u0000de\u0000", + re.compile("abc\ufffd?de\ufffd?")), + "backticks": + ("".join(map(lambda x: ("e" + "`" * x), range(1,1000))), + re.compile("^

      [e`]*

      \r?\n$")), + "many links": + ("[t](/u) " * 50000, + re.compile("(t ?){50000}")), + "many references": + ("".join(map(lambda x: ("[" + str(x) + "]: u\n"), range(1,20000 * 16))) + "[0] " * 20000, + re.compile("(\[0\] ){19999}")), + "deeply nested lists": + ("".join(map(lambda x: (" " * x + "* a\n"), range(0,1000))), + re.compile("
        \r?\n(
      • a
          \r?\n){999}
        • a
        • \r?\n
        \r?\n(
      • \r?\n
      \r?\n){999}")), + "many html openers and closers": + (("<>" * 50000), + re.compile("(<>){50000}")), + "many html proc. inst. openers": + (("x" + "\r?\n(
      • \r?\n){49999}
      • a
      • \r?\n
      \r?\n(
    3. \r?\n\r?\n){49999}")) + } + +whitespace_re = re.compile('/s+/') +passed = 0 +errored = 0 +failed = 0 + +#print("Testing pathological cases:") +for description in pathological: + (inp, regex) = pathological[description] + start = timer() + [rc, actual, err] = cmark.to_html(inp) + end = timer() + if rc != 0: + errored += 1 + print('{:35} [ERRORED (return code %d)]'.format(description, rc)) + print(err) + elif regex.search(actual): + print('{:35} [PASSED] {:.3f} secs'.format(description, end-start)) + passed += 1 + else: + print('{:35} [FAILED]'.format(description)) + print(repr(actual)) + failed += 1 + +print("%d passed, %d failed, %d errored" % (passed, failed, errored)) +if (failed == 0 and errored == 0): + exit(0) +else: + exit(1) diff --git a/test/permissive-email-autolinks.txt b/test/permissive-email-autolinks.txt new file mode 100644 index 0000000..12e8786 --- /dev/null +++ b/test/permissive-email-autolinks.txt @@ -0,0 +1,50 @@ + +# Permissive E-mail Autolinks + +With the flag `MD_FLAG_PERMISSIVEEMAILAUTOLINKS`, MD4C enables more permissive +recognition of e-mail addresses and transforms them to autolinks, even if they +do not exactly follow the syntax of autolink as specified in CommonMark +specification. + +This is standard CommonMark e-mail autolink: + +```````````````````````````````` example +E-mail: +. +

      E-mail: mailto:john.doe@gmail.com

      +```````````````````````````````` + +With the permissive autolinks enabled, this is sufficient: + +```````````````````````````````` example +E-mail: john.doe@gmail.com +. +

      E-mail: john.doe@gmail.com

      +```````````````````````````````` + +`+` can occur before the `@`, but not after. + +```````````````````````````````` example +hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is. +. +

      hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is.

      +```````````````````````````````` + +`.`, `-`, and `_` can occur on both sides of the `@`, but only `.` may occur at +the end of the email address, in which case it will not be considered part of +the address: + +```````````````````````````````` example +a.b-c_d@a.b + +a.b-c_d@a.b. + +a.b-c_d@a.b- + +a.b-c_d@a.b_ +. +

      a.b-c_d@a.b

      +

      a.b-c_d@a.b.

      +

      a.b-c_d@a.b-

      +

      a.b-c_d@a.b_

      +```````````````````````````````` diff --git a/test/permissive-url-autolinks.txt b/test/permissive-url-autolinks.txt new file mode 100644 index 0000000..03a2069 --- /dev/null +++ b/test/permissive-url-autolinks.txt @@ -0,0 +1,92 @@ + +# Permissive URL Autolinks + +With the flag `MD_FLAG_PERMISSIVEURLAUTOLINKS`, MD4C enables more permissive recognition +of URLs and transforms them to autolinks, even if they do not exactly follow the syntax +of autolink as specified in CommonMark specification. + +This is standard CommonMark autolink: + +```````````````````````````````` example +Homepage: +. +

      Homepage: https://github.com/mity/md4c

      +```````````````````````````````` + +With the permissive autolinks enabled, this is sufficient: + +```````````````````````````````` example +Homepage: https://github.com/mity/md4c +. +

      Homepage: https://github.com/mity/md4c

      +```````````````````````````````` + +But this permissive autolink feature can work only for very widely used URL +schemes, in alphabetical order `ftp:`, `http:`, `https:`. + +That's why this is not a permissive autolink: + +```````````````````````````````` example +ssh://root@example.com +. +

      ssh://root@example.com

      +```````````````````````````````` + +The same rules for path validation as for permissive WWW autolinks apply. +Therefore the final question mark here is not part of the autolink: + +```````````````````````````````` example +Have you ever visited http://www.zombo.com? +. +

      Have you ever visited http://www.zombo.com?

      +```````````````````````````````` + +But in contrast, in this example it is: + +```````````````````````````````` example +http://www.bing.com/search?q=md4c +. +

      http://www.bing.com/search?q=md4c

      +```````````````````````````````` + +And finally one complex example: + +```````````````````````````````` example +http://commonmark.org + +(Visit https://encrypted.google.com/search?q=Markup+(business)) + +Anonymous FTP is available at ftp://foo.bar.baz. +. +

      http://commonmark.org

      +

      (Visit https://encrypted.google.com/search?q=Markup+(business))

      +

      Anonymous FTP is available at ftp://foo.bar.baz.

      +```````````````````````````````` + + +## GitHub Issues + +### [Issue 53](https://github.com/mity/md4c/issues/53) + +```````````````````````````````` example +This is [link](http://github.com/). +. +

      This is link.

      +```````````````````````````````` + +```````````````````````````````` example +This is [link](http://github.com/)X +. +

      This is linkX

      +```````````````````````````````` + + +## [Issue 76](https://github.com/mity/md4c/issues/76) + +```````````````````````````````` example +*(http://example.com)* +. +

      (http://example.com)

      +```````````````````````````````` + + diff --git a/test/permissive-www-autolinks.txt b/test/permissive-www-autolinks.txt new file mode 100644 index 0000000..2830722 --- /dev/null +++ b/test/permissive-www-autolinks.txt @@ -0,0 +1,107 @@ + +# Permissive WWW Autolinks + +With the flag `MD_FLAG_PERMISSIVEWWWAUTOLINKS`, MD4C enables recognition of +autolinks starting with `www.`, even if they do not exactly follow the syntax +of autolink as specified in CommonMark specification. + +These do not have to be enclosed in `<` and `>`, and they even do not need +any preceding scheme specification. + +The WWW autolink will be recognized when a valid domain is found. + +A valid domain consists of the text `www.`, followed by alphanumeric characters, +underscores (`_`), hyphens (`-`) and periods (`.`). There must be at least one +period, and no underscores may be present in the last two segments of the domain. + +The scheme `http` will be inserted automatically: + +```````````````````````````````` example +www.commonmark.org +. +

      www.commonmark.org

      +```````````````````````````````` + +After a valid domain, zero or more non-space non-`<` characters may follow: + +```````````````````````````````` example +Visit www.commonmark.org/help for more information. +. +

      Visit www.commonmark.org/help for more information.

      +```````````````````````````````` + +We then apply extended autolink path validation as follows: + +Trailing punctuation (specifically, `?`, `!`, `.`, `,`, `:`, `*`, `_`, and `~`) +will not be considered part of the autolink, though they may be included in the +interior of the link: + +```````````````````````````````` example +Visit www.commonmark.org. + +Visit www.commonmark.org/a.b. +. +

      Visit www.commonmark.org.

      +

      Visit www.commonmark.org/a.b.

      +```````````````````````````````` + +When an autolink ends in `)`, we scan the entire autolink for the total number +of parentheses. If there is a greater number of closing parentheses than +opening ones, we don't consider the last character part of the autolink, in +order to facilitate including an autolink inside a parenthesis: + +```````````````````````````````` example +www.google.com/search?q=Markup+(business) + +(www.google.com/search?q=Markup+(business)) +. +

      www.google.com/search?q=Markup+(business)

      +

      (www.google.com/search?q=Markup+(business))

      +```````````````````````````````` + +This check is only done when the link ends in a closing parenthesis `)`, so if +the only parentheses are in the interior of the autolink, no special rules are +applied: + +```````````````````````````````` example +www.google.com/search?q=(business)+ok +. +

      www.google.com/search?q=(business)+ok

      +```````````````````````````````` + +If an autolink ends in a semicolon (`;`), we check to see if it appears to +resemble an [entity reference][entity references]; if the preceding text is `&` +followed by one or more alphanumeric characters. If so, it is excluded from +the autolink: + +```````````````````````````````` example +www.google.com/search?q=commonmark&hl=en + +www.google.com/search?q=commonmark&hl; +. +

      www.google.com/search?q=commonmark&hl=en

      +

      www.google.com/search?q=commonmark&hl;

      +```````````````````````````````` + +`<` immediately ends an autolink. + +```````````````````````````````` example +www.commonmark.org/hewww.commonmark.org/he<lp

      +```````````````````````````````` + + +## GitHub Issues + +### [Issue 53](https://github.com/mity/md4c/issues/53) +```````````````````````````````` example +This is [link](www.github.com/). +. +

      This is link.

      +```````````````````````````````` +```````````````````````````````` example +This is [link](www.github.com/)X +. +

      This is linkX

      +```````````````````````````````` diff --git a/test/spec.txt b/test/spec.txt new file mode 100644 index 0000000..fa56572 --- /dev/null +++ b/test/spec.txt @@ -0,0 +1,9709 @@ +--- +title: CommonMark Spec +author: John MacFarlane +version: 0.29 +date: '2019-04-06' +license: '[CC-BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/)' +... + +# Introduction + +## What is Markdown? + +Markdown is a plain text format for writing structured documents, +based on conventions for indicating formatting in email +and usenet posts. It was developed by John Gruber (with +help from Aaron Swartz) and released in 2004 in the form of a +[syntax description](http://daringfireball.net/projects/markdown/syntax) +and a Perl script (`Markdown.pl`) for converting Markdown to +HTML. In the next decade, dozens of implementations were +developed in many languages. Some extended the original +Markdown syntax with conventions for footnotes, tables, and +other document elements. Some allowed Markdown documents to be +rendered in formats other than HTML. Websites like Reddit, +StackOverflow, and GitHub had millions of people using Markdown. +And Markdown started to be used beyond the web, to author books, +articles, slide shows, letters, and lecture notes. + +What distinguishes Markdown from many other lightweight markup +syntaxes, which are often easier to write, is its readability. +As Gruber writes: + +> The overriding design goal for Markdown's formatting syntax is +> to make it as readable as possible. The idea is that a +> Markdown-formatted document should be publishable as-is, as +> plain text, without looking like it's been marked up with tags +> or formatting instructions. +> () + +The point can be illustrated by comparing a sample of +[AsciiDoc](http://www.methods.co.nz/asciidoc/) with +an equivalent sample of Markdown. Here is a sample of +AsciiDoc from the AsciiDoc manual: + +``` +1. List item one. 
++ +List item one continued with a second paragraph followed by an +Indented block. ++ +................. +$ ls *.sh +$ mv *.sh ~/tmp +................. ++ +List item continued with a third paragraph. + +2. List item two continued with an open block. ++ +-- +This paragraph is part of the preceding list item. + +a. This list is nested and does not require explicit item +continuation. ++ +This paragraph is part of the preceding list item. + +b. List item b. + +This paragraph belongs to item two of the outer list. +-- +``` + +And here is the equivalent in Markdown: +``` +1. List item one. + + List item one continued with a second paragraph followed by an + Indented block. + + $ ls *.sh + $ mv *.sh ~/tmp + + List item continued with a third paragraph. + +2. List item two continued with an open block. + + This paragraph is part of the preceding list item. + + 1. This list is nested and does not require explicit item continuation. + + This paragraph is part of the preceding list item. + + 2. List item b. + + This paragraph belongs to item two of the outer list. +``` + +The AsciiDoc version is, arguably, easier to write. You don't need +to worry about indentation. But the Markdown version is much easier +to read. The nesting of list items is apparent to the eye in the +source, not just in the processed document. + +## Why is a spec needed? + +John Gruber's [canonical description of Markdown's +syntax](http://daringfireball.net/projects/markdown/syntax) +does not specify the syntax unambiguously. Here are some examples of +questions it does not answer: + +1. How much indentation is needed for a sublist? The spec says that + continuation paragraphs need to be indented four spaces, but is + not fully explicit about sublists. It is natural to think that + they, too, must be indented four spaces, but `Markdown.pl` does + not require that. 
This is hardly a "corner case," and divergences + between implementations on this issue often lead to surprises for + users in real documents. (See [this comment by John + Gruber](http://article.gmane.org/gmane.text.markdown.general/1997).) + +2. Is a blank line needed before a block quote or heading? + Most implementations do not require the blank line. However, + this can lead to unexpected results in hard-wrapped text, and + also to ambiguities in parsing (note that some implementations + put the heading inside the blockquote, while others do not). + (John Gruber has also spoken [in favor of requiring the blank + lines](http://article.gmane.org/gmane.text.markdown.general/2146).) + +3. Is a blank line needed before an indented code block? + (`Markdown.pl` requires it, but this is not mentioned in the + documentation, and some implementations do not require it.) + + ``` markdown + paragraph + code? + ``` + +4. What is the exact rule for determining when list items get + wrapped in `

      ` tags? Can a list be partially "loose" and partially + "tight"? What should we do with a list like this? + + ``` markdown + 1. one + + 2. two + 3. three + ``` + + Or this? + + ``` markdown + 1. one + - a + + - b + 2. two + ``` + + (There are some relevant comments by John Gruber + [here](http://article.gmane.org/gmane.text.markdown.general/2554).) + +5. Can list markers be indented? Can ordered list markers be right-aligned? + + ``` markdown + 8. item 1 + 9. item 2 + 10. item 2a + ``` + +6. Is this one list with a thematic break in its second item, + or two lists separated by a thematic break? + + ``` markdown + * a + * * * * * + * b + ``` + +7. When list markers change from numbers to bullets, do we have + two lists or one? (The Markdown syntax description suggests two, + but the perl scripts and many other implementations produce one.) + + ``` markdown + 1. fee + 2. fie + - foe + - fum + ``` + +8. What are the precedence rules for the markers of inline structure? + For example, is the following a valid link, or does the code span + take precedence ? + + ``` markdown + [a backtick (`)](/url) and [another backtick (`)](/url). + ``` + +9. What are the precedence rules for markers of emphasis and strong + emphasis? For example, how should the following be parsed? + + ``` markdown + *foo *bar* baz* + ``` + +10. What are the precedence rules between block-level and inline-level + structure? For example, how should the following be parsed? + + ``` markdown + - `a long code span can contain a hyphen like this + - and it can screw things up` + ``` + +11. Can list items include section headings? (`Markdown.pl` does not + allow this, but does allow blockquotes to include headings.) + + ``` markdown + - # Heading + ``` + +12. Can list items be empty? + + ``` markdown + * a + * + * b + ``` + +13. Can link references be defined inside block quotes or list items? + + ``` markdown + > Blockquote [foo]. + > + > [foo]: /url + ``` + +14. 
If there are multiple definitions for the same reference, which takes + precedence? + + ``` markdown + [foo]: /url1 + [foo]: /url2 + + [foo][] + ``` + +In the absence of a spec, early implementers consulted `Markdown.pl` +to resolve these ambiguities. But `Markdown.pl` was quite buggy, and +gave manifestly bad results in many cases, so it was not a +satisfactory replacement for a spec. + +Because there is no unambiguous spec, implementations have diverged +considerably. As a result, users are often surprised to find that +a document that renders one way on one system (say, a GitHub wiki) +renders differently on another (say, converting to docbook using +pandoc). To make matters worse, because nothing in Markdown counts +as a "syntax error," the divergence often isn't discovered right away. + +## About this document + +This document attempts to specify Markdown syntax unambiguously. +It contains many examples with side-by-side Markdown and +HTML. These are intended to double as conformance tests. An +accompanying script `spec_tests.py` can be used to run the tests +against any Markdown program: + + python test/spec_tests.py --spec spec.txt --program PROGRAM + +Since this document describes how Markdown is to be parsed into +an abstract syntax tree, it would have made sense to use an abstract +representation of the syntax tree instead of HTML. But HTML is capable +of representing the structural distinctions we need to make, and the +choice of HTML for the tests makes it possible to run the tests against +an implementation without writing an abstract syntax tree renderer. + +This document is generated from a text file, `spec.txt`, written +in Markdown with a small extension for the side-by-side tests. +The script `tools/makespec.py` can be used to convert `spec.txt` into +HTML or CommonMark (which can then be converted into other formats). + +In the examples, the `→` character is used to represent tabs. 
+ +# Preliminaries + +## Characters and lines + +Any sequence of [characters] is a valid CommonMark +document. + +A [character](@) is a Unicode code point. Although some +code points (for example, combining accents) do not correspond to +characters in an intuitive sense, all code points count as characters +for purposes of this spec. + +This spec does not specify an encoding; it thinks of lines as composed +of [characters] rather than bytes. A conforming parser may be limited +to a certain encoding. + +A [line](@) is a sequence of zero or more [characters] +other than newline (`U+000A`) or carriage return (`U+000D`), +followed by a [line ending] or by the end of file. + +A [line ending](@) is a newline (`U+000A`), a carriage return +(`U+000D`) not followed by a newline, or a carriage return and a +following newline. + +A line containing no characters, or a line containing only spaces +(`U+0020`) or tabs (`U+0009`), is called a [blank line](@). + +The following definitions of character classes will be used in this spec: + +A [whitespace character](@) is a space +(`U+0020`), tab (`U+0009`), newline (`U+000A`), line tabulation (`U+000B`), +form feed (`U+000C`), or carriage return (`U+000D`). + +[Whitespace](@) is a sequence of one or more [whitespace +characters]. + +A [Unicode whitespace character](@) is +any code point in the Unicode `Zs` general category, or a tab (`U+0009`), +carriage return (`U+000D`), newline (`U+000A`), or form feed +(`U+000C`). + +[Unicode whitespace](@) is a sequence of one +or more [Unicode whitespace characters]. + +A [space](@) is `U+0020`. + +A [non-whitespace character](@) is any character +that is not a [whitespace character]. + +An [ASCII punctuation character](@) +is `!`, `"`, `#`, `$`, `%`, `&`, `'`, `(`, `)`, +`*`, `+`, `,`, `-`, `.`, `/` (U+0021–2F), +`:`, `;`, `<`, `=`, `>`, `?`, `@` (U+003A–0040), +`[`, `\`, `]`, `^`, `_`, `` ` `` (U+005B–0060), +`{`, `|`, `}`, or `~` (U+007B–007E). 
+ +A [punctuation character](@) is an [ASCII +punctuation character] or anything in +the general Unicode categories `Pc`, `Pd`, `Pe`, `Pf`, `Pi`, `Po`, or `Ps`. + +## Tabs + +Tabs in lines are not expanded to [spaces]. However, +in contexts where whitespace helps to define block structure, +tabs behave as if they were replaced by spaces with a tab stop +of 4 characters. + +Thus, for example, a tab can be used instead of four spaces +in an indented code block. (Note, however, that internal +tabs are passed through as literal tabs, not expanded to +spaces.) + +```````````````````````````````` example +→foo→baz→→bim +. +

      foo→baz→→bim
      +
      +```````````````````````````````` + +```````````````````````````````` example + →foo→baz→→bim +. +
      foo→baz→→bim
      +
      +```````````````````````````````` + +```````````````````````````````` example + a→a + ὐ→a +. +
      a→a
      +ὐ→a
      +
      +```````````````````````````````` + +In the following example, a continuation paragraph of a list +item is indented with a tab; this has exactly the same effect +as indentation with four spaces would: + +```````````````````````````````` example + - foo + +→bar +. +
        +
      • +

        foo

        +

        bar

        +
      • +
      +```````````````````````````````` + +```````````````````````````````` example +- foo + +→→bar +. +
        +
      • +

        foo

        +
          bar
        +
        +
      • +
      +```````````````````````````````` + +Normally the `>` that begins a block quote may be followed +optionally by a space, which is not considered part of the +content. In the following case `>` is followed by a tab, +which is treated as if it were expanded into three spaces. +Since one of these spaces is considered part of the +delimiter, `foo` is considered to be indented six spaces +inside the block quote context, so we get an indented +code block starting with two spaces. + +```````````````````````````````` example +>→→foo +. +
      +
        foo
      +
      +
      +```````````````````````````````` + +```````````````````````````````` example +-→→foo +. +
        +
      • +
          foo
        +
        +
      • +
      +```````````````````````````````` + + +```````````````````````````````` example + foo +→bar +. +
      foo
      +bar
      +
      +```````````````````````````````` + +```````````````````````````````` example + - foo + - bar +→ - baz +. +
        +
      • foo +
          +
        • bar +
            +
          • baz
          • +
          +
        • +
        +
      • +
      +```````````````````````````````` + +```````````````````````````````` example +#→Foo +. +

      Foo

      +```````````````````````````````` + +```````````````````````````````` example +*→*→*→ +. +
      +```````````````````````````````` + + +## Insecure characters + +For security reasons, the Unicode character `U+0000` must be replaced +with the REPLACEMENT CHARACTER (`U+FFFD`). + +# Blocks and inlines + +We can think of a document as a sequence of +[blocks](@)---structural elements like paragraphs, block +quotations, lists, headings, rules, and code blocks. Some blocks (like +block quotes and list items) contain other blocks; others (like +headings and paragraphs) contain [inline](@) content---text, +links, emphasized text, images, code spans, and so on. + +## Precedence + +Indicators of block structure always take precedence over indicators +of inline structure. So, for example, the following is a list with +two items, not a list with one item containing a code span: + +```````````````````````````````` example +- `one +- two` +. +
        +
      • `one
      • +
      • two`
      • +
      +```````````````````````````````` + + +This means that parsing can proceed in two steps: first, the block +structure of the document can be discerned; second, text lines inside +paragraphs, headings, and other block constructs can be parsed for inline +structure. The second step requires information about link reference +definitions that will be available only at the end of the first +step. Note that the first step requires processing lines in sequence, +but the second can be parallelized, since the inline parsing of +one block element does not affect the inline parsing of any other. + +## Container blocks and leaf blocks + +We can divide blocks into two types: +[container blocks](@), +which can contain other blocks, and [leaf blocks](@), +which cannot. + +# Leaf blocks + +This section describes the different kinds of leaf block that make up a +Markdown document. + +## Thematic breaks + +A line consisting of 0-3 spaces of indentation, followed by a sequence +of three or more matching `-`, `_`, or `*` characters, each followed +optionally by any number of spaces or tabs, forms a +[thematic break](@). + +```````````````````````````````` example +*** +--- +___ +. +
      +
      +
      +```````````````````````````````` + + +Wrong characters: + +```````````````````````````````` example ++++ +. +

      +++

      +```````````````````````````````` + + +```````````````````````````````` example +=== +. +

      ===

      +```````````````````````````````` + + +Not enough characters: + +```````````````````````````````` example +-- +** +__ +. +

      -- +** +__

      +```````````````````````````````` + + +One to three spaces indent are allowed: + +```````````````````````````````` example + *** + *** + *** +. +
      +
      +
      +```````````````````````````````` + + +Four spaces is too many: + +```````````````````````````````` example + *** +. +
      ***
      +
      +```````````````````````````````` + + +```````````````````````````````` example +Foo + *** +. +

      Foo +***

      +```````````````````````````````` + + +More than three characters may be used: + +```````````````````````````````` example +_____________________________________ +. +
      +```````````````````````````````` + + +Spaces are allowed between the characters: + +```````````````````````````````` example + - - - +. +
      +```````````````````````````````` + + +```````````````````````````````` example + ** * ** * ** * ** +. +
      +```````````````````````````````` + + +```````````````````````````````` example +- - - - +. +
      +```````````````````````````````` + + +Spaces are allowed at the end: + +```````````````````````````````` example +- - - - +. +
      +```````````````````````````````` + + +However, no other characters may occur in the line: + +```````````````````````````````` example +_ _ _ _ a + +a------ + +---a--- +. +

      _ _ _ _ a

      +

      a------

      +

      ---a---

      +```````````````````````````````` + + +It is required that all of the [non-whitespace characters] be the same. +So, this is not a thematic break: + +```````````````````````````````` example + *-* +. +

      -

      +```````````````````````````````` + + +Thematic breaks do not need blank lines before or after: + +```````````````````````````````` example +- foo +*** +- bar +. +
        +
      • foo
      • +
      +
      +
        +
      • bar
      • +
      +```````````````````````````````` + + +Thematic breaks can interrupt a paragraph: + +```````````````````````````````` example +Foo +*** +bar +. +

      Foo

      +
      +

      bar

      +```````````````````````````````` + + +If a line of dashes that meets the above conditions for being a +thematic break could also be interpreted as the underline of a [setext +heading], the interpretation as a +[setext heading] takes precedence. Thus, for example, +this is a setext heading, not a paragraph followed by a thematic break: + +```````````````````````````````` example +Foo +--- +bar +. +

      Foo

      +

      bar

      +```````````````````````````````` + + +When both a thematic break and a list item are possible +interpretations of a line, the thematic break takes precedence: + +```````````````````````````````` example +* Foo +* * * +* Bar +. +
        +
      • Foo
      • +
      +
      +
        +
      • Bar
      • +
      +```````````````````````````````` + + +If you want a thematic break in a list item, use a different bullet: + +```````````````````````````````` example +- Foo +- * * * +. +
        +
      • Foo
      • +
      • +
        +
      • +
      +```````````````````````````````` + + +## ATX headings + +An [ATX heading](@) +consists of a string of characters, parsed as inline content, between an +opening sequence of 1--6 unescaped `#` characters and an optional +closing sequence of any number of unescaped `#` characters. +The opening sequence of `#` characters must be followed by a +[space] or by the end of line. The optional closing sequence of `#`s must be +preceded by a [space] and may be followed by spaces only. The opening +`#` character may be indented 0-3 spaces. The raw contents of the +heading are stripped of leading and trailing spaces before being parsed +as inline content. The heading level is equal to the number of `#` +characters in the opening sequence. + +Simple headings: + +```````````````````````````````` example +# foo +## foo +### foo +#### foo +##### foo +###### foo +. +

      foo

      +

      foo

      +

      foo

      +

      foo

      +
      foo
      +
      foo
      +```````````````````````````````` + + +More than six `#` characters is not a heading: + +```````````````````````````````` example +####### foo +. +

      ####### foo

      +```````````````````````````````` + + +At least one space is required between the `#` characters and the +heading's contents, unless the heading is empty. Note that many +implementations currently do not require the space. However, the +space was required by the +[original ATX implementation](http://www.aaronsw.com/2002/atx/atx.py), +and it helps prevent things like the following from being parsed as +headings: + +```````````````````````````````` example +#5 bolt + +#hashtag +. +

      #5 bolt

      +

      #hashtag

      +```````````````````````````````` + + +This is not a heading, because the first `#` is escaped: + +```````````````````````````````` example +\## foo +. +

      ## foo

      +```````````````````````````````` + + +Contents are parsed as inlines: + +```````````````````````````````` example +# foo *bar* \*baz\* +. +

      foo bar *baz*

      +```````````````````````````````` + + +Leading and trailing [whitespace] is ignored in parsing inline content: + +```````````````````````````````` example +# foo +. +

      foo

      +```````````````````````````````` + + +One to three spaces indentation are allowed: + +```````````````````````````````` example + ### foo + ## foo + # foo +. +

      foo

      +

      foo

      +

      foo

      +```````````````````````````````` + + +Four spaces are too much: + +```````````````````````````````` example + # foo +. +
      # foo
      +
      +```````````````````````````````` + + +```````````````````````````````` example +foo + # bar +. +

      foo +# bar

      +```````````````````````````````` + + +A closing sequence of `#` characters is optional: + +```````````````````````````````` example +## foo ## + ### bar ### +. +

      foo

      +

      bar

      +```````````````````````````````` + + +It need not be the same length as the opening sequence: + +```````````````````````````````` example +# foo ################################## +##### foo ## +. +

      foo

      +
      foo
      +```````````````````````````````` + + +Spaces are allowed after the closing sequence: + +```````````````````````````````` example +### foo ### +. +

      foo

      +```````````````````````````````` + + +A sequence of `#` characters with anything but [spaces] following it +is not a closing sequence, but counts as part of the contents of the +heading: + +```````````````````````````````` example +### foo ### b +. +

      foo ### b

      +```````````````````````````````` + + +The closing sequence must be preceded by a space: + +```````````````````````````````` example +# foo# +. +

      foo#

      +```````````````````````````````` + + +Backslash-escaped `#` characters do not count as part +of the closing sequence: + +```````````````````````````````` example +### foo \### +## foo #\## +# foo \# +. +

      foo ###

      +

      foo ###

      +

      foo #

      +```````````````````````````````` + + +ATX headings need not be separated from surrounding content by blank +lines, and they can interrupt paragraphs: + +```````````````````````````````` example +**** +## foo +**** +. +
      +

      foo

      +
      +```````````````````````````````` + + +```````````````````````````````` example +Foo bar +# baz +Bar foo +. +

      Foo bar

      +

      baz

      +

      Bar foo

      +```````````````````````````````` + + +ATX headings can be empty: + +```````````````````````````````` example +## +# +### ### +. +

      +

      +

      +```````````````````````````````` + + +## Setext headings + +A [setext heading](@) consists of one or more +lines of text, each containing at least one [non-whitespace +character], with no more than 3 spaces indentation, followed by +a [setext heading underline]. The lines of text must be such +that, were they not followed by the setext heading underline, +they would be interpreted as a paragraph: they cannot be +interpretable as a [code fence], [ATX heading][ATX headings], +[block quote][block quotes], [thematic break][thematic breaks], +[list item][list items], or [HTML block][HTML blocks]. + +A [setext heading underline](@) is a sequence of +`=` characters or a sequence of `-` characters, with no more than 3 +spaces indentation and any number of trailing spaces. If a line +containing a single `-` can be interpreted as an +empty [list item][list items], it should be interpreted this way +and not as a [setext heading underline]. + +The heading is a level 1 heading if `=` characters are used in +the [setext heading underline], and a level 2 heading if `-` +characters are used. The contents of the heading are the result +of parsing the preceding lines of text as CommonMark inline +content. + +In general, a setext heading need not be preceded or followed by a +blank line. However, it cannot interrupt a paragraph, so when a +setext heading comes after a paragraph, a blank line is needed between +them. + +Simple examples: + +```````````````````````````````` example +Foo *bar* +========= + +Foo *bar* +--------- +. +

      Foo bar

      +

      Foo bar

      +```````````````````````````````` + + +The content of the heading may span more than one line: + +```````````````````````````````` example +Foo *bar +baz* +==== +. +

      Foo bar +baz

      +```````````````````````````````` + +The contents are the result of parsing the heading's raw +content as inlines. The heading's raw content is formed by +concatenating the lines and removing initial and final +[whitespace]. + +```````````````````````````````` example + Foo *bar +baz*→ +==== +. +

      Foo bar +baz

      +```````````````````````````````` + + +The underlining can be any length: + +```````````````````````````````` example +Foo +------------------------- + +Foo += +. +

      Foo

      +

      Foo

      +```````````````````````````````` + + +The heading content can be indented up to three spaces, and need +not line up with the underlining: + +```````````````````````````````` example + Foo +--- + + Foo +----- + + Foo + === +. +

      Foo

      +

      Foo

      +

      Foo

      +```````````````````````````````` + + +Four spaces indent is too much: + +```````````````````````````````` example + Foo + --- + + Foo +--- +. +
      Foo
      +---
      +
      +Foo
      +
      +
      +```````````````````````````````` + + +The setext heading underline can be indented up to three spaces, and +may have trailing spaces: + +```````````````````````````````` example +Foo + ---- +. +

      Foo

      +```````````````````````````````` + + +Four spaces is too much: + +```````````````````````````````` example +Foo + --- +. +

      Foo +---

      +```````````````````````````````` + + +The setext heading underline cannot contain internal spaces: + +```````````````````````````````` example +Foo += = + +Foo +--- - +. +

      Foo += =

      +

      Foo

      +
      +```````````````````````````````` + + +Trailing spaces in the content line do not cause a line break: + +```````````````````````````````` example +Foo +----- +. +

      Foo

      +```````````````````````````````` + + +Nor does a backslash at the end: + +```````````````````````````````` example +Foo\ +---- +. +

      Foo\

      +```````````````````````````````` + + +Since indicators of block structure take precedence over +indicators of inline structure, the following are setext headings: + +```````````````````````````````` example +`Foo +---- +` + + +. +

      `Foo

      +

      `

      +

      <a title="a lot

      +

      of dashes"/>

      +```````````````````````````````` + + +The setext heading underline cannot be a [lazy continuation +line] in a list item or block quote: + +```````````````````````````````` example +> Foo +--- +. +
      +

      Foo

      +
      +
      +```````````````````````````````` + + +```````````````````````````````` example +> foo +bar +=== +. +
      +

      foo +bar +===

      +
      +```````````````````````````````` + + +```````````````````````````````` example +- Foo +--- +. +
        +
      • Foo
      • +
      +
      +```````````````````````````````` + + +A blank line is needed between a paragraph and a following +setext heading, since otherwise the paragraph becomes part +of the heading's content: + +```````````````````````````````` example +Foo +Bar +--- +. +

      Foo +Bar

      +```````````````````````````````` + + +But in general a blank line is not required before or after +setext headings: + +```````````````````````````````` example +--- +Foo +--- +Bar +--- +Baz +. +
      +

      Foo

      +

      Bar

      +

      Baz

      +```````````````````````````````` + + +Setext headings cannot be empty: + +```````````````````````````````` example + +==== +. +

      ====

      +```````````````````````````````` + + +Setext heading text lines must not be interpretable as block +constructs other than paragraphs. So, the line of dashes +in these examples gets interpreted as a thematic break: + +```````````````````````````````` example +--- +--- +. +
      +
      +```````````````````````````````` + + +```````````````````````````````` example +- foo +----- +. +
        +
      • foo
      • +
      +
      +```````````````````````````````` + + +```````````````````````````````` example + foo +--- +. +
      foo
      +
      +
      +```````````````````````````````` + + +```````````````````````````````` example +> foo +----- +. +
      +

      foo

      +
      +
      +```````````````````````````````` + + +If you want a heading with `> foo` as its literal text, you can +use backslash escapes: + +```````````````````````````````` example +\> foo +------ +. +

      > foo

      +```````````````````````````````` + + +**Compatibility note:** Most existing Markdown implementations +do not allow the text of setext headings to span multiple lines. +But there is no consensus about how to interpret + +``` markdown +Foo +bar +--- +baz +``` + +One can find four different interpretations: + +1. paragraph "Foo", heading "bar", paragraph "baz" +2. paragraph "Foo bar", thematic break, paragraph "baz" +3. paragraph "Foo bar --- baz" +4. heading "Foo bar", paragraph "baz" + +We find interpretation 4 most natural, and interpretation 4 +increases the expressive power of CommonMark, by allowing +multiline headings. Authors who want interpretation 1 can +put a blank line after the first paragraph: + +```````````````````````````````` example +Foo + +bar +--- +baz +. +

      Foo

      +

      bar

      +

      baz

      +```````````````````````````````` + + +Authors who want interpretation 2 can put blank lines around +the thematic break, + +```````````````````````````````` example +Foo +bar + +--- + +baz +. +

      Foo +bar

      +
      +

      baz

      +```````````````````````````````` + + +or use a thematic break that cannot count as a [setext heading +underline], such as + +```````````````````````````````` example +Foo +bar +* * * +baz +. +

      Foo +bar

      +
      +

      baz

      +```````````````````````````````` + + +Authors who want interpretation 3 can use backslash escapes: + +```````````````````````````````` example +Foo +bar +\--- +baz +. +

      Foo +bar +--- +baz

      +```````````````````````````````` + + +## Indented code blocks + +An [indented code block](@) is composed of one or more +[indented chunks] separated by blank lines. +An [indented chunk](@) is a sequence of non-blank lines, +each indented four or more spaces. The contents of the code block are +the literal contents of the lines, including trailing +[line endings], minus four spaces of indentation. +An indented code block has no [info string]. + +An indented code block cannot interrupt a paragraph, so there must be +a blank line between a paragraph and a following indented code block. +(A blank line is not needed, however, between a code block and a following +paragraph.) + +```````````````````````````````` example + a simple + indented code block +. +
      a simple
      +  indented code block
      +
      +```````````````````````````````` + + +If there is any ambiguity between an interpretation of indentation +as a code block and as indicating that material belongs to a [list +item][list items], the list item interpretation takes precedence: + +```````````````````````````````` example + - foo + + bar +. +
        +
      • +

        foo

        +

        bar

        +
      • +
      +```````````````````````````````` + + +```````````````````````````````` example +1. foo + + - bar +. +
        +
      1. +

        foo

        +
          +
        • bar
        • +
        +
      2. +
      +```````````````````````````````` + + + +The contents of a code block are literal text, and do not get parsed +as Markdown: + +```````````````````````````````` example +
      + *hi* + + - one +. +
      <a/>
      +*hi*
      +
      +- one
      +
      +```````````````````````````````` + + +Here we have three chunks separated by blank lines: + +```````````````````````````````` example + chunk1 + + chunk2 + + + + chunk3 +. +
      chunk1
      +
      +chunk2
      +
      +
      +
      +chunk3
      +
      +```````````````````````````````` + + +Any initial spaces beyond four will be included in the content, even +in interior blank lines: + +```````````````````````````````` example + chunk1 + + chunk2 +. +
      chunk1
      +  
      +  chunk2
      +
      +```````````````````````````````` + + +An indented code block cannot interrupt a paragraph. (This +allows hanging indents and the like.) + +```````````````````````````````` example +Foo + bar + +. +

      Foo +bar

      +```````````````````````````````` + + +However, any non-blank line with fewer than four leading spaces ends +the code block immediately. So a paragraph may occur immediately +after indented code: + +```````````````````````````````` example + foo +bar +. +
      foo
      +
      +

      bar

      +```````````````````````````````` + + +And indented code can occur immediately before and after other kinds of +blocks: + +```````````````````````````````` example +# Heading + foo +Heading +------ + foo +---- +. +

      Heading

      +
      foo
      +
      +

      Heading

      +
      foo
      +
      +
      +```````````````````````````````` + + +The first line can be indented more than four spaces: + +```````````````````````````````` example + foo + bar +. +
          foo
      +bar
      +
      +```````````````````````````````` + + +Blank lines preceding or following an indented code block +are not included in it: + +```````````````````````````````` example + + + foo + + +. +
      foo
      +
      +```````````````````````````````` + + +Trailing spaces are included in the code block's content: + +```````````````````````````````` example + foo +. +
      foo  
      +
      +```````````````````````````````` + + + +## Fenced code blocks + +A [code fence](@) is a sequence +of at least three consecutive backtick characters (`` ` ``) or +tildes (`~`). (Tildes and backticks cannot be mixed.) +A [fenced code block](@) +begins with a code fence, indented no more than three spaces. + +The line with the opening code fence may optionally contain some text +following the code fence; this is trimmed of leading and trailing +whitespace and called the [info string](@). If the [info string] comes +after a backtick fence, it may not contain any backtick +characters. (The reason for this restriction is that otherwise +some inline code would be incorrectly interpreted as the +beginning of a fenced code block.) + +The content of the code block consists of all subsequent lines, until +a closing [code fence] of the same type as the code block +began with (backticks or tildes), and with at least as many backticks +or tildes as the opening code fence. If the leading code fence is +indented N spaces, then up to N spaces of indentation are removed from +each line of the content (if present). (If a content line is not +indented, it is preserved unchanged. If it is indented less than N +spaces, all of the indentation is removed.) + +The closing code fence may be indented up to three spaces, and may be +followed only by spaces, which are ignored. If the end of the +containing block (or document) is reached and no closing code fence +has been found, the code block contains all of the lines after the +opening code fence until the end of the containing block (or +document). (An alternative spec would require backtracking in the +event that a closing code fence is not found. But this makes parsing +much less efficient, and there seems to be no real down side to the +behavior described here.) + +A fenced code block may interrupt a paragraph, and does not require +a blank line either before or after. 
+ +The content of a code fence is treated as literal text, not parsed +as inlines. The first word of the [info string] is typically used to +specify the language of the code sample, and rendered in the `class` +attribute of the `code` tag. However, this spec does not mandate any +particular treatment of the [info string]. + +Here is a simple example with backticks: + +```````````````````````````````` example +``` +< + > +``` +. +
      <
      + >
      +
      +```````````````````````````````` + + +With tildes: + +```````````````````````````````` example +~~~ +< + > +~~~ +. +
      <
      + >
      +
      +```````````````````````````````` + +Fewer than three backticks is not enough: + +```````````````````````````````` example +`` +foo +`` +. +

      foo

      +```````````````````````````````` + +The closing code fence must use the same character as the opening +fence: + +```````````````````````````````` example +``` +aaa +~~~ +``` +. +
      aaa
      +~~~
      +
      +```````````````````````````````` + + +```````````````````````````````` example +~~~ +aaa +``` +~~~ +. +
      aaa
      +```
      +
      +```````````````````````````````` + + +The closing code fence must be at least as long as the opening fence: + +```````````````````````````````` example +```` +aaa +``` +`````` +. +
      aaa
      +```
      +
      +```````````````````````````````` + + +```````````````````````````````` example +~~~~ +aaa +~~~ +~~~~ +. +
      aaa
      +~~~
      +
      +```````````````````````````````` + + +Unclosed code blocks are closed by the end of the document +(or the enclosing [block quote][block quotes] or [list item][list items]): + +```````````````````````````````` example +``` +. +
      +```````````````````````````````` + + +```````````````````````````````` example +````` + +``` +aaa +. +
      
      +```
      +aaa
      +
      +```````````````````````````````` + + +```````````````````````````````` example +> ``` +> aaa + +bbb +. +
      +
      aaa
      +
      +
      +

      bbb

      +```````````````````````````````` + + +A code block can have all empty lines as its content: + +```````````````````````````````` example +``` + + +``` +. +
      
      +  
      +
      +```````````````````````````````` + + +A code block can be empty: + +```````````````````````````````` example +``` +``` +. +
      +```````````````````````````````` + + +Fences can be indented. If the opening fence is indented, +content lines will have equivalent opening indentation removed, +if present: + +```````````````````````````````` example + ``` + aaa +aaa +``` +. +
      aaa
      +aaa
      +
      +```````````````````````````````` + + +```````````````````````````````` example + ``` +aaa + aaa +aaa + ``` +. +
      aaa
      +aaa
      +aaa
      +
      +```````````````````````````````` + + +```````````````````````````````` example + ``` + aaa + aaa + aaa + ``` +. +
      aaa
      + aaa
      +aaa
      +
      +```````````````````````````````` + + +Four spaces indentation produces an indented code block: + +```````````````````````````````` example + ``` + aaa + ``` +. +
      ```
      +aaa
      +```
      +
      +```````````````````````````````` + + +Closing fences may be indented by 0-3 spaces, and their indentation +need not match that of the opening fence: + +```````````````````````````````` example +``` +aaa + ``` +. +
      aaa
      +
      +```````````````````````````````` + + +```````````````````````````````` example + ``` +aaa + ``` +. +
      aaa
      +
      +```````````````````````````````` + + +This is not a closing fence, because it is indented 4 spaces: + +```````````````````````````````` example +``` +aaa + ``` +. +
      aaa
      +    ```
      +
      +```````````````````````````````` + + + +Code fences (opening and closing) cannot contain internal spaces: + +```````````````````````````````` example +``` ``` +aaa +. +

      +aaa

      +```````````````````````````````` + + +```````````````````````````````` example +~~~~~~ +aaa +~~~ ~~ +. +
      aaa
      +~~~ ~~
      +
      +```````````````````````````````` + + +Fenced code blocks can interrupt paragraphs, and can be followed +directly by paragraphs, without a blank line between: + +```````````````````````````````` example +foo +``` +bar +``` +baz +. +

      foo

      +
      bar
      +
      +

      baz

      +```````````````````````````````` + + +Other blocks can also occur before and after fenced code blocks +without an intervening blank line: + +```````````````````````````````` example +foo +--- +~~~ +bar +~~~ +# baz +. +

      foo

      +
      bar
      +
      +

      baz

      +```````````````````````````````` + + +An [info string] can be provided after the opening code fence. +Although this spec doesn't mandate any particular treatment of +the info string, the first word is typically used to specify +the language of the code block. In HTML output, the language is +normally indicated by adding a class to the `code` element consisting +of `language-` followed by the language name. + +```````````````````````````````` example +```ruby +def foo(x) + return 3 +end +``` +. +
      def foo(x)
      +  return 3
      +end
      +
      +```````````````````````````````` + + +```````````````````````````````` example +~~~~ ruby startline=3 $%@#$ +def foo(x) + return 3 +end +~~~~~~~ +. +
      def foo(x)
      +  return 3
      +end
      +
      +```````````````````````````````` + + +```````````````````````````````` example +````; +```` +. +
      +```````````````````````````````` + + +[Info strings] for backtick code blocks cannot contain backticks: + +```````````````````````````````` example +``` aa ``` +foo +. +

      aa +foo

      +```````````````````````````````` + + +[Info strings] for tilde code blocks can contain backticks and tildes: + +```````````````````````````````` example +~~~ aa ``` ~~~ +foo +~~~ +. +
      foo
      +
      +```````````````````````````````` + + +Closing code fences cannot have [info strings]: + +```````````````````````````````` example +``` +``` aaa +``` +. +
      ``` aaa
      +
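      +```````````````````````````````` + + + +## HTML blocks + +An [HTML block](@) is a group of lines that is treated +as raw HTML (and will not be escaped in HTML output). + +There are seven kinds of [HTML block], which can be defined by their +start and end conditions. The block begins with a line that meets a +[start condition](@) (after up to three spaces optional indentation). +It ends with the first subsequent line that meets a matching [end +condition](@), or the last line of the document, or the last line of +the [container block](#container-blocks) containing the current HTML +block, if no line is encountered that meets the [end condition]. If +the first line meets both the [start condition] and the [end +condition], the block will contain just that line. + +1. **Start condition:** line begins with the string `<script`, +`<pre`, or `<style` (case-insensitive), followed by whitespace, +the string `>`, or the end of the line.\ +**End condition:** line contains an end tag +`</script>`, `</pre>`, or `</style>` (case-insensitive; it +need not match the start tag). + +2. **Start condition:** line begins with the string `<!--`.\ +**End condition:** line contains the string `-->`. + +3. **Start condition:** line begins with the string `<?`.\ +**End condition:** line contains the string `?>`. + +4. **Start condition:** line begins with the string `<!` +followed by an uppercase ASCII letter.\ +**End condition:** line contains the character `>`. + +5. **Start condition:** line begins with the string +`<![CDATA[`.\ +**End condition:** line contains the string `]]>`. + +6. **Start condition:** line begins with the string `<` or `</` +followed by one of the strings (case-insensitive) `address`, +`article`, `aside`, `base`, `basefont`, `blockquote`, `body`, +`caption`, `center`, `col`, `colgroup`, `dd`, `details`, `dialog`, +`dir`, `div`, `dl`, `dt`, `fieldset`, `figcaption`, `figure`, +`footer`, `form`, `frame`, `frameset`, +`h1`, `h2`, `h3`, `h4`, `h5`, `h6`, `head`, `header`, `hr`, +`html`, `iframe`, `legend`, `li`, `link`, `main`, `menu`, `menuitem`, +`nav`, `noframes`, `ol`, `optgroup`, `option`, `p`, `param`, +`section`, `source`, `summary`, `table`, `tbody`, `td`, +`tfoot`, `th`, `thead`, `title`, `tr`, `track`, `ul`, followed +by [whitespace], the end of the line, the string `>`, or +the string `/>`.\ +**End condition:** line is followed by a [blank line]. + +7. **Start condition:** line begins with a complete [open tag] +(with any [tag name] other than `script`, +`style`, or `pre`) or a complete [closing tag], +followed only by [whitespace] or the end of the line.\ +**End condition:** line is followed by a [blank line]. + +HTML blocks continue until they are closed by their appropriate +[end condition], or the last line of the document or other [container +block](#container-blocks). This means any HTML **within an HTML +block** that might otherwise be recognised as a start condition will +be ignored by the parser and passed through as-is, without changing +the parser's state. + +For instance, `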
      <pre>` within an HTML block started by `<table>` will not affect
      +the parser state; as the HTML block was started by start condition 6, it
      +will end at any blank line. This can be surprising:
      +
      +```````````````````````````````` example
      +
      +
      +**Hello**,
      +
      +_world_.
      +
      +
      +. +
      +
      +**Hello**,
      +

      world. +

      +
      +```````````````````````````````` + +In this case, the HTML block is terminated by the newline — the `**Hello**` +text remains verbatim — and regular parsing resumes, with a paragraph, +emphasised `world` and inline and block HTML following. + +All types of [HTML blocks] except type 7 may interrupt +a paragraph. Blocks of type 7 may not interrupt a paragraph. +(This restriction is intended to prevent unwanted interpretation +of long tags inside a wrapped paragraph as starting HTML blocks.) + +Some simple examples follow. Here are some basic HTML blocks +of type 6: + +```````````````````````````````` example + + + + +
      + hi +
      + +okay. +. + + + + +
      + hi +
      +

      okay.

      +```````````````````````````````` + + +```````````````````````````````` example +
      +*foo* +```````````````````````````````` + + +Here we have two HTML blocks with a Markdown paragraph between them: + +```````````````````````````````` example +
      + +*Markdown* + +
      +. +
      +

      Markdown

      +
      +```````````````````````````````` + + +The tag on the first line can be partial, as long +as it is split where there would be whitespace: + +```````````````````````````````` example +
      +
      +. +
      +
      +```````````````````````````````` + + +```````````````````````````````` example +
      +
      +. +
      +
      +```````````````````````````````` + + +An open tag need not be closed: +```````````````````````````````` example +
      +*foo* + +*bar* +. +
      +*foo* +

      bar

      +```````````````````````````````` + + + +A partial tag need not even be completed (garbage +in, garbage out): + +```````````````````````````````` example +
      +. + +```````````````````````````````` + + +```````````````````````````````` example +
      +foo +
      +. +
      +foo +
      +```````````````````````````````` + + +Everything until the next blank line or end of document +gets included in the HTML block. So, in the following +example, what looks like a Markdown code block +is actually part of the HTML block, which continues until a blank +line or the end of the document is reached: + +```````````````````````````````` example +
      +``` c +int x = 33; +``` +. +
      +``` c +int x = 33; +``` +```````````````````````````````` + + +To start an [HTML block] with a tag that is *not* in the +list of block-level tags in (6), you must put the tag by +itself on the first line (and it must be complete): + +```````````````````````````````` example +<a href="foo"> +*bar* +</a> +. +<a href="foo"> +*bar* +</a> +```````````````````````````````` + + +In type 7 blocks, the [tag name] can be anything: + +```````````````````````````````` example +<Warning> +*bar* +</Warning> +. +<Warning> +*bar* +</Warning> +```````````````````````````````` + + +```````````````````````````````` example +<i class="foo"> +*bar* +</i> +. +<i class="foo"> +*bar* +</i> +```````````````````````````````` + + +```````````````````````````````` example +</ins> +*bar* +. +</ins> +*bar* +```````````````````````````````` + + +These rules are designed to allow us to work with tags that +can function as either block-level or inline-level tags. +The `<del>` tag is a nice example. We can surround content with +`<del>` tags in three different ways. In this case, we get a raw +HTML block, because the `<del>` tag is on a line by itself: + +```````````````````````````````` example +<del> +*foo* +</del> +. +<del> +*foo* +</del> +```````````````````````````````` + + +In this case, we get a raw HTML block that just includes +the `<del>` tag (because it ends with the following blank +line). So the contents get interpreted as CommonMark: + +```````````````````````````````` example +<del> + +*foo* + +</del> +. +<del> +

      foo

      +
      +```````````````````````````````` + + +Finally, in this case, the `<del>` tags are interpreted +as [raw HTML] *inside* the CommonMark paragraph. (Because +the tag is not on a line by itself, we get inline HTML +rather than an [HTML block].) + +```````````````````````````````` example +<del>*foo*</del> +. +

      foo

      +```````````````````````````````` + + +HTML tags designed to contain literal content +(`script`, `style`, `pre`), comments, processing instructions, +and declarations are treated somewhat differently. +Instead of ending at the first blank line, these blocks +end at the first line containing a corresponding end tag. +As a result, these blocks can contain blank lines: + +A pre tag (type 1): + +```````````````````````````````` example +
      
      +import Text.HTML.TagSoup
      +
      +main :: IO ()
      +main = print $ parseTags tags
      +
      +okay +. +
      
      +import Text.HTML.TagSoup
      +
      +main :: IO ()
      +main = print $ parseTags tags
      +
      +

      okay

      +```````````````````````````````` + + +A script tag (type 1): + +```````````````````````````````` example + +okay +. + +

      okay

      +```````````````````````````````` + + +A style tag (type 1): + +```````````````````````````````` example + +okay +. + +

      okay

      +```````````````````````````````` + + +If there is no matching end tag, the block will end at the +end of the document (or the enclosing [block quote][block quotes] +or [list item][list items]): + +```````````````````````````````` example + +*foo* +. + +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +*bar* +*baz* +. +*bar* +

      baz

      +```````````````````````````````` + + +Note that anything on the last line after the +end tag will be included in the [HTML block]: + +```````````````````````````````` example +1. *bar* +. +1. *bar* +```````````````````````````````` + + +A comment (type 2): + +```````````````````````````````` example + +okay +. + +

      okay

      +```````````````````````````````` + + + +A processing instruction (type 3): + +```````````````````````````````` example +'; + +?> +okay +. +'; + +?> +

      okay

      +```````````````````````````````` + + +A declaration (type 4): + +```````````````````````````````` example + +. + +```````````````````````````````` + + +CDATA (type 5): + +```````````````````````````````` example + +okay +. + +

      okay

      +```````````````````````````````` + + +The opening tag can be indented 1-3 spaces, but not 4: + +```````````````````````````````` example + + + +. + +
      <!-- foo -->
      +
      +```````````````````````````````` + + +```````````````````````````````` example +
      + +
      +. +
      +
      <div>
      +
      +```````````````````````````````` + + +An HTML block of types 1--6 can interrupt a paragraph, and need not be +preceded by a blank line. + +```````````````````````````````` example +Foo +
      +bar +
      +. +

      Foo

      +
      +bar +
      +```````````````````````````````` + + +However, a following blank line is needed, except at the end of +a document, and except for blocks of types 1--5, [above][HTML +block]: + +```````````````````````````````` example +
      +bar +
      +*foo* +. +
      +bar +
      +*foo* +```````````````````````````````` + + +HTML blocks of type 7 cannot interrupt a paragraph: + +```````````````````````````````` example +Foo + +baz +. +

      Foo + +baz

      +```````````````````````````````` + + +This rule differs from John Gruber's original Markdown syntax +specification, which says: + +> The only restrictions are that block-level HTML elements — +> e.g. `
      <div>`, `<table>`, `<pre>`, `<p>
      `, etc. — must be separated from +> surrounding content by blank lines, and the start and end tags of the +> block should not be indented with tabs or spaces. + +In some ways Gruber's rule is more restrictive than the one given +here: + +- It requires that an HTML block be preceded by a blank line. +- It does not allow the start tag to be indented. +- It requires a matching end tag, which it also does not allow to + be indented. + +Most Markdown implementations (including some of Gruber's own) do not +respect all of these restrictions. + +There is one respect, however, in which Gruber's rule is more liberal +than the one given here, since it allows blank lines to occur inside +an HTML block. There are two reasons for disallowing them here. +First, it removes the need to parse balanced tags, which is +expensive and can require backtracking from the end of the document +if no matching end tag is found. Second, it provides a very simple +and flexible way of including Markdown content inside HTML tags: +simply separate the Markdown from the HTML using blank lines: + +Compare: + +```````````````````````````````` example +

      + +*Emphasized* text. + +
      +. +
      +

      Emphasized text.

      +
      +```````````````````````````````` + + +```````````````````````````````` example +
      +*Emphasized* text. +
      +. +
      +*Emphasized* text. +
      +```````````````````````````````` + + +Some Markdown implementations have adopted a convention of +interpreting content inside tags as text if the open tag has +the attribute `markdown=1`. The rule given above seems a simpler and +more elegant way of achieving the same expressive power, which is also +much simpler to parse. + +The main potential drawback is that one can no longer paste HTML +blocks into Markdown documents with 100% reliability. However, +*in most cases* this will work fine, because the blank lines in +HTML are usually followed by HTML block tags. For example: + +```````````````````````````````` example +
      + + + + + + + +
      +Hi +
      +. + + + + +
      +Hi +
      +```````````````````````````````` + + +There are problems, however, if the inner tags are indented +*and* separated by spaces, as then they will be interpreted as +an indented code block: + +```````````````````````````````` example + + + + + + + + +
      + Hi +
      +. + + +
      <td>
      +  Hi
      +</td>
      +
      + +
      +```````````````````````````````` + + +Fortunately, blank lines are usually not necessary and can be +deleted. The exception is inside `
      <pre>` tags, but as described
      +[above][HTML blocks], raw HTML blocks starting with `
      <pre>`
      +*can* contain blank lines.
      +
      +## Link reference definitions
      +
      +A [link reference definition](@)
      +consists of a [link label], indented up to three spaces, followed
      +by a colon (`:`), optional [whitespace] (including up to one
      +[line ending]), a [link destination],
      +optional [whitespace] (including up to one
      +[line ending]), and an optional [link
      +title], which if it is present must be separated
      +from the [link destination] by [whitespace].
      +No further [non-whitespace characters] may occur on the line.
      +
      +A [link reference definition]
      +does not correspond to a structural element of a document.  Instead, it
      +defines a label which can be used in [reference links]
      +and reference-style [images] elsewhere in the document.  [Link
      +reference definitions] can come either before or after the links that use
      +them.
      +
      +```````````````````````````````` example
      +[foo]: /url "title"
      +
      +[foo]
      +.
      +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example + [foo]: + /url + 'the title' + +[foo] +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +[Foo*bar\]]:my_(url) 'title (with parens)' + +[Foo*bar\]] +. +

      Foo*bar]

      +```````````````````````````````` + + +```````````````````````````````` example +[Foo bar]: + +'title' + +[Foo bar] +. +

      Foo bar

      +```````````````````````````````` + + +The title may extend over multiple lines: + +```````````````````````````````` example +[foo]: /url ' +title +line1 +line2 +' + +[foo] +. +

      foo

      +```````````````````````````````` + + +However, it may not contain a [blank line]: + +```````````````````````````````` example +[foo]: /url 'title + +with blank line' + +[foo] +. +

      [foo]: /url 'title

      +

      with blank line'

      +

      [foo]

      +```````````````````````````````` + + +The title may be omitted: + +```````````````````````````````` example +[foo]: +/url + +[foo] +. +

      foo

      +```````````````````````````````` + + +The link destination may not be omitted: + +```````````````````````````````` example +[foo]: + +[foo] +. +

      [foo]:

      +

      [foo]

      +```````````````````````````````` + + However, an empty link destination may be specified using + angle brackets: + +```````````````````````````````` example +[foo]: <> + +[foo] +. +

      foo

      +```````````````````````````````` + +The title must be separated from the link destination by +whitespace: + +```````````````````````````````` example +[foo]: (baz) + +[foo] +. +

      [foo]: (baz)

      +

      [foo]

      +```````````````````````````````` + + +Both title and destination can contain backslash escapes +and literal backslashes: + +```````````````````````````````` example +[foo]: /url\bar\*baz "foo\"bar\baz" + +[foo] +. +

      foo

      +```````````````````````````````` + + +A link can come before its corresponding definition: + +```````````````````````````````` example +[foo] + +[foo]: url +. +

      foo

      +```````````````````````````````` + + +If there are several matching definitions, the first one takes +precedence: + +```````````````````````````````` example +[foo] + +[foo]: first +[foo]: second +. +

      foo

      +```````````````````````````````` + + +As noted in the section on [Links], matching of labels is +case-insensitive (see [matches]). + +```````````````````````````````` example +[FOO]: /url + +[Foo] +. +

      Foo

      +```````````````````````````````` + + +```````````````````````````````` example +[ΑΓΩ]: /φου + +[αγω] +. +

      αγω

      +```````````````````````````````` + + +Here is a link reference definition with no corresponding link. +It contributes nothing to the document. + +```````````````````````````````` example +[foo]: /url +. +```````````````````````````````` + + +Here is another one: + +```````````````````````````````` example +[ +foo +]: /url +bar +. +

      bar

      +```````````````````````````````` + + +This is not a link reference definition, because there are +[non-whitespace characters] after the title: + +```````````````````````````````` example +[foo]: /url "title" ok +. +

      [foo]: /url "title" ok

      +```````````````````````````````` + + +This is a link reference definition, but it has no title: + +```````````````````````````````` example +[foo]: /url +"title" ok +. +

      "title" ok

      +```````````````````````````````` + + +This is not a link reference definition, because it is indented +four spaces: + +```````````````````````````````` example + [foo]: /url "title" + +[foo] +. +
      [foo]: /url "title"
      +
      +

      [foo]

      +```````````````````````````````` + + +This is not a link reference definition, because it occurs inside +a code block: + +```````````````````````````````` example +``` +[foo]: /url +``` + +[foo] +. +
      [foo]: /url
      +
      +

      [foo]

      +```````````````````````````````` + + +A [link reference definition] cannot interrupt a paragraph. + +```````````````````````````````` example +Foo +[bar]: /baz + +[bar] +. +

      Foo +[bar]: /baz

      +

      [bar]

      +```````````````````````````````` + + +However, it can directly follow other block elements, such as headings +and thematic breaks, and it need not be followed by a blank line. + +```````````````````````````````` example +# [Foo] +[foo]: /url +> bar +. +

      Foo

      +
      +

      bar

      +
      +```````````````````````````````` + +```````````````````````````````` example +[foo]: /url +bar +=== +[foo] +. +

      bar

      +

      foo

      +```````````````````````````````` + +```````````````````````````````` example +[foo]: /url +=== +[foo] +. +

      === +foo

      +```````````````````````````````` + + +Several [link reference definitions] +can occur one after another, without intervening blank lines. + +```````````````````````````````` example +[foo]: /foo-url "foo" +[bar]: /bar-url + "bar" +[baz]: /baz-url + +[foo], +[bar], +[baz] +. +

      foo, +bar, +baz

      +```````````````````````````````` + + +[Link reference definitions] can occur +inside block containers, like lists and block quotations. They +affect the entire document, not just the container in which they +are defined: + +```````````````````````````````` example +[foo] + +> [foo]: /url +. +

      foo

      +
      +
      +```````````````````````````````` + + +Whether something is a [link reference definition] is +independent of whether the link reference it defines is +used in the document. Thus, for example, the following +document contains just a link reference definition, and +no visible content: + +```````````````````````````````` example +[foo]: /url +. +```````````````````````````````` + + +## Paragraphs + +A sequence of non-blank lines that cannot be interpreted as other +kinds of blocks forms a [paragraph](@). +The contents of the paragraph are the result of parsing the +paragraph's raw content as inlines. The paragraph's raw content +is formed by concatenating the lines and removing initial and final +[whitespace]. + +A simple example with two paragraphs: + +```````````````````````````````` example +aaa + +bbb +. +

      aaa

      +

      bbb

      +```````````````````````````````` + + +Paragraphs can contain multiple lines, but no blank lines: + +```````````````````````````````` example +aaa +bbb + +ccc +ddd +. +

      aaa +bbb

      +

      ccc +ddd

      +```````````````````````````````` + + +Multiple blank lines between paragraphs have no effect: + +```````````````````````````````` example +aaa + + +bbb +. +

      aaa

      +

      bbb

      +```````````````````````````````` + + +Leading spaces are skipped: + +```````````````````````````````` example + aaa + bbb +. +

      aaa +bbb

      +```````````````````````````````` + + +Lines after the first may be indented any amount, since indented +code blocks cannot interrupt paragraphs. + +```````````````````````````````` example +aaa + bbb + ccc +. +

      aaa +bbb +ccc

      +```````````````````````````````` + + +However, the first line may be indented at most three spaces, +or an indented code block will be triggered: + +```````````````````````````````` example + aaa +bbb +. +

      aaa +bbb

      +```````````````````````````````` + + +```````````````````````````````` example + aaa +bbb +. +
      aaa
      +
      +

      bbb

      +```````````````````````````````` + + +Final spaces are stripped before inline parsing, so a paragraph +that ends with two or more spaces will not end with a [hard line +break]: + +```````````````````````````````` example +aaa +bbb +. +

      aaa
      +bbb

      +```````````````````````````````` + + +## Blank lines + +[Blank lines] between block-level elements are ignored, +except for the role they play in determining whether a [list] +is [tight] or [loose]. + +Blank lines at the beginning and end of the document are also ignored. + +```````````````````````````````` example + + +aaa + + +# aaa + + +. +

      aaa

      +

      aaa

      +```````````````````````````````` + + + +# Container blocks + +A [container block](#container-blocks) is a block that has other +blocks as its contents. There are two basic kinds of container blocks: +[block quotes] and [list items]. +[Lists] are meta-containers for [list items]. + +We define the syntax for container blocks recursively. The general +form of the definition is: + +> If X is a sequence of blocks, then the result of +> transforming X in such-and-such a way is a container of type Y +> with these blocks as its content. + +So, we explain what counts as a block quote or list item by explaining +how these can be *generated* from their contents. This should suffice +to define the syntax, although it does not give a recipe for *parsing* +these constructions. (A recipe is provided below in the section entitled +[A parsing strategy](#appendix-a-parsing-strategy).) + +## Block quotes + +A [block quote marker](@) +consists of 0-3 spaces of initial indent, plus (a) the character `>` together +with a following space, or (b) a single character `>` not followed by a space. + +The following rules define [block quotes]: + +1. **Basic case.** If a string of lines *Ls* constitute a sequence + of blocks *Bs*, then the result of prepending a [block quote + marker] to the beginning of each line in *Ls* + is a [block quote](#block-quotes) containing *Bs*. + +2. **Laziness.** If a string of lines *Ls* constitute a [block + quote](#block-quotes) with contents *Bs*, then the result of deleting + the initial [block quote marker] from one or + more lines in which the next [non-whitespace character] after the [block + quote marker] is [paragraph continuation + text] is a block quote with *Bs* as its content. + [Paragraph continuation text](@) is text + that will be parsed as part of the content of a paragraph, but does + not occur at the beginning of the paragraph. + +3. 
**Consecutiveness.** A document cannot contain two [block + quotes] in a row unless there is a [blank line] between them. + +Nothing else counts as a [block quote](#block-quotes). + +Here is a simple example: + +```````````````````````````````` example +> # Foo +> bar +> baz +. +
      +

      Foo

      +

      bar +baz

      +
      +```````````````````````````````` + + +The spaces after the `>` characters can be omitted: + +```````````````````````````````` example +># Foo +>bar +> baz +. +
      +

      Foo

      +

      bar +baz

      +
      +```````````````````````````````` + + +The `>` characters can be indented 1-3 spaces: + +```````````````````````````````` example + > # Foo + > bar + > baz +. +
      +

      Foo

      +

      bar +baz

      +
      +```````````````````````````````` + + +Four spaces gives us a code block: + +```````````````````````````````` example + > # Foo + > bar + > baz +. +
      > # Foo
      +> bar
      +> baz
      +
      +```````````````````````````````` + + +The Laziness clause allows us to omit the `>` before +[paragraph continuation text]: + +```````````````````````````````` example +> # Foo +> bar +baz +. +
      +

      Foo

      +

      bar +baz

      +
      +```````````````````````````````` + + +A block quote can contain some lazy and some non-lazy +continuation lines: + +```````````````````````````````` example +> bar +baz +> foo +. +
      +

      bar +baz +foo

      +
      +```````````````````````````````` + + +Laziness only applies to lines that would have been continuations of +paragraphs had they been prepended with [block quote markers]. +For example, the `> ` cannot be omitted in the second line of + +``` markdown +> foo +> --- +``` + +without changing the meaning: + +```````````````````````````````` example +> foo +--- +. +
      +

      foo

      +
      +
      +```````````````````````````````` + + +Similarly, if we omit the `> ` in the second line of + +``` markdown +> - foo +> - bar +``` + +then the block quote ends after the first line: + +```````````````````````````````` example +> - foo +- bar +. +
      +
        +
      • foo
      • +
      +
      +
        +
      • bar
      • +
      +```````````````````````````````` + + +For the same reason, we can't omit the `> ` in front of +subsequent lines of an indented or fenced code block: + +```````````````````````````````` example +> foo + bar +. +
      +
      foo
      +
      +
      +
      bar
      +
      +```````````````````````````````` + + +```````````````````````````````` example +> ``` +foo +``` +. +
      +
      +
      +

      foo

      +
      +```````````````````````````````` + + +Note that in the following case, we have a [lazy +continuation line]: + +```````````````````````````````` example +> foo + - bar +. +
      +

      foo +- bar

      +
      +```````````````````````````````` + + +To see why, note that in + +```markdown +> foo +> - bar +``` + +the `- bar` is indented too far to start a list, and can't +be an indented code block because indented code blocks cannot +interrupt paragraphs, so it is [paragraph continuation text]. + +A block quote can be empty: + +```````````````````````````````` example +> +. +
      +
      +```````````````````````````````` + + +```````````````````````````````` example +> +> +> +. +
      +
      +```````````````````````````````` + + +A block quote can have initial or final blank lines: + +```````````````````````````````` example +> +> foo +> +. +
      +

      foo

      +
      +```````````````````````````````` + + +A blank line always separates block quotes: + +```````````````````````````````` example +> foo + +> bar +. +
      +

      foo

      +
      +
      +

      bar

      +
      +```````````````````````````````` + + +(Most current Markdown implementations, including John Gruber's +original `Markdown.pl`, will parse this example as a single block quote +with two paragraphs. But it seems better to allow the author to decide +whether two block quotes or one are wanted.) + +Consecutiveness means that if we put these block quotes together, +we get a single block quote: + +```````````````````````````````` example +> foo +> bar +. +
      +

      foo +bar

      +
      +```````````````````````````````` + + +To get a block quote with two paragraphs, use: + +```````````````````````````````` example +> foo +> +> bar +. +
      +

      foo

      +

      bar

      +
      +```````````````````````````````` + + +Block quotes can interrupt paragraphs: + +```````````````````````````````` example +foo +> bar +. +

      foo

      +
      +

      bar

      +
      +```````````````````````````````` + + +In general, blank lines are not needed before or after block +quotes: + +```````````````````````````````` example +> aaa +*** +> bbb +. +
      +

      aaa

      +
      +
      +
      +

      bbb

      +
      +```````````````````````````````` + + +However, because of laziness, a blank line is needed between +a block quote and a following paragraph: + +```````````````````````````````` example +> bar +baz +. +
      +

      bar +baz

      +
      +```````````````````````````````` + + +```````````````````````````````` example +> bar + +baz +. +
      +

      bar

      +
      +

      baz

      +```````````````````````````````` + + +```````````````````````````````` example +> bar +> +baz +. +
      +

      bar

      +
      +

      baz

      +```````````````````````````````` + + +It is a consequence of the Laziness rule that any number +of initial `>`s may be omitted on a continuation line of a +nested block quote: + +```````````````````````````````` example +> > > foo +bar +. +
      +
      +
      +

      foo +bar

      +
      +
      +
      +```````````````````````````````` + + +```````````````````````````````` example +>>> foo +> bar +>>baz +. +
      +
      +
      +

      foo +bar +baz

      +
      +
      +
      +```````````````````````````````` + + +When including an indented code block in a block quote, +remember that the [block quote marker] includes +both the `>` and a following space. So *five spaces* are needed after +the `>`: + +```````````````````````````````` example +> code + +> not code +. +
      +
      code
      +
      +
      +
      +

      not code

      +
      +```````````````````````````````` + + + +## List items + +A [list marker](@) is a +[bullet list marker] or an [ordered list marker]. + +A [bullet list marker](@) +is a `-`, `+`, or `*` character. + +An [ordered list marker](@) +is a sequence of 1--9 arabic digits (`0-9`), followed by either a +`.` character or a `)` character. (The reason for the length +limit is that with 10 digits we start seeing integer overflows +in some browsers.) + +The following rules define [list items]: + +1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of + blocks *Bs* starting with a [non-whitespace character], and *M* is a + list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces, then the result + of prepending *M* and the following spaces to the first line of + *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a + list item with *Bs* as its contents. The type of the list item + (bullet or ordered) is determined by the type of its list marker. + If the list item is ordered, then it is also assigned a start + number, based on the ordered list marker. + + Exceptions: + + 1. When the first list item in a [list] interrupts + a paragraph---that is, when it starts on a line that would + otherwise count as [paragraph continuation text]---then (a) + the lines *Ls* must not begin with a blank line, and (b) if + the list item is ordered, the start number must be 1. + 2. If any line is a [thematic break][thematic breaks] then + that line is not a list item. + +For example, let *Ls* be the lines + +```````````````````````````````` example +A paragraph +with two lines. + + indented code + +> A block quote. +. +

      A paragraph +with two lines.

      +
      indented code
      +
      +
      +

      A block quote.

      +
      +```````````````````````````````` + + +And let *M* be the marker `1.`, and *N* = 2. Then rule #1 says +that the following is an ordered list item with start number 1, +and the same contents as *Ls*: + +```````````````````````````````` example +1. A paragraph + with two lines. + + indented code + + > A block quote. +. +
        +
      1. +

        A paragraph +with two lines.

        +
        indented code
        +
        +
        +

        A block quote.

        +
        +
      2. +
      +```````````````````````````````` + + +The most important thing to notice is that the position of +the text after the list marker determines how much indentation +is needed in subsequent blocks in the list item. If the list +marker takes up two spaces, and there are three spaces between +the list marker and the next [non-whitespace character], then blocks +must be indented five spaces in order to fall under the list +item. + +Here are some examples showing how far content must be indented to be +put under the list item: + +```````````````````````````````` example +- one + + two +. +
        +
      • one
      • +
      +

      two

      +```````````````````````````````` + + +```````````````````````````````` example +- one + + two +. +
        +
      • +

        one

        +

        two

        +
      • +
      +```````````````````````````````` + + +```````````````````````````````` example + - one + + two +. +
        +
      • one
      • +
      +
       two
      +
      +```````````````````````````````` + + +```````````````````````````````` example + - one + + two +. +
        +
      • +

        one

        +

        two

        +
      • +
      +```````````````````````````````` + + +It is tempting to think of this in terms of columns: the continuation +blocks must be indented at least to the column of the first +[non-whitespace character] after the list marker. However, that is not quite right. +The spaces after the list marker determine how much relative indentation +is needed. Which column this indentation reaches will depend on +how the list item is embedded in other constructions, as shown by +this example: + +```````````````````````````````` example + > > 1. one +>> +>> two +. +
      +
      +
        +
      1. +

        one

        +

        two

        +
      2. +
      +
      +
      +```````````````````````````````` + + +Here `two` occurs in the same column as the list marker `1.`, +but is actually contained in the list item, because there is +sufficient indentation after the last containing blockquote marker. + +The converse is also possible. In the following example, the word `two` +occurs far to the right of the initial text of the list item, `one`, but +it is not considered part of the list item, because it is not indented +far enough past the blockquote marker: + +```````````````````````````````` example +>>- one +>> + > > two +. +
      +
      +
        +
      • one
      • +
      +

      two

      +
      +
      +```````````````````````````````` + + +Note that at least one space is needed between the list marker and +any following content, so these are not list items: + +```````````````````````````````` example +-one + +2.two +. +

      -one

      +

      2.two

      +```````````````````````````````` + + +A list item may contain blocks that are separated by more than +one blank line. + +```````````````````````````````` example +- foo + + + bar +. +
        +
      • +

        foo

        +

        bar

        +
      • +
      +```````````````````````````````` + + +A list item may contain any kind of block: + +```````````````````````````````` example +1. foo + + ``` + bar + ``` + + baz + + > bam +. +
        +
      1. +

        foo

        +
        bar
        +
        +

        baz

        +
        +

        bam

        +
        +
      2. +
      +```````````````````````````````` + + +A list item that contains an indented code block will preserve +empty lines within the code block verbatim. + +```````````````````````````````` example +- Foo + + bar + + + baz +. +
        +
      • +

        Foo

        +
        bar
        +
        +
        +baz
        +
        +
      • +
      +```````````````````````````````` + +Note that ordered list start numbers must be nine digits or less: + +```````````````````````````````` example +123456789. ok +. +
        +
      1. ok
      2. +
      +```````````````````````````````` + + +```````````````````````````````` example +1234567890. not ok +. +

      1234567890. not ok

      +```````````````````````````````` + + +A start number may begin with 0s: + +```````````````````````````````` example +0. ok +. +
        +
      1. ok
      2. +
      +```````````````````````````````` + + +```````````````````````````````` example +003. ok +. +
        +
      1. ok
      2. +
      +```````````````````````````````` + + +A start number may not be negative: + +```````````````````````````````` example +-1. not ok +. +

      -1. not ok

      +```````````````````````````````` + + + +2. **Item starting with indented code.** If a sequence of lines *Ls* + constitute a sequence of blocks *Bs* starting with an indented code + block, and *M* is a list marker of width *W* followed by + one space, then the result of prepending *M* and the following + space to the first line of *Ls*, and indenting subsequent lines of + *Ls* by *W + 1* spaces, is a list item with *Bs* as its contents. + If a line is empty, then it need not be indented. The type of the + list item (bullet or ordered) is determined by the type of its list + marker. If the list item is ordered, then it is also assigned a + start number, based on the ordered list marker. + +An indented code block will have to be indented four spaces beyond +the edge of the region where text will be included in the list item. +In the following case that is 6 spaces: + +```````````````````````````````` example +- foo + + bar +. +
        +
      • +

        foo

        +
        bar
        +
        +
      • +
      +```````````````````````````````` + + +And in this case it is 11 spaces: + +```````````````````````````````` example + 10. foo + + bar +. +
        +
      1. +

        foo

        +
        bar
        +
        +
      2. +
      +```````````````````````````````` + + +If the *first* block in the list item is an indented code block, +then by rule #2, the contents must be indented *one* space after the +list marker: + +```````````````````````````````` example + indented code + +paragraph + + more code +. +
      indented code
      +
      +

      paragraph

      +
      more code
      +
      +```````````````````````````````` + + +```````````````````````````````` example +1. indented code + + paragraph + + more code +. +
        +
      1. +
        indented code
        +
        +

        paragraph

        +
        more code
        +
        +
      2. +
      +```````````````````````````````` + + +Note that an additional space indent is interpreted as space +inside the code block: + +```````````````````````````````` example +1. indented code + + paragraph + + more code +. +
        +
      1. +
         indented code
        +
        +

        paragraph

        +
        more code
        +
        +
      2. +
      +```````````````````````````````` + + +Note that rules #1 and #2 only apply to two cases: (a) cases +in which the lines to be included in a list item begin with a +[non-whitespace character], and (b) cases in which +they begin with an indented code +block. In a case like the following, where the first block begins with +a three-space indent, the rules do not allow us to form a list item by +indenting the whole thing and prepending a list marker: + +```````````````````````````````` example + foo + +bar +. +

      foo

      +

      bar

      +```````````````````````````````` + + +```````````````````````````````` example +- foo + + bar +. +
        +
      • foo
      • +
      +

      bar

      +```````````````````````````````` + + +This is not a significant restriction, because when a block begins +with 1-3 spaces indent, the indentation can always be removed without +a change in interpretation, allowing rule #1 to be applied. So, in +the above case: + +```````````````````````````````` example +- foo + + bar +. +
        +
      • +

        foo

        +

        bar

        +
      • +
      +```````````````````````````````` + + +3. **Item starting with a blank line.** If a sequence of lines *Ls* + starting with a single [blank line] constitute a (possibly empty) + sequence of blocks *Bs*, not separated from each other by more than + one blank line, and *M* is a list marker of width *W*, + then the result of prepending *M* to the first line of *Ls*, and + indenting subsequent lines of *Ls* by *W + 1* spaces, is a list + item with *Bs* as its contents. + If a line is empty, then it need not be indented. The type of the + list item (bullet or ordered) is determined by the type of its list + marker. If the list item is ordered, then it is also assigned a + start number, based on the ordered list marker. + +Here are some list items that start with a blank line but are not empty: + +```````````````````````````````` example +- + foo +- + ``` + bar + ``` +- + baz +. +
        +
      • foo
      • +
      • +
        bar
        +
        +
      • +
      • +
        baz
        +
        +
      • +
      +```````````````````````````````` + +When the list item starts with a blank line, the number of spaces +following the list marker doesn't change the required indentation: + +```````````````````````````````` example +- + foo +. +
        +
      • foo
      • +
      +```````````````````````````````` + + +A list item can begin with at most one blank line. +In the following example, `foo` is not part of the list +item: + +```````````````````````````````` example +- + + foo +. +
        +
      • +
      +

      foo

      +```````````````````````````````` + + +Here is an empty bullet list item: + +```````````````````````````````` example +- foo +- +- bar +. +
        +
      • foo
      • +
      • +
      • bar
      • +
      +```````````````````````````````` + + +It does not matter whether there are spaces following the [list marker]: + +```````````````````````````````` example +- foo +- +- bar +. +
        +
      • foo
      • +
      • +
      • bar
      • +
      +```````````````````````````````` + + +Here is an empty ordered list item: + +```````````````````````````````` example +1. foo +2. +3. bar +. +
        +
      1. foo
      2. +
      3. +
      4. bar
      5. +
      +```````````````````````````````` + + +A list may start or end with an empty list item: + +```````````````````````````````` example +* +. +
        +
      • +
      +```````````````````````````````` + +However, an empty list item cannot interrupt a paragraph: + +```````````````````````````````` example +foo +* + +foo +1. +. +

      foo +*

      +

      foo +1.

      +```````````````````````````````` + + +4. **Indentation.** If a sequence of lines *Ls* constitutes a list item + according to rule #1, #2, or #3, then the result of indenting each line + of *Ls* by 1-3 spaces (the same for each line) also constitutes a + list item with the same contents and attributes. If a line is + empty, then it need not be indented. + +Indented one space: + +```````````````````````````````` example + 1. A paragraph + with two lines. + + indented code + + > A block quote. +. +
        +
      1. +

        A paragraph +with two lines.

        +
        indented code
        +
        +
        +

        A block quote.

        +
        +
      2. +
      +```````````````````````````````` + + +Indented two spaces: + +```````````````````````````````` example + 1. A paragraph + with two lines. + + indented code + + > A block quote. +. +
        +
      1. +

        A paragraph +with two lines.

        +
        indented code
        +
        +
        +

        A block quote.

        +
        +
      2. +
      +```````````````````````````````` + + +Indented three spaces: + +```````````````````````````````` example + 1. A paragraph + with two lines. + + indented code + + > A block quote. +. +
        +
      1. +

        A paragraph +with two lines.

        +
        indented code
        +
        +
        +

        A block quote.

        +
        +
      2. +
      +```````````````````````````````` + + +Four spaces indent gives a code block: + +```````````````````````````````` example + 1. A paragraph + with two lines. + + indented code + + > A block quote. +. +
      1.  A paragraph
      +    with two lines.
      +
      +        indented code
      +
      +    > A block quote.
      +
      +```````````````````````````````` + + + +5. **Laziness.** If a string of lines *Ls* constitute a [list + item](#list-items) with contents *Bs*, then the result of deleting + some or all of the indentation from one or more lines in which the + next [non-whitespace character] after the indentation is + [paragraph continuation text] is a + list item with the same contents and attributes. The unindented + lines are called + [lazy continuation line](@)s. + +Here is an example with [lazy continuation lines]: + +```````````````````````````````` example + 1. A paragraph +with two lines. + + indented code + + > A block quote. +. +
        +
      1. +

        A paragraph +with two lines.

        +
        indented code
        +
        +
        +

        A block quote.

        +
        +
      2. +
      +```````````````````````````````` + + +Indentation can be partially deleted: + +```````````````````````````````` example + 1. A paragraph + with two lines. +. +
        +
      1. A paragraph +with two lines.
      2. +
      +```````````````````````````````` + + +These examples show how laziness can work in nested structures: + +```````````````````````````````` example +> 1. > Blockquote +continued here. +. +
      +
        +
      1. +
        +

        Blockquote +continued here.

        +
        +
      2. +
      +
      +```````````````````````````````` + + +```````````````````````````````` example +> 1. > Blockquote +> continued here. +. +
      +
        +
      1. +
        +

        Blockquote +continued here.

        +
        +
      2. +
      +
      +```````````````````````````````` + + + +6. **That's all.** Nothing that is not counted as a list item by rules + #1--5 counts as a [list item](#list-items). + +The rules for sublists follow from the general rules +[above][List items]. A sublist must be indented the same number +of spaces a paragraph would need to be in order to be included +in the list item. + +So, in this case we need two spaces indent: + +```````````````````````````````` example +- foo + - bar + - baz + - boo +. +
        +
      • foo +
          +
        • bar +
            +
          • baz +
              +
            • boo
            • +
            +
          • +
          +
        • +
        +
      • +
      +```````````````````````````````` + + +One is not enough: + +```````````````````````````````` example +- foo + - bar + - baz + - boo +. +
        +
      • foo
      • +
      • bar
      • +
      • baz
      • +
      • boo
      • +
      +```````````````````````````````` + + +Here we need four, because the list marker is wider: + +```````````````````````````````` example +10) foo + - bar +. +
        +
      1. foo +
          +
        • bar
        • +
        +
      2. +
      +```````````````````````````````` + + +Three is not enough: + +```````````````````````````````` example +10) foo + - bar +. +
        +
      1. foo
      2. +
      +
        +
      • bar
      • +
      +```````````````````````````````` + + +A list may be the first block in a list item: + +```````````````````````````````` example +- - foo +. +
        +
      • +
          +
        • foo
        • +
        +
      • +
      +```````````````````````````````` + + +```````````````````````````````` example +1. - 2. foo +. +
        +
      1. +
          +
        • +
            +
          1. foo
          2. +
          +
        • +
        +
      2. +
      +```````````````````````````````` + + +A list item can contain a heading: + +```````````````````````````````` example +- # Foo +- Bar + --- + baz +. +
        +
      • +

        Foo

        +
      • +
      • +

        Bar

        +baz
      • +
      +```````````````````````````````` + + +### Motivation + +John Gruber's Markdown spec says the following about list items: + +1. "List markers typically start at the left margin, but may be indented + by up to three spaces. List markers must be followed by one or more + spaces or a tab." + +2. "To make lists look nice, you can wrap items with hanging indents.... + But if you don't want to, you don't have to." + +3. "List items may consist of multiple paragraphs. Each subsequent + paragraph in a list item must be indented by either 4 spaces or one + tab." + +4. "It looks nice if you indent every line of the subsequent paragraphs, + but here again, Markdown will allow you to be lazy." + +5. "To put a blockquote within a list item, the blockquote's `>` + delimiters need to be indented." + +6. "To put a code block within a list item, the code block needs to be + indented twice — 8 spaces or two tabs." + +These rules specify that a paragraph under a list item must be indented +four spaces (presumably, from the left margin, rather than the start of +the list marker, but this is not said), and that code under a list item +must be indented eight spaces instead of the usual four. They also say +that a block quote must be indented, but not by how much; however, the +example given has four spaces indentation. Although nothing is said +about other kinds of block-level content, it is certainly reasonable to +infer that *all* block elements under a list item, including other +lists, must be indented four spaces. This principle has been called the +*four-space rule*. + +The four-space rule is clear and principled, and if the reference +implementation `Markdown.pl` had followed it, it probably would have +become the standard. However, `Markdown.pl` allowed paragraphs and +sublists to start with only two spaces indentation, at least on the +outer level. 
Worse, its behavior was inconsistent: a sublist of an +outer-level list needed two spaces indentation, but a sublist of this +sublist needed three spaces. It is not surprising, then, that different +implementations of Markdown have developed very different rules for +determining what comes under a list item. (Pandoc and python-Markdown, +for example, stuck with Gruber's syntax description and the four-space +rule, while discount, redcarpet, marked, PHP Markdown, and others +followed `Markdown.pl`'s behavior more closely.) + +Unfortunately, given the divergences between implementations, there +is no way to give a spec for list items that will be guaranteed not +to break any existing documents. However, the spec given here should +correctly handle lists formatted with either the four-space rule or +the more forgiving `Markdown.pl` behavior, provided they are laid out +in a way that is natural for a human to read. + +The strategy here is to let the width and indentation of the list marker +determine the indentation necessary for blocks to fall under the list +item, rather than having a fixed and arbitrary number. The writer can +think of the body of the list item as a unit which gets indented to the +right enough to fit the list marker (and any indentation on the list +marker). (The laziness rule, #5, then allows continuation lines to be +unindented if needed.) + +This rule is superior, we claim, to any rule requiring a fixed level of +indentation from the margin. The four-space rule is clear but +unnatural. It is quite unintuitive that + +``` markdown +- foo + + bar + + - baz +``` + +should be parsed as two lists with an intervening paragraph, + +``` html +
        +
      • foo
      • +
      +

      bar

      +
        +
      • baz
      • +
      +``` + +as the four-space rule demands, rather than a single list, + +``` html +
        +
      • +

        foo

        +

        bar

        +
          +
        • baz
        • +
        +
      • +
      +``` + +The choice of four spaces is arbitrary. It can be learned, but it is +not likely to be guessed, and it trips up beginners regularly. + +Would it help to adopt a two-space rule? The problem is that such +a rule, together with the rule allowing 1--3 spaces indentation of the +initial list marker, allows text that is indented *less than* the +original list marker to be included in the list item. For example, +`Markdown.pl` parses + +``` markdown + - one + + two +``` + +as a single list item, with `two` a continuation paragraph: + +``` html +
        +
      • +

        one

        +

        two

        +
      • +
      +``` + +and similarly + +``` markdown +> - one +> +> two +``` + +as + +``` html +
      +
        +
      • +

        one

        +

        two

        +
      • +
      +
      +``` + +This is extremely unintuitive. + +Rather than requiring a fixed indent from the margin, we could require +a fixed indent (say, two spaces, or even one space) from the list marker (which +may itself be indented). This proposal would remove the last anomaly +discussed. Unlike the spec presented above, it would count the following +as a list item with a subparagraph, even though the paragraph `bar` +is not indented as far as the first paragraph `foo`: + +``` markdown + 10. foo + + bar +``` + +Arguably this text does read like a list item with `bar` as a subparagraph, +which may count in favor of the proposal. However, on this proposal indented +code would have to be indented six spaces after the list marker. And this +would break a lot of existing Markdown, which has the pattern: + +``` markdown +1. foo + + indented code +``` + +where the code is indented eight spaces. The spec above, by contrast, will +parse this text as expected, since the code block's indentation is measured +from the beginning of `foo`. + +The one case that needs special treatment is a list item that *starts* +with indented code. How much indentation is required in that case, since +we don't have a "first paragraph" to measure from? Rule #2 simply stipulates +that in such cases, we require one space indentation from the list marker +(and then the normal four spaces for the indented code). This will match the +four-space rule in cases where the list marker plus its initial indentation +takes four spaces (a common case), but diverge in other cases. + +## Lists + +A [list](@) is a sequence of one or more +list items [of the same type]. The list items +may be separated by any number of blank lines. + +Two list items are [of the same type](@) +if they begin with a [list marker] of the same type. 
+Two list markers are of the +same type if (a) they are bullet list markers using the same character +(`-`, `+`, or `*`) or (b) they are ordered list numbers with the same +delimiter (either `.` or `)`). + +A list is an [ordered list](@) +if its constituent list items begin with +[ordered list markers], and a +[bullet list](@) if its constituent list +items begin with [bullet list markers]. + +The [start number](@) +of an [ordered list] is determined by the list number of +its initial list item. The numbers of subsequent list items are +disregarded. + +A list is [loose](@) if any of its constituent +list items are separated by blank lines, or if any of its constituent +list items directly contain two block-level elements with a blank line +between them. Otherwise a list is [tight](@). +(The difference in HTML output is that paragraphs in a loose list are +wrapped in `<p>` tags, while paragraphs in a tight list are not.) + +Changing the bullet or ordered list delimiter starts a new list: + +```````````````````````````````` example +- foo +- bar ++ baz +. +

        +
      • foo
      • +
      • bar
      • +
      +
        +
      • baz
      • +
      +```````````````````````````````` + + +```````````````````````````````` example +1. foo +2. bar +3) baz +. +
        +
      1. foo
      2. +
      3. bar
      4. +
      +
        +
      1. baz
      2. +
      +```````````````````````````````` + + +In CommonMark, a list can interrupt a paragraph. That is, +no blank line is needed to separate a paragraph from a following +list: + +```````````````````````````````` example +Foo +- bar +- baz +. +

      Foo

      +
        +
      • bar
      • +
      • baz
      • +
      +```````````````````````````````` + +`Markdown.pl` does not allow this, through fear of triggering a list +via a numeral in a hard-wrapped line: + +``` markdown +The number of windows in my house is +14. The number of doors is 6. +``` + +Oddly, though, `Markdown.pl` *does* allow a blockquote to +interrupt a paragraph, even though the same considerations might +apply. + +In CommonMark, we do allow lists to interrupt paragraphs, for +two reasons. First, it is natural and not uncommon for people +to start lists without blank lines: + +``` markdown +I need to buy +- new shoes +- a coat +- a plane ticket +``` + +Second, we are attracted to a + +> [principle of uniformity](@): +> if a chunk of text has a certain +> meaning, it will continue to have the same meaning when put into a +> container block (such as a list item or blockquote). + +(Indeed, the spec for [list items] and [block quotes] presupposes +this principle.) This principle implies that if + +``` markdown + * I need to buy + - new shoes + - a coat + - a plane ticket +``` + +is a list item containing a paragraph followed by a nested sublist, +as all Markdown implementations agree it is (though the paragraph +may be rendered without `

      ` tags, since the list is "tight"), +then + +``` markdown +I need to buy +- new shoes +- a coat +- a plane ticket +``` + +by itself should be a paragraph followed by a nested sublist. + +Since it is well established Markdown practice to allow lists to +interrupt paragraphs inside list items, the [principle of +uniformity] requires us to allow this outside list items as +well. ([reStructuredText](http://docutils.sourceforge.net/rst.html) +takes a different approach, requiring blank lines before lists +even inside other list items.) + +In order to solve the problem of unwanted lists in paragraphs with +hard-wrapped numerals, we allow only lists starting with `1` to +interrupt paragraphs. Thus, + +```````````````````````````````` example +The number of windows in my house is +14. The number of doors is 6. +. +

      The number of windows in my house is +14. The number of doors is 6.

      +```````````````````````````````` + +We may still get an unintended result in cases like + +```````````````````````````````` example +The number of windows in my house is +1. The number of doors is 6. +. +

      The number of windows in my house is

      +
        +
      1. The number of doors is 6.
      2. +
      +```````````````````````````````` + +but this rule should prevent most spurious list captures. + +There can be any number of blank lines between items: + +```````````````````````````````` example +- foo + +- bar + + +- baz +. +
        +
      • +

        foo

        +
      • +
      • +

        bar

        +
      • +
      • +

        baz

        +
      • +
      +```````````````````````````````` + +```````````````````````````````` example +- foo + - bar + - baz + + + bim +. +
        +
      • foo +
          +
        • bar +
            +
          • +

            baz

            +

            bim

            +
          • +
          +
        • +
        +
      • +
      +```````````````````````````````` + + +To separate consecutive lists of the same type, or to separate a +list from an indented code block that would otherwise be parsed +as a subparagraph of the final list item, you can insert a blank HTML +comment: + +```````````````````````````````` example +- foo +- bar + + + +- baz +- bim +. +
        +
      • foo
      • +
      • bar
      • +
      + +
        +
      • baz
      • +
      • bim
      • +
      +```````````````````````````````` + + +```````````````````````````````` example +- foo + + notcode + +- foo + + + + code +. +
        +
      • +

        foo

        +

        notcode

        +
      • +
      • +

        foo

        +
      • +
      + +
      code
      +
      +```````````````````````````````` + + +List items need not be indented to the same level. The following +list items will be treated as items at the same list level, +since none is indented enough to belong to the previous list +item: + +```````````````````````````````` example +- a + - b + - c + - d + - e + - f +- g +. +
        +
      • a
      • +
      • b
      • +
      • c
      • +
      • d
      • +
      • e
      • +
      • f
      • +
      • g
      • +
      +```````````````````````````````` + + +```````````````````````````````` example +1. a + + 2. b + + 3. c +. +
        +
      1. +

        a

        +
      2. +
      3. +

        b

        +
      4. +
      5. +

        c

        +
      6. +
      +```````````````````````````````` + +Note, however, that list items may not be indented more than +three spaces. Here `- e` is treated as a paragraph continuation +line, because it is indented more than three spaces: + +```````````````````````````````` example +- a + - b + - c + - d + - e +. +
        +
      • a
      • +
      • b
      • +
      • c
      • +
      • d +- e
      • +
      +```````````````````````````````` + +And here, `3. c` is treated as an indented code block, +because it is indented four spaces and preceded by a +blank line. + +```````````````````````````````` example +1. a + + 2. b + + 3. c +. +
        +
      1. +

        a

        +
      2. +
      3. +

        b

        +
      4. +
      +
      3. c
      +
      +```````````````````````````````` + + +This is a loose list, because there is a blank line between +two of the list items: + +```````````````````````````````` example +- a +- b + +- c +. +
        +
      • +

        a

        +
      • +
      • +

        b

        +
      • +
      • +

        c

        +
      • +
      +```````````````````````````````` + + +So is this, with an empty second item: + +```````````````````````````````` example +* a +* + +* c +. +
        +
      • +

        a

        +
      • +
      • +
      • +

        c

        +
      • +
      +```````````````````````````````` + + +These are loose lists, even though there is no space between the items, +because one of the items directly contains two block-level elements +with a blank line between them: + +```````````````````````````````` example +- a +- b + + c +- d +. +
        +
      • +

        a

        +
      • +
      • +

        b

        +

        c

        +
      • +
      • +

        d

        +
      • +
      +```````````````````````````````` + + +```````````````````````````````` example +- a +- b + + [ref]: /url +- d +. +
        +
      • +

        a

        +
      • +
      • +

        b

        +
      • +
      • +

        d

        +
      • +
      +```````````````````````````````` + + +This is a tight list, because the blank lines are in a code block: + +```````````````````````````````` example +- a +- ``` + b + + + ``` +- c +. +
        +
      • a
      • +
      • +
        b
        +
        +
        +
        +
      • +
      • c
      • +
      +```````````````````````````````` + + +This is a tight list, because the blank line is between two +paragraphs of a sublist. So the sublist is loose while +the outer list is tight: + +```````````````````````````````` example +- a + - b + + c +- d +. +
        +
      • a +
          +
        • +

          b

          +

          c

          +
        • +
        +
      • +
      • d
      • +
      +```````````````````````````````` + + +This is a tight list, because the blank line is inside the +block quote: + +```````````````````````````````` example +* a + > b + > +* c +. +
        +
      • a +
        +

        b

        +
        +
      • +
      • c
      • +
      +```````````````````````````````` + + +This list is tight, because the consecutive block elements +are not separated by blank lines: + +```````````````````````````````` example +- a + > b + ``` + c + ``` +- d +. +
        +
      • a +
        +

        b

        +
        +
        c
        +
        +
      • +
      • d
      • +
      +```````````````````````````````` + + +A single-paragraph list is tight: + +```````````````````````````````` example +- a +. +
        +
      • a
      • +
      +```````````````````````````````` + + +```````````````````````````````` example +- a + - b +. +
        +
      • a +
          +
        • b
        • +
        +
      • +
      +```````````````````````````````` + + +This list is loose, because of the blank line between the +two block elements in the list item: + +```````````````````````````````` example +1. ``` + foo + ``` + + bar +. +
        +
      1. +
        foo
        +
        +

        bar

        +
      2. +
      +```````````````````````````````` + + +Here the outer list is loose, the inner list tight: + +```````````````````````````````` example +* foo + * bar + + baz +. +
        +
      • +

        foo

        +
          +
        • bar
        • +
        +

        baz

        +
      • +
      +```````````````````````````````` + + +```````````````````````````````` example +- a + - b + - c + +- d + - e + - f +. +
        +
      • +

        a

        +
          +
        • b
        • +
        • c
        • +
        +
      • +
      • +

        d

        +
          +
        • e
        • +
        • f
        • +
        +
      • +
      +```````````````````````````````` + + +# Inlines + +Inlines are parsed sequentially from the beginning of the character +stream to the end (left to right, in left-to-right languages). +Thus, for example, in + +```````````````````````````````` example +`hi`lo` +. +

      hilo`

      +```````````````````````````````` + +`hi` is parsed as code, leaving the backtick at the end as a literal +backtick. + + +## Backslash escapes + +Any ASCII punctuation character may be backslash-escaped: + +```````````````````````````````` example +\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~ +. +

      !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

      +```````````````````````````````` + + +Backslashes before other characters are treated as literal +backslashes: + +```````````````````````````````` example +\→\A\a\ \3\φ\« +. +

      \→\A\a\ \3\φ\«

      +```````````````````````````````` + + +Escaped characters are treated as regular characters and do +not have their usual Markdown meanings: + +```````````````````````````````` example +\*not emphasized* +\
      not a tag +\[not a link](/foo) +\`not code` +1\. not a list +\* not a list +\# not a heading +\[foo]: /url "not a reference" +\ö not a character entity +. +

      *not emphasized* +<br/> not a tag +[not a link](/foo) +`not code` +1. not a list +* not a list +# not a heading +[foo]: /url "not a reference" +&ouml; not a character entity

      +```````````````````````````````` + + +If a backslash is itself escaped, the following character is not: + +```````````````````````````````` example +\\*emphasis* +. +

      \emphasis

      +```````````````````````````````` + + +A backslash at the end of the line is a [hard line break]: + +```````````````````````````````` example +foo\ +bar +. +

      foo
      +bar

      +```````````````````````````````` + + +Backslash escapes do not work in code blocks, code spans, autolinks, or +raw HTML: + +```````````````````````````````` example +`` \[\` `` +. +

      \[\`

      +```````````````````````````````` + + +```````````````````````````````` example + \[\] +. +
      \[\]
      +
      +```````````````````````````````` + + +```````````````````````````````` example +~~~ +\[\] +~~~ +. +
      \[\]
      +
      +```````````````````````````````` + + +```````````````````````````````` example + +. +

      http://example.com?find=\*

      +```````````````````````````````` + + +```````````````````````````````` example + +. + +```````````````````````````````` + + +But they work in all other contexts, including URLs and link titles, +link references, and [info strings] in [fenced code blocks]: + +```````````````````````````````` example +[foo](/bar\* "ti\*tle") +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +[foo] + +[foo]: /bar\* "ti\*tle" +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +``` foo\+bar +foo +``` +. +
      foo
      +
      +```````````````````````````````` + + + +## Entity and numeric character references + +Valid HTML entity references and numeric character references +can be used in place of the corresponding Unicode character, +with the following exceptions: + +- Entity and character references are not recognized in code + blocks and code spans. + +- Entity and character references cannot stand in place of + special characters that define structural elements in + CommonMark. For example, although `&#42;` can be used + in place of a literal `*` character, `&#42;` cannot replace + `*` in emphasis delimiters, bullet list markers, or thematic + breaks. + +Conforming CommonMark parsers need not store information about +whether a particular character was represented in the source +using a Unicode character or an entity reference. + +[Entity references](@) consist of `&` + any of the valid +HTML5 entity names + `;`. The +document <https://html.spec.whatwg.org/multipage/entities.json> +is used as an authoritative source for the valid entity +references and their corresponding code points. + +```````````````````````````````` example +&nbsp; &amp; &copy; &AElig; &Dcaron; +&frac34; &HilbertSpace; &DifferentialD; +&ClockwiseContourIntegral; &ngE; +. +

        & © Æ Ď +¾ ℋ ⅆ +∲ ≧̸

      +```````````````````````````````` + + +[Decimal numeric character +references](@) +consist of `&#` + a string of 1--7 arabic digits + `;`. A +numeric character reference is parsed as the corresponding +Unicode character. Invalid Unicode code points will be replaced by +the REPLACEMENT CHARACTER (`U+FFFD`). For security reasons, +the code point `U+0000` will also be replaced by `U+FFFD`. + +```````````````````````````````` example +# Ӓ Ϡ � +. +

      # Ӓ Ϡ �

      +```````````````````````````````` + + +[Hexadecimal numeric character +references](@) consist of `&#` + +either `X` or `x` + a string of 1-6 hexadecimal digits + `;`. +They too are parsed as the corresponding Unicode character (this +time specified with a hexadecimal numeral instead of decimal). + +```````````````````````````````` example +" ആ ಫ +. +

      " ആ ಫ

      +```````````````````````````````` + + +Here are some nonentities: + +```````````````````````````````` example +  &x; &#; &#x; +� +&#abcdef0; +&ThisIsNotDefined; &hi?; +. +

      &nbsp &x; &#; &#x; +&#87654321; +&#abcdef0; +&ThisIsNotDefined; &hi?;

      +```````````````````````````````` + + +Although HTML5 does accept some entity references +without a trailing semicolon (such as `©`), these are not +recognized here, because it makes the grammar too ambiguous: + +```````````````````````````````` example +© +. +

      &copy

      +```````````````````````````````` + + +Strings that are not on the list of HTML5 named entities are not +recognized as entity references either: + +```````````````````````````````` example +&MadeUpEntity; +. +

      &MadeUpEntity;

      +```````````````````````````````` + + +Entity and numeric character references are recognized in any +context besides code spans or code blocks, including +URLs, [link titles], and [fenced code block][] [info strings]: + +```````````````````````````````` example + +. + +```````````````````````````````` + + +```````````````````````````````` example +[foo](/föö "föö") +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +[foo] + +[foo]: /föö "föö" +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +``` föö +foo +``` +. +
      foo
      +
      +```````````````````````````````` + + +Entity and numeric character references are treated as literal +text in code spans and code blocks: + +```````````````````````````````` example +`föö` +. +

      f&ouml;&ouml;

      +```````````````````````````````` + + +```````````````````````````````` example + föfö +. +
      f&ouml;f&ouml;
      +
      +```````````````````````````````` + + +Entity and numeric character references cannot be used +in place of symbols indicating structure in CommonMark +documents. + +```````````````````````````````` example +*foo* +*foo* +. +

      *foo* +foo

      +```````````````````````````````` + +```````````````````````````````` example +* foo + +* foo +. +

      * foo

      +
        +
      • foo
      • +
      +```````````````````````````````` + +```````````````````````````````` example +foo bar +. +

      foo + +bar

      +```````````````````````````````` + +```````````````````````````````` example + foo +. +

      →foo

      +```````````````````````````````` + + +```````````````````````````````` example +[a](url "tit") +. +

      [a](url "tit")

      +```````````````````````````````` + + +## Code spans + +A [backtick string](@) +is a string of one or more backtick characters (`` ` ``) that is neither +preceded nor followed by a backtick. + +A [code span](@) begins with a backtick string and ends with +a backtick string of equal length. The contents of the code span are +the characters between the two backtick strings, normalized in the +following ways: + +- First, [line endings] are converted to [spaces]. +- If the resulting string both begins *and* ends with a [space] + character, but does not consist entirely of [space] + characters, a single [space] character is removed from the + front and back. This allows you to include code that begins + or ends with backtick characters, which must be separated by + whitespace from the opening or closing backtick strings. + +This is a simple code span: + +```````````````````````````````` example +`foo` +. +

      foo

      +```````````````````````````````` + + +Here two backticks are used, because the code contains a backtick. +This example also illustrates stripping of a single leading and +trailing space: + +```````````````````````````````` example +`` foo ` bar `` +. +

      foo ` bar

      +```````````````````````````````` + + +This example shows the motivation for stripping leading and trailing +spaces: + +```````````````````````````````` example +` `` ` +. +

      ``

      +```````````````````````````````` + +Note that only *one* space is stripped: + +```````````````````````````````` example +` `` ` +. +

      ``

      +```````````````````````````````` + +The stripping only happens if the space is on both +sides of the string: + +```````````````````````````````` example +` a` +. +

      a

      +```````````````````````````````` + +Only [spaces], and not [unicode whitespace] in general, are +stripped in this way: + +```````````````````````````````` example +` b ` +. +

       b 

      +```````````````````````````````` + +No stripping occurs if the code span contains only spaces: + +```````````````````````````````` example +` ` +` ` +. +

        +

      +```````````````````````````````` + + +[Line endings] are treated like spaces: + +```````````````````````````````` example +`` +foo +bar +baz +`` +. +

      foo bar baz

      +```````````````````````````````` + +```````````````````````````````` example +`` +foo +`` +. +

      foo

      +```````````````````````````````` + + +Interior spaces are not collapsed: + +```````````````````````````````` example +`foo bar +baz` +. +

      foo bar baz

      +```````````````````````````````` + +Note that browsers will typically collapse consecutive spaces +when rendering `<code>` elements, so it is recommended that +the following CSS be used: + + code{white-space: pre-wrap;} + + +Note that backslash escapes do not work in code spans. All backslashes +are treated literally: + +```````````````````````````````` example +`foo\`bar` +. +

      foo\bar`

      +```````````````````````````````` + + +Backslash escapes are never needed, because one can always choose a +string of *n* backtick characters as delimiters, where the code does +not contain any strings of exactly *n* backtick characters. + +```````````````````````````````` example +``foo`bar`` +. +

      foo`bar

      +```````````````````````````````` + +```````````````````````````````` example +` foo `` bar ` +. +

      foo `` bar

      +```````````````````````````````` + + +Code span backticks have higher precedence than any other inline +constructs except HTML tags and autolinks. Thus, for example, this is +not parsed as emphasized text, since the second `*` is part of a code +span: + +```````````````````````````````` example +*foo`*` +. +

      *foo*

      +```````````````````````````````` + + +And this is not parsed as a link: + +```````````````````````````````` example +[not a `link](/foo`) +. +

      [not a link](/foo)

      +```````````````````````````````` + + +Code spans, HTML tags, and autolinks have the same precedence. +Thus, this is code: + +```````````````````````````````` example +`` +. +

      <a href="">`

      +```````````````````````````````` + + +But this is an HTML tag: + +```````````````````````````````` example +
      ` +. +

      `

      +```````````````````````````````` + + +And this is code: + +```````````````````````````````` example +`` +. +

      <http://foo.bar.baz>`

      +```````````````````````````````` + + +But this is an autolink: + +```````````````````````````````` example +` +. +

      http://foo.bar.`baz`

      +```````````````````````````````` + + +When a backtick string is not closed by a matching backtick string, +we just have literal backticks: + +```````````````````````````````` example +```foo`` +. +

      ```foo``

      +```````````````````````````````` + + +```````````````````````````````` example +`foo +. +

      `foo

      +```````````````````````````````` + +The following case also illustrates the need for opening and +closing backtick strings to be equal in length: + +```````````````````````````````` example +`foo``bar`` +. +

      `foobar

      +```````````````````````````````` + + +## Emphasis and strong emphasis + +John Gruber's original [Markdown syntax +description](http://daringfireball.net/projects/markdown/syntax#em) says: + +> Markdown treats asterisks (`*`) and underscores (`_`) as indicators of +> emphasis. Text wrapped with one `*` or `_` will be wrapped with an HTML +> `<em>` tag; double `*`'s or `_`'s will be wrapped with an HTML `<strong>` +> tag. + +This is enough for most users, but these rules leave much undecided, +especially when it comes to nested emphasis. The original +`Markdown.pl` test suite makes it clear that triple `***` and +`___` delimiters can be used for strong emphasis, and most +implementations have also allowed the following patterns: + +``` markdown +***strong emph*** +***strong** in emph* +***emph* in strong** +**in strong *emph*** +*in emph **strong*** +``` + +The following patterns are less widely supported, but the intent +is clear and they are useful (especially in contexts like bibliography +entries): + +``` markdown +*emph *with emph* in it* +**strong **with strong** in it** +``` + +Many implementations have also restricted intraword emphasis to +the `*` forms, to avoid unwanted emphasis in words containing +internal underscores. (It is best practice to put these in code +spans, but users often do not.) + +``` markdown +internal emphasis: foo*bar*baz +no emphasis: foo_bar_baz +``` + +The rules given below capture all of these patterns, while allowing +for efficient parsing strategies that do not backtrack. + +First, some definitions. A [delimiter run](@) is either +a sequence of one or more `*` characters that is not preceded or +followed by a non-backslash-escaped `*` character, or a sequence +of one or more `_` characters that is not preceded or followed by +a non-backslash-escaped `_` character. 
+ +A [left-flanking delimiter run](@) is +a [delimiter run] that is (1) not followed by [Unicode whitespace], +and either (2a) not followed by a [punctuation character], or +(2b) followed by a [punctuation character] and +preceded by [Unicode whitespace] or a [punctuation character]. +For purposes of this definition, the beginning and the end of +the line count as Unicode whitespace. + +A [right-flanking delimiter run](@) is +a [delimiter run] that is (1) not preceded by [Unicode whitespace], +and either (2a) not preceded by a [punctuation character], or +(2b) preceded by a [punctuation character] and +followed by [Unicode whitespace] or a [punctuation character]. +For purposes of this definition, the beginning and the end of +the line count as Unicode whitespace. + +Here are some examples of delimiter runs. + + - left-flanking but not right-flanking: + + ``` + ***abc + _abc + **"abc" + _"abc" + ``` + + - right-flanking but not left-flanking: + + ``` + abc*** + abc_ + "abc"** + "abc"_ + ``` + + - Both left and right-flanking: + + ``` + abc***def + "abc"_"def" + ``` + + - Neither left nor right-flanking: + + ``` + abc *** def + a _ b + ``` + +(The idea of distinguishing left-flanking and right-flanking +delimiter runs based on the character before and the character +after comes from Roopesh Chander's +[vfmd](http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags). +vfmd uses the terminology "emphasis indicator string" instead of "delimiter +run," and its rules for distinguishing left- and right-flanking runs +are a bit more complex than the ones given here.) + +The following rules define emphasis and strong emphasis: + +1. A single `*` character [can open emphasis](@) + iff (if and only if) it is part of a [left-flanking delimiter run]. + +2. 
A single `_` character [can open emphasis] iff + it is part of a [left-flanking delimiter run] + and either (a) not part of a [right-flanking delimiter run] + or (b) part of a [right-flanking delimiter run] + preceded by punctuation. + +3. A single `*` character [can close emphasis](@) + iff it is part of a [right-flanking delimiter run]. + +4. A single `_` character [can close emphasis] iff + it is part of a [right-flanking delimiter run] + and either (a) not part of a [left-flanking delimiter run] + or (b) part of a [left-flanking delimiter run] + followed by punctuation. + +5. A double `**` [can open strong emphasis](@) + iff it is part of a [left-flanking delimiter run]. + +6. A double `__` [can open strong emphasis] iff + it is part of a [left-flanking delimiter run] + and either (a) not part of a [right-flanking delimiter run] + or (b) part of a [right-flanking delimiter run] + preceded by punctuation. + +7. A double `**` [can close strong emphasis](@) + iff it is part of a [right-flanking delimiter run]. + +8. A double `__` [can close strong emphasis] iff + it is part of a [right-flanking delimiter run] + and either (a) not part of a [left-flanking delimiter run] + or (b) part of a [left-flanking delimiter run] + followed by punctuation. + +9. Emphasis begins with a delimiter that [can open emphasis] and ends + with a delimiter that [can close emphasis], and that uses the same + character (`_` or `*`) as the opening delimiter. The + opening and closing delimiters must belong to separate + [delimiter runs]. If one of the delimiters can both + open and close emphasis, then the sum of the lengths of the + delimiter runs containing the opening and closing delimiters + must not be a multiple of 3 unless both lengths are + multiples of 3. + +10. Strong emphasis begins with a delimiter that + [can open strong emphasis] and ends with a delimiter that + [can close strong emphasis], and that uses the same character + (`_` or `*`) as the opening delimiter. 
The + opening and closing delimiters must belong to separate + [delimiter runs]. If one of the delimiters can both open + and close strong emphasis, then the sum of the lengths of + the delimiter runs containing the opening and closing + delimiters must not be a multiple of 3 unless both lengths + are multiples of 3. + +11. A literal `*` character cannot occur at the beginning or end of + `*`-delimited emphasis or `**`-delimited strong emphasis, unless it + is backslash-escaped. + +12. A literal `_` character cannot occur at the beginning or end of + `_`-delimited emphasis or `__`-delimited strong emphasis, unless it + is backslash-escaped. + +Where rules 1--12 above are compatible with multiple parsings, +the following principles resolve ambiguity: + +13. The number of nestings should be minimized. Thus, for example, + an interpretation `<strong>...</strong>` is always preferred to + `<em><em>...</em></em>`. + +14. An interpretation `<em><strong>...</strong></em>` is always + preferred to `<strong><em>...</em></strong>`. + +15. When two potential emphasis or strong emphasis spans overlap, + so that the second begins before the first ends and ends after + the first ends, the first takes precedence. Thus, for example, + `*foo _bar* baz_` is parsed as `<em>foo _bar</em> baz_` rather + than `*foo <em>bar* baz</em>`. + +16. When there are two potential emphasis or strong emphasis spans + with the same closing delimiter, the shorter one (the one that + opens later) takes precedence. Thus, for example, + `**foo **bar baz**` is parsed as `**foo <strong>bar baz</strong>` + rather than `<strong>foo **bar baz</strong>`. + +17. Inline code spans, links, images, and HTML tags group more tightly + than emphasis. So, when there is a choice between an interpretation + that contains one of these elements and one that does not, the + former always wins. Thus, for example, `*[foo*](bar)` is + parsed as `*<a href="bar">foo*</a>` rather than as + `<em>[foo</em>](bar)`. + +These rules can be illustrated through a series of examples. + +Rule 1: + +```````````````````````````````` example +*foo bar* +. +

      foo bar

      +```````````````````````````````` + + +This is not emphasis, because the opening `*` is followed by +whitespace, and hence not part of a [left-flanking delimiter run]: + +```````````````````````````````` example +a * foo bar* +. +

      a * foo bar*

      +```````````````````````````````` + + +This is not emphasis, because the opening `*` is preceded +by an alphanumeric and followed by punctuation, and hence +not part of a [left-flanking delimiter run]: + +```````````````````````````````` example +a*"foo"* +. +

      a*"foo"*

      +```````````````````````````````` + + +Unicode nonbreaking spaces count as whitespace, too: + +```````````````````````````````` example +* a * +. +

      * a *

      +```````````````````````````````` + + +Intraword emphasis with `*` is permitted: + +```````````````````````````````` example +foo*bar* +. +

      foobar

      +```````````````````````````````` + + +```````````````````````````````` example +5*6*78 +. +

      5678

      +```````````````````````````````` + + +Rule 2: + +```````````````````````````````` example +_foo bar_ +. +

      foo bar

      +```````````````````````````````` + + +This is not emphasis, because the opening `_` is followed by +whitespace: + +```````````````````````````````` example +_ foo bar_ +. +

      _ foo bar_

      +```````````````````````````````` + + +This is not emphasis, because the opening `_` is preceded +by an alphanumeric and followed by punctuation: + +```````````````````````````````` example +a_"foo"_ +. +

      a_"foo"_

      +```````````````````````````````` + + +Emphasis with `_` is not allowed inside words: + +```````````````````````````````` example +foo_bar_ +. +

      foo_bar_

      +```````````````````````````````` + + +```````````````````````````````` example +5_6_78 +. +

      5_6_78

      +```````````````````````````````` + + +```````````````````````````````` example +пристаням_стремятся_ +. +

      пристаням_стремятся_

      +```````````````````````````````` + + +Here `_` does not generate emphasis, because the first delimiter run +is right-flanking and the second left-flanking: + +```````````````````````````````` example +aa_"bb"_cc +. +

      aa_"bb"_cc

      +```````````````````````````````` + + +This is emphasis, even though the opening delimiter is +both left- and right-flanking, because it is preceded by +punctuation: + +```````````````````````````````` example +foo-_(bar)_ +. +

      foo-(bar)

      +```````````````````````````````` + + +Rule 3: + +This is not emphasis, because the closing delimiter does +not match the opening delimiter: + +```````````````````````````````` example +_foo* +. +

      _foo*

      +```````````````````````````````` + + +This is not emphasis, because the closing `*` is preceded by +whitespace: + +```````````````````````````````` example +*foo bar * +. +

      *foo bar *

      +```````````````````````````````` + + +A newline also counts as whitespace: + +```````````````````````````````` example +*foo bar +* +. +

      *foo bar +*

      +```````````````````````````````` + + +This is not emphasis, because the second `*` is +preceded by punctuation and followed by an alphanumeric +(hence it is not part of a [right-flanking delimiter run]): + +```````````````````````````````` example +*(*foo) +. +

      *(*foo)

      +```````````````````````````````` + + +The point of this restriction is more easily appreciated +with this example: + +```````````````````````````````` example +*(*foo*)* +. +

      (foo)

      +```````````````````````````````` + + +Intraword emphasis with `*` is allowed: + +```````````````````````````````` example +*foo*bar +. +

      foobar

      +```````````````````````````````` + + + +Rule 4: + +This is not emphasis, because the closing `_` is preceded by +whitespace: + +```````````````````````````````` example +_foo bar _ +. +

      _foo bar _

      +```````````````````````````````` + + +This is not emphasis, because the second `_` is +preceded by punctuation and followed by an alphanumeric: + +```````````````````````````````` example +_(_foo) +. +

      _(_foo)

      +```````````````````````````````` + + +This is emphasis within emphasis: + +```````````````````````````````` example +_(_foo_)_ +. +

      (foo)

      +```````````````````````````````` + + +Intraword emphasis is disallowed for `_`: + +```````````````````````````````` example +_foo_bar +. +

      _foo_bar

      +```````````````````````````````` + + +```````````````````````````````` example +_пристаням_стремятся +. +

      _пристаням_стремятся

      +```````````````````````````````` + + +```````````````````````````````` example +_foo_bar_baz_ +. +

      foo_bar_baz

      +```````````````````````````````` + + +This is emphasis, even though the closing delimiter is +both left- and right-flanking, because it is followed by +punctuation: + +```````````````````````````````` example +_(bar)_. +. +

      (bar).

      +```````````````````````````````` + + +Rule 5: + +```````````````````````````````` example +**foo bar** +. +

      foo bar

      +```````````````````````````````` + + +This is not strong emphasis, because the opening delimiter is +followed by whitespace: + +```````````````````````````````` example +** foo bar** +. +

      ** foo bar**

      +```````````````````````````````` + + +This is not strong emphasis, because the opening `**` is preceded +by an alphanumeric and followed by punctuation, and hence +not part of a [left-flanking delimiter run]: + +```````````````````````````````` example +a**"foo"** +. +

      a**"foo"**

      +```````````````````````````````` + + +Intraword strong emphasis with `**` is permitted: + +```````````````````````````````` example +foo**bar** +. +

      foobar

      +```````````````````````````````` + + +Rule 6: + +```````````````````````````````` example +__foo bar__ +. +

      foo bar

      +```````````````````````````````` + + +This is not strong emphasis, because the opening delimiter is +followed by whitespace: + +```````````````````````````````` example +__ foo bar__ +. +

      __ foo bar__

      +```````````````````````````````` + + +A newline counts as whitespace: +```````````````````````````````` example +__ +foo bar__ +. +

      __ +foo bar__

      +```````````````````````````````` + + +This is not strong emphasis, because the opening `__` is preceded +by an alphanumeric and followed by punctuation: + +```````````````````````````````` example +a__"foo"__ +. +

      a__"foo"__

      +```````````````````````````````` + + +Intraword strong emphasis is forbidden with `__`: + +```````````````````````````````` example +foo__bar__ +. +

      foo__bar__

      +```````````````````````````````` + + +```````````````````````````````` example +5__6__78 +. +

      5__6__78

      +```````````````````````````````` + + +```````````````````````````````` example +пристаням__стремятся__ +. +

      пристаням__стремятся__

      +```````````````````````````````` + + +```````````````````````````````` example +__foo, __bar__, baz__ +. +

      foo, bar, baz

      +```````````````````````````````` + + +This is strong emphasis, even though the opening delimiter is +both left- and right-flanking, because it is preceded by +punctuation: + +```````````````````````````````` example +foo-__(bar)__ +. +

      foo-(bar)

      +```````````````````````````````` + + + +Rule 7: + +This is not strong emphasis, because the closing delimiter is preceded +by whitespace: + +```````````````````````````````` example +**foo bar ** +. +

      **foo bar **

      +```````````````````````````````` + + +(Nor can it be interpreted as an emphasized `*foo bar *`, because of +Rule 11.) + +This is not strong emphasis, because the second `**` is +preceded by punctuation and followed by an alphanumeric: + +```````````````````````````````` example +**(**foo) +. +

      **(**foo)

      +```````````````````````````````` + + +The point of this restriction is more easily appreciated +with these examples: + +```````````````````````````````` example +*(**foo**)* +. +

      (foo)

      +```````````````````````````````` + + +```````````````````````````````` example +**Gomphocarpus (*Gomphocarpus physocarpus*, syn. +*Asclepias physocarpa*)** +. +

      Gomphocarpus (Gomphocarpus physocarpus, syn. +Asclepias physocarpa)

      +```````````````````````````````` + + +```````````````````````````````` example +**foo "*bar*" foo** +. +

      foo "bar" foo

      +```````````````````````````````` + + +Intraword emphasis: + +```````````````````````````````` example +**foo**bar +. +

      foobar

      +```````````````````````````````` + + +Rule 8: + +This is not strong emphasis, because the closing delimiter is +preceded by whitespace: + +```````````````````````````````` example +__foo bar __ +. +

      __foo bar __

      +```````````````````````````````` + + +This is not strong emphasis, because the second `__` is +preceded by punctuation and followed by an alphanumeric: + +```````````````````````````````` example +__(__foo) +. +

      __(__foo)

      +```````````````````````````````` + + +The point of this restriction is more easily appreciated +with this example: + +```````````````````````````````` example +_(__foo__)_ +. +

      (foo)

      +```````````````````````````````` + + +Intraword strong emphasis is forbidden with `__`: + +```````````````````````````````` example +__foo__bar +. +

      __foo__bar

      +```````````````````````````````` + + +```````````````````````````````` example +__пристаням__стремятся +. +

      __пристаням__стремятся

      +```````````````````````````````` + + +```````````````````````````````` example +__foo__bar__baz__ +. +

      foo__bar__baz

      +```````````````````````````````` + + +This is strong emphasis, even though the closing delimiter is +both left- and right-flanking, because it is followed by +punctuation: + +```````````````````````````````` example +__(bar)__. +. +

      (bar).

      +```````````````````````````````` + + +Rule 9: + +Any nonempty sequence of inline elements can be the contents of an +emphasized span. + +```````````````````````````````` example +*foo [bar](/url)* +. +

      foo bar

      +```````````````````````````````` + + +```````````````````````````````` example +*foo +bar* +. +

      foo +bar

      +```````````````````````````````` + + +In particular, emphasis and strong emphasis can be nested +inside emphasis: + +```````````````````````````````` example +_foo __bar__ baz_ +. +

      foo bar baz

      +```````````````````````````````` + + +```````````````````````````````` example +_foo _bar_ baz_ +. +

      foo bar baz

      +```````````````````````````````` + + +```````````````````````````````` example +__foo_ bar_ +. +

      foo bar

      +```````````````````````````````` + + +```````````````````````````````` example +*foo *bar** +. +

      foo bar

      +```````````````````````````````` + + +```````````````````````````````` example +*foo **bar** baz* +. +

      foo bar baz

      +```````````````````````````````` + +```````````````````````````````` example +*foo**bar**baz* +. +

      foobarbaz

      +```````````````````````````````` + +Note that in the preceding case, the interpretation + +``` markdown +

      <p><em>foo</em><em>bar<em></em>baz</em></p>

      +``` + + +is precluded by the condition that a delimiter that +can both open and close (like the `*` after `foo`) +cannot form emphasis if the sum of the lengths of +the delimiter runs containing the opening and +closing delimiters is a multiple of 3 unless +both lengths are multiples of 3. + + +For the same reason, we don't get two consecutive +emphasis sections in this example: + +```````````````````````````````` example +*foo**bar* +. +

      foo**bar

      +```````````````````````````````` + + +The same condition ensures that the following +cases are all strong emphasis nested inside +emphasis, even when the interior spaces are +omitted: + + +```````````````````````````````` example +***foo** bar* +. +

      foo bar

      +```````````````````````````````` + + +```````````````````````````````` example +*foo **bar*** +. +

      foo bar

      +```````````````````````````````` + + +```````````````````````````````` example +*foo**bar*** +. +

      foobar

      +```````````````````````````````` + + +When the lengths of the interior closing and opening +delimiter runs are *both* multiples of 3, though, +they can match to create emphasis: + +```````````````````````````````` example +foo***bar***baz +. +

      foobarbaz

      +```````````````````````````````` + +```````````````````````````````` example +foo******bar*********baz +. +

      foobar***baz

      +```````````````````````````````` + + +Indefinite levels of nesting are possible: + +```````````````````````````````` example +*foo **bar *baz* bim** bop* +. +

      foo bar baz bim bop

      +```````````````````````````````` + + +```````````````````````````````` example +*foo [*bar*](/url)* +. +

      foo bar

      +```````````````````````````````` + + +There can be no empty emphasis or strong emphasis: + +```````````````````````````````` example +** is not an empty emphasis +. +

      ** is not an empty emphasis

      +```````````````````````````````` + + +```````````````````````````````` example +**** is not an empty strong emphasis +. +

      **** is not an empty strong emphasis

      +```````````````````````````````` + + + +Rule 10: + +Any nonempty sequence of inline elements can be the contents of a +strongly emphasized span. + +```````````````````````````````` example +**foo [bar](/url)** +. +

      foo bar

      +```````````````````````````````` + + +```````````````````````````````` example +**foo +bar** +. +

      foo +bar

      +```````````````````````````````` + + +In particular, emphasis and strong emphasis can be nested +inside strong emphasis: + +```````````````````````````````` example +__foo _bar_ baz__ +. +

      foo bar baz

      +```````````````````````````````` + + +```````````````````````````````` example +__foo __bar__ baz__ +. +

      foo bar baz

      +```````````````````````````````` + + +```````````````````````````````` example +____foo__ bar__ +. +

      foo bar

      +```````````````````````````````` + + +```````````````````````````````` example +**foo **bar**** +. +

      foo bar

      +```````````````````````````````` + + +```````````````````````````````` example +**foo *bar* baz** +. +

      foo bar baz

      +```````````````````````````````` + + +```````````````````````````````` example +**foo*bar*baz** +. +

      foobarbaz

      +```````````````````````````````` + + +```````````````````````````````` example +***foo* bar** +. +

      foo bar

      +```````````````````````````````` + + +```````````````````````````````` example +**foo *bar*** +. +

      foo bar

      +```````````````````````````````` + + +Indefinite levels of nesting are possible: + +```````````````````````````````` example +**foo *bar **baz** +bim* bop** +. +

      foo bar baz +bim bop

      +```````````````````````````````` + + +```````````````````````````````` example +**foo [*bar*](/url)** +. +

      foo bar

      +```````````````````````````````` + + +There can be no empty emphasis or strong emphasis: + +```````````````````````````````` example +__ is not an empty emphasis +. +

      __ is not an empty emphasis

      +```````````````````````````````` + + +```````````````````````````````` example +____ is not an empty strong emphasis +. +

      ____ is not an empty strong emphasis

      +```````````````````````````````` + + + +Rule 11: + +```````````````````````````````` example +foo *** +. +

      foo ***

      +```````````````````````````````` + + +```````````````````````````````` example +foo *\** +. +

      foo *

      +```````````````````````````````` + + +```````````````````````````````` example +foo *_* +. +

      foo _

      +```````````````````````````````` + + +```````````````````````````````` example +foo ***** +. +

      foo *****

      +```````````````````````````````` + + +```````````````````````````````` example +foo **\*** +. +

      foo *

      +```````````````````````````````` + + +```````````````````````````````` example +foo **_** +. +

      foo _

      +```````````````````````````````` + + +Note that when delimiters do not match evenly, Rule 11 determines +that the excess literal `*` characters will appear outside of the +emphasis, rather than inside it: + +```````````````````````````````` example +**foo* +. +

      *foo

      +```````````````````````````````` + + +```````````````````````````````` example +*foo** +. +

      foo*

      +```````````````````````````````` + + +```````````````````````````````` example +***foo** +. +

      *foo

      +```````````````````````````````` + + +```````````````````````````````` example +****foo* +. +

      ***foo

      +```````````````````````````````` + + +```````````````````````````````` example +**foo*** +. +

      foo*

      +```````````````````````````````` + + +```````````````````````````````` example +*foo**** +. +

      foo***

      +```````````````````````````````` + + + +Rule 12: + +```````````````````````````````` example +foo ___ +. +

      foo ___

      +```````````````````````````````` + + +```````````````````````````````` example +foo _\__ +. +

      foo _

      +```````````````````````````````` + + +```````````````````````````````` example +foo _*_ +. +

      foo *

      +```````````````````````````````` + + +```````````````````````````````` example +foo _____ +. +

      foo _____

      +```````````````````````````````` + + +```````````````````````````````` example +foo __\___ +. +

      foo _

      +```````````````````````````````` + + +```````````````````````````````` example +foo __*__ +. +

      foo *

      +```````````````````````````````` + + +```````````````````````````````` example +__foo_ +. +

      _foo

      +```````````````````````````````` + + +Note that when delimiters do not match evenly, Rule 12 determines +that the excess literal `_` characters will appear outside of the +emphasis, rather than inside it: + +```````````````````````````````` example +_foo__ +. +

      foo_

      +```````````````````````````````` + + +```````````````````````````````` example +___foo__ +. +

      _foo

      +```````````````````````````````` + + +```````````````````````````````` example +____foo_ +. +

      ___foo

      +```````````````````````````````` + + +```````````````````````````````` example +__foo___ +. +

      foo_

      +```````````````````````````````` + + +```````````````````````````````` example +_foo____ +. +

      foo___

      +```````````````````````````````` + + +Rule 13 implies that if you want emphasis nested directly inside +emphasis, you must use different delimiters: + +```````````````````````````````` example +**foo** +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +*_foo_* +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +__foo__ +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +_*foo*_ +. +

      foo

      +```````````````````````````````` + + +However, strong emphasis within strong emphasis is possible without +switching delimiters: + +```````````````````````````````` example +****foo**** +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +____foo____ +. +

      foo

      +```````````````````````````````` + + + +Rule 13 can be applied to arbitrarily long sequences of +delimiters: + +```````````````````````````````` example +******foo****** +. +

      foo

      +```````````````````````````````` + + +Rule 14: + +```````````````````````````````` example +***foo*** +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +_____foo_____ +. +

      foo

      +```````````````````````````````` + + +Rule 15: + +```````````````````````````````` example +*foo _bar* baz_ +. +

      foo _bar baz_

      +```````````````````````````````` + + +```````````````````````````````` example +*foo __bar *baz bim__ bam* +. +

      foo bar *baz bim bam

      +```````````````````````````````` + + +Rule 16: + +```````````````````````````````` example +**foo **bar baz** +. +

      **foo bar baz

      +```````````````````````````````` + + +```````````````````````````````` example +*foo *bar baz* +. +

      *foo bar baz

      +```````````````````````````````` + + +Rule 17: + +```````````````````````````````` example +*[bar*](/url) +. +

      *bar*

      +```````````````````````````````` + + +```````````````````````````````` example +_foo [bar_](/url) +. +

      _foo bar_

      +```````````````````````````````` + + +```````````````````````````````` example +*<img src="foo" title="*"/> +. +

      *

      +```````````````````````````````` + + +```````````````````````````````` example +**<a href="**"> +. +

      **

      +```````````````````````````````` + + +```````````````````````````````` example +__<a href="__"> +. +

      __

      +```````````````````````````````` + + +```````````````````````````````` example +*a `*`* +. +

      a *

      +```````````````````````````````` + + +```````````````````````````````` example +_a `_`_ +. +

      a _

      +```````````````````````````````` + + +```````````````````````````````` example +**a<http://foo.bar/?q=**> +. +

      **ahttp://foo.bar/?q=**

      +```````````````````````````````` + + +```````````````````````````````` example +__a<http://foo.bar/?q=__> +. +

      __ahttp://foo.bar/?q=__

      +```````````````````````````````` + + + +## Links + +A link contains [link text] (the visible text), a [link destination] +(the URI that is the link destination), and optionally a [link title]. +There are two basic kinds of links in Markdown. In [inline links] the +destination and title are given immediately after the link text. In +[reference links] the destination and title are defined elsewhere in +the document. + +A [link text](@) consists of a sequence of zero or more +inline elements enclosed by square brackets (`[` and `]`). The +following rules apply: + +- Links may not contain other links, at any level of nesting. If + multiple otherwise valid link definitions appear nested inside each + other, the inner-most definition is used. + +- Brackets are allowed in the [link text] only if (a) they + are backslash-escaped or (b) they appear as a matched pair of brackets, + with an open bracket `[`, a sequence of zero or more inlines, and + a close bracket `]`. + +- Backtick [code spans], [autolinks], and raw [HTML tags] bind more tightly + than the brackets in link text. Thus, for example, + `` [foo`]` `` could not be a link text, since the second `]` + is part of a code span. + +- The brackets in link text bind more tightly than markers for + [emphasis and strong emphasis]. Thus, for example, `*[foo*](url)` is a link. + +A [link destination](@) consists of either + +- a sequence of zero or more characters between an opening `<` and a + closing `>` that contains no line breaks or unescaped + `<` or `>` characters, or + +- a nonempty sequence of characters that does not start with + `<`, does not include ASCII space or control characters, and + includes parentheses only if (a) they are backslash-escaped or + (b) they are part of a balanced pair of unescaped parentheses. + (Implementations may impose limits on parentheses nesting to + avoid performance issues, but at least three levels of nesting + should be supported.) 
+ +A [link title](@) consists of either + +- a sequence of zero or more characters between straight double-quote + characters (`"`), including a `"` character only if it is + backslash-escaped, or + +- a sequence of zero or more characters between straight single-quote + characters (`'`), including a `'` character only if it is + backslash-escaped, or + +- a sequence of zero or more characters between matching parentheses + (`(...)`), including a `(` or `)` character only if it is + backslash-escaped. + +Although [link titles] may span multiple lines, they may not contain +a [blank line]. + +An [inline link](@) consists of a [link text] followed immediately +by a left parenthesis `(`, optional [whitespace], an optional +[link destination], an optional [link title] separated from the link +destination by [whitespace], optional [whitespace], and a right +parenthesis `)`. The link's text consists of the inlines contained +in the [link text] (excluding the enclosing square brackets). +The link's URI consists of the link destination, excluding enclosing +`<...>` if present, with backslash-escapes in effect as described +above. The link's title consists of the link title, excluding its +enclosing delimiters, with backslash-escapes in effect as described +above. + +Here is a simple inline link: + +```````````````````````````````` example +[link](/uri "title") +. +

      link

      +```````````````````````````````` + + +The title may be omitted: + +```````````````````````````````` example +[link](/uri) +. +

      link

      +```````````````````````````````` + + +Both the title and the destination may be omitted: + +```````````````````````````````` example +[link]() +. +

      link

      +```````````````````````````````` + + +```````````````````````````````` example +[link](<>) +. +

      link

      +```````````````````````````````` + +The destination can only contain spaces if it is +enclosed in pointy brackets: + +```````````````````````````````` example +[link](/my uri) +. +

      [link](/my uri)

      +```````````````````````````````` + +```````````````````````````````` example +[link](</my uri>) +. +

      link

      +```````````````````````````````` + +The destination cannot contain line breaks, +even if enclosed in pointy brackets: + +```````````````````````````````` example +[link](foo +bar) +. +

      [link](foo +bar)

      +```````````````````````````````` + +```````````````````````````````` example +[link](<foo +bar>) +. +

      [link]()

      +```````````````````````````````` + +The destination can contain `)` if it is enclosed +in pointy brackets: + +```````````````````````````````` example +[a](<b)c>) +. +

      a

      +```````````````````````````````` + +Pointy brackets that enclose links must be unescaped: + +```````````````````````````````` example +[link](\<foo\>) +. +

      [link](<foo>)

      +```````````````````````````````` + +These are not links, because the opening pointy bracket +is not matched properly: + +```````````````````````````````` example +[a](<b)c +[a](<b)c> +[a](<b>c) +. +

      [a](<b)c +[a](<b)c> +[a](c)

      +```````````````````````````````` + +Parentheses inside the link destination may be escaped: + +```````````````````````````````` example +[link](\(foo\)) +. +

      link

      +```````````````````````````````` + +Any number of parentheses are allowed without escaping, as long as they are +balanced: + +```````````````````````````````` example +[link](foo(and(bar))) +. +

      link

      +```````````````````````````````` + +However, if you have unbalanced parentheses, you need to escape or use the +`<...>` form: + +```````````````````````````````` example +[link](foo\(and\(bar\)) +. +

      link

      +```````````````````````````````` + + +```````````````````````````````` example +[link](<foo(and(bar)>) +. +

      link

      +```````````````````````````````` + + +Parentheses and other symbols can also be escaped, as usual +in Markdown: + +```````````````````````````````` example +[link](foo\)\:) +. +

      link

      +```````````````````````````````` + + +A link can contain fragment identifiers and queries: + +```````````````````````````````` example +[link](#fragment) + +[link](http://example.com#fragment) + +[link](http://example.com?foo=3#frag) +. +

      link

      +

      link

      +

      link

      +```````````````````````````````` + + +Note that a backslash before a non-escapable character is +just a backslash: + +```````````````````````````````` example +[link](foo\bar) +. +

      link

      +```````````````````````````````` + + +URL-escaping should be left alone inside the destination, as all +URL-escaped characters are also valid URL characters. Entity and +numerical character references in the destination will be parsed +into the corresponding Unicode code points, as usual. These may +be optionally URL-escaped when written as HTML, but this spec +does not enforce any particular policy for rendering URLs in +HTML or other formats. Renderers may make different decisions +about how to escape or normalize URLs in the output. + +```````````````````````````````` example +[link](foo%20b&auml;) +. +

      link

      +```````````````````````````````` + + +Note that, because titles can often be parsed as destinations, +if you try to omit the destination and keep the title, you'll +get unexpected results: + +```````````````````````````````` example +[link]("title") +. +

      link

      +```````````````````````````````` + + +Titles may be in single quotes, double quotes, or parentheses: + +```````````````````````````````` example +[link](/url "title") +[link](/url 'title') +[link](/url (title)) +. +

      link +link +link

      +```````````````````````````````` + + +Backslash escapes and entity and numeric character references +may be used in titles: + +```````````````````````````````` example +[link](/url "title \"&quot;") +. +

      link

      +```````````````````````````````` + + +Titles must be separated from the link using a [whitespace]. +Other [Unicode whitespace] like non-breaking space doesn't work. + +```````````````````````````````` example +[link](/url "title") +. +

      link

      +```````````````````````````````` + + +Nested balanced quotes are not allowed without escaping: + +```````````````````````````````` example +[link](/url "title "and" title") +. +

      [link](/url "title "and" title")

      +```````````````````````````````` + + +But it is easy to work around this by using a different quote type: + +```````````````````````````````` example +[link](/url 'title "and" title') +. +

      link

      +```````````````````````````````` + + +(Note: `Markdown.pl` did allow double quotes inside a double-quoted +title, and its test suite included a test demonstrating this. +But it is hard to see a good rationale for the extra complexity this +brings, since there are already many ways---backslash escaping, +entity and numeric character references, or using a different +quote type for the enclosing title---to write titles containing +double quotes. `Markdown.pl`'s handling of titles has a number +of other strange features. For example, it allows single-quoted +titles in inline links, but not reference links. And, in +reference links but not inline links, it allows a title to begin +with `"` and end with `)`. `Markdown.pl` 1.0.1 even allows +titles with no closing quotation mark, though 1.0.2b8 does not. +It seems preferable to adopt a simple, rational rule that works +the same way in inline links and link reference definitions.) + +[Whitespace] is allowed around the destination and title: + +```````````````````````````````` example +[link]( /uri + "title" ) +. +

      link

      +```````````````````````````````` + + +But it is not allowed between the link text and the +following parenthesis: + +```````````````````````````````` example +[link] (/uri) +. +

      [link] (/uri)

      +```````````````````````````````` + + +The link text may contain balanced brackets, but not unbalanced ones, +unless they are escaped: + +```````````````````````````````` example +[link [foo [bar]]](/uri) +. +

      link [foo [bar]]

      +```````````````````````````````` + + +```````````````````````````````` example +[link] bar](/uri) +. +

      [link] bar](/uri)

      +```````````````````````````````` + + +```````````````````````````````` example +[link [bar](/uri) +. +

      [link bar

      +```````````````````````````````` + + +```````````````````````````````` example +[link \[bar](/uri) +. +

      link [bar

      +```````````````````````````````` + + +The link text may contain inline content: + +```````````````````````````````` example +[link *foo **bar** `#`*](/uri) +. +

      link foo bar #

      +```````````````````````````````` + + +```````````````````````````````` example +[![moon](moon.jpg)](/uri) +. +

      moon

      +```````````````````````````````` + + +However, links may not contain other links, at any level of nesting. + +```````````````````````````````` example +[foo [bar](/uri)](/uri) +. +

      [foo bar](/uri)

      +```````````````````````````````` + + +```````````````````````````````` example +[foo *[bar [baz](/uri)](/uri)*](/uri) +. +

      [foo [bar baz](/uri)](/uri)

      +```````````````````````````````` + + +```````````````````````````````` example +![[[foo](uri1)](uri2)](uri3) +. +

      [foo](uri2)

      +```````````````````````````````` + + +These cases illustrate the precedence of link text grouping over +emphasis grouping: + +```````````````````````````````` example +*[foo*](/uri) +. +

      *foo*

      +```````````````````````````````` + + +```````````````````````````````` example +[foo *bar](baz*) +. +

      foo *bar

      +```````````````````````````````` + + +Note that brackets that *aren't* part of links do not take +precedence: + +```````````````````````````````` example +*foo [bar* baz] +. +

      foo [bar baz]

      +```````````````````````````````` + + +These cases illustrate the precedence of HTML tags, code spans, +and autolinks over link grouping: + +```````````````````````````````` example +[foo <bar attr="](baz)"> +. +

      [foo

      +```````````````````````````````` + + +```````````````````````````````` example +[foo`](/uri)` +. +

      [foo](/uri)

      +```````````````````````````````` + + +```````````````````````````````` example +[foo<http://example.com/?search=](uri)> +. +

      [foohttp://example.com/?search=](uri)

      +```````````````````````````````` + + +There are three kinds of [reference link](@)s: +[full](#full-reference-link), [collapsed](#collapsed-reference-link), +and [shortcut](#shortcut-reference-link). + +A [full reference link](@) +consists of a [link text] immediately followed by a [link label] +that [matches] a [link reference definition] elsewhere in the document. + +A [link label](@) begins with a left bracket (`[`) and ends +with the first right bracket (`]`) that is not backslash-escaped. +Between these brackets there must be at least one [non-whitespace character]. +Unescaped square bracket characters are not allowed inside the +opening and closing square brackets of [link labels]. A link +label can have at most 999 characters inside the square +brackets. + +One label [matches](@) +another just in case their normalized forms are equal. To normalize a +label, strip off the opening and closing brackets, +perform the *Unicode case fold*, strip leading and trailing +[whitespace] and collapse consecutive internal +[whitespace] to a single space. If there are multiple +matching reference link definitions, the one that comes first in the +document is used. (It is desirable in such cases to emit a warning.) + +The link's URI and title are provided by the matching [link +reference definition]. + +Here is a simple example: + +```````````````````````````````` example +[foo][bar] + +[bar]: /url "title" +. +

      foo

      +```````````````````````````````` + + +The rules for the [link text] are the same as with +[inline links]. Thus: + +The link text may contain balanced brackets, but not unbalanced ones, +unless they are escaped: + +```````````````````````````````` example +[link [foo [bar]]][ref] + +[ref]: /uri +. +

      link [foo [bar]]

      +```````````````````````````````` + + +```````````````````````````````` example +[link \[bar][ref] + +[ref]: /uri +. +

      link [bar

      +```````````````````````````````` + + +The link text may contain inline content: + +```````````````````````````````` example +[link *foo **bar** `#`*][ref] + +[ref]: /uri +. +

      link foo bar #

      +```````````````````````````````` + + +```````````````````````````````` example +[![moon](moon.jpg)][ref] + +[ref]: /uri +. +

      moon

      +```````````````````````````````` + + +However, links may not contain other links, at any level of nesting. + +```````````````````````````````` example +[foo [bar](/uri)][ref] + +[ref]: /uri +. +

      [foo bar]ref

      +```````````````````````````````` + + +```````````````````````````````` example +[foo *bar [baz][ref]*][ref] + +[ref]: /uri +. +

      [foo bar baz]ref

      +```````````````````````````````` + + +(In the examples above, we have two [shortcut reference links] +instead of one [full reference link].) + +The following cases illustrate the precedence of link text grouping over +emphasis grouping: + +```````````````````````````````` example +*[foo*][ref] + +[ref]: /uri +. +

      *foo*

      +```````````````````````````````` + + +```````````````````````````````` example +[foo *bar][ref]* + +[ref]: /uri +. +

      foo *bar*

      +```````````````````````````````` + + +These cases illustrate the precedence of HTML tags, code spans, +and autolinks over link grouping: + +```````````````````````````````` example +[foo <bar attr="][ref]"> + +[ref]: /uri +. +

      [foo

      +```````````````````````````````` + + +```````````````````````````````` example +[foo`][ref]` + +[ref]: /uri +. +

      [foo][ref]

      +```````````````````````````````` + + +```````````````````````````````` example +[foo<http://example.com/?search=][ref]> + +[ref]: /uri +. +

      [foohttp://example.com/?search=][ref]

      +```````````````````````````````` + + +Matching is case-insensitive: + +```````````````````````````````` example +[foo][BaR] + +[bar]: /url "title" +. +

      foo

      +```````````````````````````````` + + +Unicode case fold is used: + +```````````````````````````````` example +[ẞ] + +[SS]: /url +. +

      +```````````````````````````````` + + +Consecutive internal [whitespace] is treated as one space for +purposes of determining matching: + +```````````````````````````````` example +[Foo + bar]: /url + +[Baz][Foo bar] +. +

      Baz

      +```````````````````````````````` + + +No [whitespace] is allowed between the [link text] and the +[link label]: + +```````````````````````````````` example +[foo] [bar] + +[bar]: /url "title" +. +

      [foo] bar

      +```````````````````````````````` + + +```````````````````````````````` example +[foo] +[bar] + +[bar]: /url "title" +. +

      [foo] +bar

      +```````````````````````````````` + + +This is a departure from John Gruber's original Markdown syntax +description, which explicitly allows whitespace between the link +text and the link label. It brings reference links in line with +[inline links], which (according to both original Markdown and +this spec) cannot have whitespace after the link text. More +importantly, it prevents inadvertent capture of consecutive +[shortcut reference links]. If whitespace is allowed between the +link text and the link label, then in the following we will have +a single reference link, not two shortcut reference links, as +intended: + +``` markdown +[foo] +[bar] + +[foo]: /url1 +[bar]: /url2 +``` + +(Note that [shortcut reference links] were introduced by Gruber +himself in a beta version of `Markdown.pl`, but never included +in the official syntax description. Without shortcut reference +links, it is harmless to allow space between the link text and +link label; but once shortcut references are introduced, it is +too dangerous to allow this, as it frequently leads to +unintended results.) + +When there are multiple matching [link reference definitions], +the first is used: + +```````````````````````````````` example +[foo]: /url1 + +[foo]: /url2 + +[bar][foo] +. +

      bar

      +```````````````````````````````` + + +Note that matching is performed on normalized strings, not parsed +inline content. So the following does not match, even though the +labels define equivalent inline content: + +```````````````````````````````` example +[bar][foo\!] + +[foo!]: /url +. +

      [bar][foo!]

      +```````````````````````````````` + + +[Link labels] cannot contain brackets, unless they are +backslash-escaped: + +```````````````````````````````` example +[foo][ref[] + +[ref[]: /uri +. +

      [foo][ref[]

      +

      [ref[]: /uri

      +```````````````````````````````` + + +```````````````````````````````` example +[foo][ref[bar]] + +[ref[bar]]: /uri +. +

      [foo][ref[bar]]

      +

      [ref[bar]]: /uri

      +```````````````````````````````` + + +```````````````````````````````` example +[[[foo]]] + +[[[foo]]]: /url +. +

      [[[foo]]]

      +

      [[[foo]]]: /url

      +```````````````````````````````` + + +```````````````````````````````` example +[foo][ref\[] + +[ref\[]: /uri +. +

      foo

      +```````````````````````````````` + + +Note that in this example `]` is not backslash-escaped: + +```````````````````````````````` example +[bar\\]: /uri + +[bar\\] +. +

      bar\

      +```````````````````````````````` + + +A [link label] must contain at least one [non-whitespace character]: + +```````````````````````````````` example +[] + +[]: /uri +. +

      []

      +

      []: /uri

      +```````````````````````````````` + + +```````````````````````````````` example +[ + ] + +[ + ]: /uri +. +

      [ +]

      +

      [ +]: /uri

      +```````````````````````````````` + + +A [collapsed reference link](@) +consists of a [link label] that [matches] a +[link reference definition] elsewhere in the +document, followed by the string `[]`. +The contents of the first link label are parsed as inlines, +which are used as the link's text. The link's URI and title are +provided by the matching reference link definition. Thus, +`[foo][]` is equivalent to `[foo][foo]`. + +```````````````````````````````` example +[foo][] + +[foo]: /url "title" +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +[*foo* bar][] + +[*foo* bar]: /url "title" +. +

      foo bar

      +```````````````````````````````` + + +The link labels are case-insensitive: + +```````````````````````````````` example +[Foo][] + +[foo]: /url "title" +. +

      Foo

      +```````````````````````````````` + + + +As with full reference links, [whitespace] is not +allowed between the two sets of brackets: + +```````````````````````````````` example +[foo] +[] + +[foo]: /url "title" +. +

      foo +[]

      +```````````````````````````````` + + +A [shortcut reference link](@) +consists of a [link label] that [matches] a +[link reference definition] elsewhere in the +document and is not followed by `[]` or a link label. +The contents of the first link label are parsed as inlines, +which are used as the link's text. The link's URI and title +are provided by the matching link reference definition. +Thus, `[foo]` is equivalent to `[foo][]`. + +```````````````````````````````` example +[foo] + +[foo]: /url "title" +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +[*foo* bar] + +[*foo* bar]: /url "title" +. +

      foo bar

      +```````````````````````````````` + + +```````````````````````````````` example +[[*foo* bar]] + +[*foo* bar]: /url "title" +. +

      [foo bar]

      +```````````````````````````````` + + +```````````````````````````````` example +[[bar [foo] + +[foo]: /url +. +

      [[bar foo

      +```````````````````````````````` + + +The link labels are case-insensitive: + +```````````````````````````````` example +[Foo] + +[foo]: /url "title" +. +

      Foo

      +```````````````````````````````` + + +A space after the link text should be preserved: + +```````````````````````````````` example +[foo] bar + +[foo]: /url +. +

      foo bar

      +```````````````````````````````` + + +If you just want bracketed text, you can backslash-escape the +opening bracket to avoid links: + +```````````````````````````````` example +\[foo] + +[foo]: /url "title" +. +

      [foo]

      +```````````````````````````````` + + +Note that this is a link, because a link label ends with the first +following closing bracket: + +```````````````````````````````` example +[foo*]: /url + +*[foo*] +. +

      *foo*

      +```````````````````````````````` + + +Full and collapsed references take precedence over shortcut +references: + +```````````````````````````````` example +[foo][bar] + +[foo]: /url1 +[bar]: /url2 +. +

      foo

      +```````````````````````````````` + +```````````````````````````````` example +[foo][] + +[foo]: /url1 +. +

      foo

      +```````````````````````````````` + +Inline links also take precedence: + +```````````````````````````````` example +[foo]() + +[foo]: /url1 +. +

      foo

      +```````````````````````````````` + +```````````````````````````````` example +[foo](not a link) + +[foo]: /url1 +. +

      foo(not a link)

      +```````````````````````````````` + +In the following case `[bar][baz]` is parsed as a reference, +`[foo]` as normal text: + +```````````````````````````````` example +[foo][bar][baz] + +[baz]: /url +. +

      [foo]bar

      +```````````````````````````````` + + +Here, though, `[foo][bar]` is parsed as a reference, since +`[bar]` is defined: + +```````````````````````````````` example +[foo][bar][baz] + +[baz]: /url1 +[bar]: /url2 +. +

      foobaz

      +```````````````````````````````` + + +Here `[foo]` is not parsed as a shortcut reference, because it +is followed by a link label (even though `[bar]` is not defined): + +```````````````````````````````` example +[foo][bar][baz] + +[baz]: /url1 +[foo]: /url2 +. +

      [foo]bar

      +```````````````````````````````` + + + +## Images + +Syntax for images is like the syntax for links, with one +difference. Instead of [link text], we have an +[image description](@). The rules for this are the +same as for [link text], except that (a) an +image description starts with `![` rather than `[`, and +(b) an image description may contain links. +An image description has inline elements +as its contents. When an image is rendered to HTML, +this is standardly used as the image's `alt` attribute. + +```````````````````````````````` example +![foo](/url "title") +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +![foo *bar*] + +[foo *bar*]: train.jpg "train & tracks" +. +

      foo bar

      +```````````````````````````````` + + +```````````````````````````````` example +![foo ![bar](/url)](/url2) +. +

      foo bar

      +```````````````````````````````` + + +```````````````````````````````` example +![foo [bar](/url)](/url2) +. +

      foo bar

      +```````````````````````````````` + + +Though this spec is concerned with parsing, not rendering, it is +recommended that in rendering to HTML, only the plain string content +of the [image description] be used. Note that in +the above example, the alt attribute's value is `foo bar`, not `foo +[bar](/url)` or `foo <a href="/url">bar</a>`. Only the plain string +content is rendered, without formatting. + +```````````````````````````````` example +![foo *bar*][] + +[foo *bar*]: train.jpg "train & tracks" +. +

      foo bar

      +```````````````````````````````` + + +```````````````````````````````` example +![foo *bar*][foobar] + +[FOOBAR]: train.jpg "train & tracks" +. +

      foo bar

      +```````````````````````````````` + + +```````````````````````````````` example +![foo](train.jpg) +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +My ![foo bar](/path/to/train.jpg "title" ) +. +

      My foo bar

      +```````````````````````````````` + + +```````````````````````````````` example +![foo]() +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +![](/url) +. +

      +```````````````````````````````` + + +Reference-style: + +```````````````````````````````` example +![foo][bar] + +[bar]: /url +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +![foo][bar] + +[BAR]: /url +. +

      foo

      +```````````````````````````````` + + +Collapsed: + +```````````````````````````````` example +![foo][] + +[foo]: /url "title" +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +![*foo* bar][] + +[*foo* bar]: /url "title" +. +

      foo bar

      +```````````````````````````````` + + +The labels are case-insensitive: + +```````````````````````````````` example +![Foo][] + +[foo]: /url "title" +. +

      Foo

      +```````````````````````````````` + + +As with reference links, [whitespace] is not allowed +between the two sets of brackets: + +```````````````````````````````` example +![foo] +[] + +[foo]: /url "title" +. +

      foo +[]

      +```````````````````````````````` + + +Shortcut: + +```````````````````````````````` example +![foo] + +[foo]: /url "title" +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +![*foo* bar] + +[*foo* bar]: /url "title" +. +

      foo bar

      +```````````````````````````````` + + +Note that link labels cannot contain unescaped brackets: + +```````````````````````````````` example +![[foo]] + +[[foo]]: /url "title" +. +

      ![[foo]]

      +

      [[foo]]: /url "title"

      +```````````````````````````````` + + +The link labels are case-insensitive: + +```````````````````````````````` example +![Foo] + +[foo]: /url "title" +. +

      Foo

      +```````````````````````````````` + + +If you just want a literal `!` followed by bracketed text, you can +backslash-escape the opening `[`: + +```````````````````````````````` example +!\[foo] + +[foo]: /url "title" +. +

      ![foo]

      +```````````````````````````````` + + +If you want a link after a literal `!`, backslash-escape the +`!`: + +```````````````````````````````` example +\![foo] + +[foo]: /url "title" +. +

      !foo

      +```````````````````````````````` + + +## Autolinks + +[Autolink](@)s are absolute URIs and email addresses inside +`<` and `>`. They are parsed as links, with the URL or email address +as the link label. + +A [URI autolink](@) consists of `<`, followed by an +[absolute URI] followed by `>`. It is parsed as +a link to the URI, with the URI as the link's label. + +An [absolute URI](@), +for these purposes, consists of a [scheme] followed by a colon (`:`) +followed by zero or more characters other than ASCII +[whitespace] and control characters, `<`, and `>`. If +the URI includes these characters, they must be percent-encoded +(e.g. `%20` for a space). + +For purposes of this spec, a [scheme](@) is any sequence +of 2--32 characters beginning with an ASCII letter and followed +by any combination of ASCII letters, digits, or the symbols plus +("+"), period ("."), or hyphen ("-"). + +Here are some valid autolinks: + +```````````````````````````````` example + +. +

      http://foo.bar.baz

      +```````````````````````````````` + + +```````````````````````````````` example + +. +

      http://foo.bar.baz/test?q=hello&id=22&boolean

      +```````````````````````````````` + + +```````````````````````````````` example + +. +

      irc://foo.bar:2233/baz

      +```````````````````````````````` + + +Uppercase is also fine: + +```````````````````````````````` example + +. +

      MAILTO:FOO@BAR.BAZ

      +```````````````````````````````` + + +Note that many strings that count as [absolute URIs] for +purposes of this spec are not valid URIs, because their +schemes are not registered or because of other problems +with their syntax: + +```````````````````````````````` example + +. +

      a+b+c:d

      +```````````````````````````````` + + +```````````````````````````````` example + +. +

      made-up-scheme://foo,bar

      +```````````````````````````````` + + +```````````````````````````````` example + +. +

      http://../

      +```````````````````````````````` + + +```````````````````````````````` example + +. +

      localhost:5001/foo

      +```````````````````````````````` + + +Spaces are not allowed in autolinks: + +```````````````````````````````` example + +. +

      <http://foo.bar/baz bim>

      +```````````````````````````````` + + +Backslash-escapes do not work inside autolinks: + +```````````````````````````````` example + +. +

      http://example.com/\[\

      +```````````````````````````````` + + +An [email autolink](@) +consists of `<`, followed by an [email address], +followed by `>`. The link's label is the email address, +and the URL is `mailto:` followed by the email address. + +An [email address](@), +for these purposes, is anything that matches +the [non-normative regex from the HTML5 +spec](https://html.spec.whatwg.org/multipage/forms.html#e-mail-state-(type=email)): + + /^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])? + (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/ + +Examples of email autolinks: + +```````````````````````````````` example + +. +

      foo@bar.example.com

      +```````````````````````````````` + + +```````````````````````````````` example + +. +

      foo+special@Bar.baz-bar0.com

      +```````````````````````````````` + + +Backslash-escapes do not work inside email autolinks: + +```````````````````````````````` example + +. +

      <foo+@bar.example.com>

      +```````````````````````````````` + + +These are not autolinks: + +```````````````````````````````` example +<> +. +

      <>

      +```````````````````````````````` + + +```````````````````````````````` example +< http://foo.bar > +. +

      < http://foo.bar >

      +```````````````````````````````` + + +```````````````````````````````` example + +. +

      <m:abc>

      +```````````````````````````````` + + +```````````````````````````````` example + +. +

      <foo.bar.baz>

      +```````````````````````````````` + + +```````````````````````````````` example +http://example.com +. +

      http://example.com

      +```````````````````````````````` + + +```````````````````````````````` example +foo@bar.example.com +. +

      foo@bar.example.com

      +```````````````````````````````` + + +## Raw HTML + +Text between `<` and `>` that looks like an HTML tag is parsed as a +raw HTML tag and will be rendered in HTML without escaping. +Tag and attribute names are not limited to current HTML tags, +so custom tags (and even, say, DocBook tags) may be used. + +Here is the grammar for tags: + +A [tag name](@) consists of an ASCII letter +followed by zero or more ASCII letters, digits, or +hyphens (`-`). + +An [attribute](@) consists of [whitespace], +an [attribute name], and an optional +[attribute value specification]. + +An [attribute name](@) +consists of an ASCII letter, `_`, or `:`, followed by zero or more ASCII +letters, digits, `_`, `.`, `:`, or `-`. (Note: This is the XML +specification restricted to ASCII. HTML5 is laxer.) + +An [attribute value specification](@) +consists of optional [whitespace], +a `=` character, optional [whitespace], and an [attribute +value]. + +An [attribute value](@) +consists of an [unquoted attribute value], +a [single-quoted attribute value], or a [double-quoted attribute value]. + +An [unquoted attribute value](@) +is a nonempty string of characters not +including [whitespace], `"`, `'`, `=`, `<`, `>`, or `` ` ``. + +A [single-quoted attribute value](@) +consists of `'`, zero or more +characters not including `'`, and a final `'`. + +A [double-quoted attribute value](@) +consists of `"`, zero or more +characters not including `"`, and a final `"`. + +An [open tag](@) consists of a `<` character, a [tag name], +zero or more [attributes], optional [whitespace], an optional `/` +character, and a `>` character. + +A [closing tag](@) consists of the string `</`, a +[tag name], optional [whitespace], and the character `>`. + +An [HTML comment](@) consists of `<!--` + *text* + `-->`, +where *text* does not start with `>` or `->`, does not end with `-`, +and does not contain `--`. (See the +[HTML5 spec](http://www.w3.org/TR/html5/syntax.html#comments).) + +A [processing instruction](@) +consists of the string `<?`, a string +of characters not including the string `?>`, and the string +`?>`.
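+ +A [declaration](@) consists of the +string `<!`, a name consisting of one or more uppercase ASCII letters, +[whitespace], a string of characters not including the +character `>`, and the character `>`. + +A [CDATA section](@) consists of +the string `<![CDATA[`, a string of characters not including the string +`]]>`, and the string `]]>`. + +An [HTML tag](@) consists of an [open tag], a [closing tag], +an [HTML comment], a [processing instruction], a [declaration], +or a [CDATA section]. + +Here are some simple open tags: + +```````````````````````````````` example + +. +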

      +```````````````````````````````` + + +Empty elements: + +```````````````````````````````` example + +. +

      +```````````````````````````````` + + +[Whitespace] is allowed: + +```````````````````````````````` example + +. +

      +```````````````````````````````` + + +With attributes: + +```````````````````````````````` example + +. +

      +```````````````````````````````` + + +Custom tag names can be used: + +```````````````````````````````` example +Foo +. +

      Foo

      +```````````````````````````````` + + +Illegal tag names, not parsed as HTML: + +```````````````````````````````` example +<33> <__> +. +

      <33> <__>

      +```````````````````````````````` + + +Illegal attribute names: + +```````````````````````````````` example +
      +. +

      <a h*#ref="hi">

      +```````````````````````````````` + + +Illegal attribute values: + +```````````````````````````````` example +
      +. +

      </a href="foo">

      +```````````````````````````````` + + +Comments: + +```````````````````````````````` example +foo +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +foo +. +

      foo <!-- not a comment -- two hyphens -->

      +```````````````````````````````` + + +Not comments: + +```````````````````````````````` example +foo foo --> + +foo +. +

      foo <!--> foo -->

      +

      foo <!-- foo--->

      +```````````````````````````````` + + +Processing instructions: + +```````````````````````````````` example +foo +. +

      foo

      +```````````````````````````````` + + +Declarations: + +```````````````````````````````` example +foo +. +

      foo

      +```````````````````````````````` + + +CDATA sections: + +```````````````````````````````` example +foo &<]]> +. +

      foo &<]]>

      +```````````````````````````````` + + +Entity and numeric character references are preserved in HTML +attributes: + +```````````````````````````````` example +foo
      +. +

      foo

      +```````````````````````````````` + + +Backslash escapes do not work in HTML attributes: + +```````````````````````````````` example +foo +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example + +. +

      <a href=""">

      +```````````````````````````````` + + +## Hard line breaks + +A line break (not in a code span or HTML tag) that is preceded +by two or more spaces and does not occur at the end of a block +is parsed as a [hard line break](@) (rendered +in HTML as a `<br />
      ` tag): + +```````````````````````````````` example +foo +baz +. +

      foo
      +baz

      +```````````````````````````````` + + +For a more visible alternative, a backslash before the +[line ending] may be used instead of two spaces: + +```````````````````````````````` example +foo\ +baz +. +

      foo
      +baz

      +```````````````````````````````` + + +More than two spaces can be used: + +```````````````````````````````` example +foo +baz +. +

      foo
      +baz

      +```````````````````````````````` + + +Leading spaces at the beginning of the next line are ignored: + +```````````````````````````````` example +foo + bar +. +

      foo
      +bar

      +```````````````````````````````` + + +```````````````````````````````` example +foo\ + bar +. +

      foo
      +bar

      +```````````````````````````````` + + +Line breaks can occur inside emphasis, links, and other constructs +that allow inline content: + +```````````````````````````````` example +*foo +bar* +. +

      foo
      +bar

      +```````````````````````````````` + + +```````````````````````````````` example +*foo\ +bar* +. +

      foo
      +bar

      +```````````````````````````````` + + +Line breaks do not occur inside code spans + +```````````````````````````````` example +`code +span` +. +

      code span

      +```````````````````````````````` + + +```````````````````````````````` example +`code\ +span` +. +

      code\ span

      +```````````````````````````````` + + +or HTML tags: + +```````````````````````````````` example +
      +. +

      +```````````````````````````````` + + +```````````````````````````````` example + +. +

      +```````````````````````````````` + + +Hard line breaks are for separating inline content within a block. +Neither syntax for hard line breaks works at the end of a paragraph or +other block element: + +```````````````````````````````` example +foo\ +. +

      foo\

      +```````````````````````````````` + + +```````````````````````````````` example +foo +. +

      foo

      +```````````````````````````````` + + +```````````````````````````````` example +### foo\ +. +

      foo\

      +```````````````````````````````` + + +```````````````````````````````` example +### foo +. +

      foo

      +```````````````````````````````` + + +## Soft line breaks + +A regular line break (not in a code span or HTML tag) that is not +preceded by two or more spaces or a backslash is parsed as a +[softbreak](@). (A softbreak may be rendered in HTML either as a +[line ending] or as a space. The result will be the same in +browsers. In the examples here, a [line ending] will be used.) + +```````````````````````````````` example +foo +baz +. +

      foo +baz

      +```````````````````````````````` + + +Spaces at the end of the line and beginning of the next line are +removed: + +```````````````````````````````` example +foo + baz +. +

      foo +baz

      +```````````````````````````````` + + +A conforming parser may render a soft line break in HTML either as a +line break or as a space. + +A renderer may also provide an option to render soft line breaks +as hard line breaks. + +## Textual content + +Any characters not given an interpretation by the above rules will +be parsed as plain textual content. + +```````````````````````````````` example +hello $.;'there +. +

      hello $.;'there

      +```````````````````````````````` + + +```````````````````````````````` example +Foo χρῆν +. +

      Foo χρῆν

      +```````````````````````````````` + + +Internal spaces are preserved verbatim: + +```````````````````````````````` example +Multiple spaces +. +

      Multiple spaces

      +```````````````````````````````` + + + + +# Appendix: A parsing strategy + +In this appendix we describe some features of the parsing strategy +used in the CommonMark reference implementations. + +## Overview + +Parsing has two phases: + +1. In the first phase, lines of input are consumed and the block +structure of the document---its division into paragraphs, block quotes, +list items, and so on---is constructed. Text is assigned to these +blocks but not parsed. Link reference definitions are parsed and a +map of links is constructed. + +2. In the second phase, the raw text contents of paragraphs and headings +are parsed into sequences of Markdown inline elements (strings, +code spans, links, emphasis, and so on), using the map of link +references constructed in phase 1. + +At each point in processing, the document is represented as a tree of +**blocks**. The root of the tree is a `document` block. The `document` +may have any number of other blocks as **children**. These children +may, in turn, have other blocks as children. The last child of a block +is normally considered **open**, meaning that subsequent lines of input +can alter its contents. (Blocks that are not open are **closed**.) +Here, for example, is a possible document tree, with the open blocks +marked by arrows: + +``` tree +-> document + -> block_quote + paragraph + "Lorem ipsum dolor\nsit amet." + -> list (type=bullet tight=true bullet_char=-) + list_item + paragraph + "Qui *quodsi iracundia*" + -> list_item + -> paragraph + "aliquando id" +``` + +## Phase 1: block structure + +Each line that is processed has an effect on this tree. The line is +analyzed and, depending on its contents, the document may be altered +in one or more of the following ways: + +1. One or more open blocks may be closed. +2. One or more new blocks may be created as children of the + last open block. +3. Text may be added to the last (deepest) open block remaining + on the tree. 
+ +Once a line has been incorporated into the tree in this way, +it can be discarded, so input can be read in a stream. + +For each line, we follow this procedure: + +1. First we iterate through the open blocks, starting with the +root document, and descending through last children down to the last +open block. Each block imposes a condition that the line must satisfy +if the block is to remain open. For example, a block quote requires a +`>` character. A paragraph requires a non-blank line. +In this phase we may match all or just some of the open +blocks. But we cannot close unmatched blocks yet, because we may have a +[lazy continuation line]. + +2. Next, after consuming the continuation markers for existing +blocks, we look for new block starts (e.g. `>` for a block quote). +If we encounter a new block start, we close any blocks unmatched +in step 1 before creating the new block as a child of the last +matched block. + +3. Finally, we look at the remainder of the line (after block +markers like `>`, list markers, and indentation have been consumed). +This is text that can be incorporated into the last open +block (a paragraph, code block, heading, or raw HTML). + +Setext headings are formed when we see a line of a paragraph +that is a [setext heading underline]. + +Reference link definitions are detected when a paragraph is closed; +the accumulated text lines are parsed to see if they begin with +one or more reference link definitions. Any remainder becomes a +normal paragraph. + +We can see how this works by considering how the tree above is +generated by four lines of Markdown: + +``` markdown +> Lorem ipsum dolor +sit amet. 
+> - Qui *quodsi iracundia* +> - aliquando id +``` + +At the outset, our document model is just + +``` tree +-> document +``` + +The first line of our text, + +``` markdown +> Lorem ipsum dolor +``` + +causes a `block_quote` block to be created as a child of our +open `document` block, and a `paragraph` block as a child of +the `block_quote`. Then the text is added to the last open +block, the `paragraph`: + +``` tree +-> document + -> block_quote + -> paragraph + "Lorem ipsum dolor" +``` + +The next line, + +``` markdown +sit amet. +``` + +is a "lazy continuation" of the open `paragraph`, so it gets added +to the paragraph's text: + +``` tree +-> document + -> block_quote + -> paragraph + "Lorem ipsum dolor\nsit amet." +``` + +The third line, + +``` markdown +> - Qui *quodsi iracundia* +``` + +causes the `paragraph` block to be closed, and a new `list` block +opened as a child of the `block_quote`. A `list_item` is also +added as a child of the `list`, and a `paragraph` as a child of +the `list_item`. The text is then added to the new `paragraph`: + +``` tree +-> document + -> block_quote + paragraph + "Lorem ipsum dolor\nsit amet." + -> list (type=bullet tight=true bullet_char=-) + -> list_item + -> paragraph + "Qui *quodsi iracundia*" +``` + +The fourth line, + +``` markdown +> - aliquando id +``` + +causes the `list_item` (and its child the `paragraph`) to be closed, +and a new `list_item` opened up as child of the `list`. A `paragraph` +is added as a child of the new `list_item`, to contain the text. +We thus obtain the final tree: + +``` tree +-> document + -> block_quote + paragraph + "Lorem ipsum dolor\nsit amet." + -> list (type=bullet tight=true bullet_char=-) + list_item + paragraph + "Qui *quodsi iracundia*" + -> list_item + -> paragraph + "aliquando id" +``` + +## Phase 2: inline structure + +Once all of the input has been parsed, all open blocks are closed. 
+ +We then "walk the tree," visiting every node, and parse raw +string contents of paragraphs and headings as inlines. At this +point we have seen all the link reference definitions, so we can +resolve reference links as we go. + +``` tree +document + block_quote + paragraph + str "Lorem ipsum dolor" + softbreak + str "sit amet." + list (type=bullet tight=true bullet_char=-) + list_item + paragraph + str "Qui " + emph + str "quodsi iracundia" + list_item + paragraph + str "aliquando id" +``` + +Notice how the [line ending] in the first paragraph has +been parsed as a `softbreak`, and the asterisks in the first list item +have become an `emph`. + +### An algorithm for parsing nested emphasis and links + +By far the trickiest part of inline parsing is handling emphasis, +strong emphasis, links, and images. This is done using the following +algorithm. + +When we're parsing inlines and we hit either + +- a run of `*` or `_` characters, or +- a `[` or `![` + +we insert a text node with these symbols as its literal content, and we +add a pointer to this text node to the [delimiter stack](@). + +The [delimiter stack] is a doubly linked list. Each +element contains a pointer to a text node, plus information about + +- the type of delimiter (`[`, `![`, `*`, `_`) +- the number of delimiters, +- whether the delimiter is "active" (all are active to start), and +- whether the delimiter is a potential opener, a potential closer, + or both (which depends on what sort of characters precede + and follow the delimiters). + +When we hit a `]` character, we call the *look for link or image* +procedure (see below). + +When we hit the end of the input, we call the *process emphasis* +procedure (see below), with `stack_bottom` = NULL. + +#### *look for link or image* + +Starting at the top of the delimiter stack, we look backwards +through the stack for an opening `[` or `![` delimiter. + +- If we don't find one, we return a literal text node `]`. 
+ +- If we do find one, but it's not *active*, we remove the inactive + delimiter from the stack, and return a literal text node `]`. + +- If we find one and it's active, then we parse ahead to see if + we have an inline link/image, reference link/image, compact reference + link/image, or shortcut reference link/image. + + + If we don't, then we remove the opening delimiter from the + delimiter stack and return a literal text node `]`. + + + If we do, then + + * We return a link or image node whose children are the inlines + after the text node pointed to by the opening delimiter. + + * We run *process emphasis* on these inlines, with the `[` opener + as `stack_bottom`. + + * We remove the opening delimiter. + + * If we have a link (and not an image), we also set all + `[` delimiters before the opening delimiter to *inactive*. (This + will prevent us from getting links within links.) + +#### *process emphasis* + +Parameter `stack_bottom` sets a lower bound to how far we +descend in the [delimiter stack]. If it is NULL, we can +go all the way to the bottom. Otherwise, we stop before +visiting `stack_bottom`. + +Let `current_position` point to the element on the [delimiter stack] +just above `stack_bottom` (or the first element if `stack_bottom` +is NULL). + +We keep track of the `openers_bottom` for each delimiter +type (`*`, `_`) and each length of the closing delimiter run +(modulo 3). Initialize this to `stack_bottom`. + +Then we repeat the following until we run out of potential +closers: + +- Move `current_position` forward in the delimiter stack (if needed) + until we find the first potential closer with delimiter `*` or `_`. + (This will be the potential closer closest + to the beginning of the input -- the first one in parse order.) + +- Now, look back in the stack (staying above `stack_bottom` and + the `openers_bottom` for this delimiter type) for the + first matching potential opener ("matching" means same delimiter). 
+ +- If one is found: + + + Figure out whether we have emphasis or strong emphasis: + if both closer and opener spans have length >= 2, we have + strong, otherwise regular. + + + Insert an emph or strong emph node accordingly, after + the text node corresponding to the opener. + + + Remove any delimiters between the opener and closer from + the delimiter stack. + + + Remove 1 (for regular emph) or 2 (for strong emph) delimiters + from the opening and closing text nodes. If they become empty + as a result, remove them and remove the corresponding element + of the delimiter stack. If the closing node is removed, reset + `current_position` to the next element in the stack. + +- If none is found: + + + Set `openers_bottom` to the element before `current_position`. + (We know that there are no openers for this kind of closer up to and + including this point, so this puts a lower bound on future searches.) + + + If the closer at `current_position` is not a potential opener, + remove it from the delimiter stack (since we know it can't + be a closer either). + + + Advance `current_position` to the next element in the stack. + +After we're done, we remove all delimiters above `stack_bottom` from the +delimiter stack. 
+ diff --git a/test/spec_tests.py b/test/spec_tests.py new file mode 100755 index 0000000..c739e5f --- /dev/null +++ b/test/spec_tests.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import sys +from difflib import unified_diff +import argparse +import re +import json +from cmark import CMark +from normalize import normalize_html + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Run cmark tests.') + parser.add_argument('-p', '--program', dest='program', nargs='?', default=None, + help='program to test') + parser.add_argument('-s', '--spec', dest='spec', nargs='?', default='spec.txt', + help='path to spec') + parser.add_argument('-P', '--pattern', dest='pattern', nargs='?', + default=None, help='limit to sections matching regex pattern') + parser.add_argument('--library-dir', dest='library_dir', nargs='?', + default=None, help='directory containing dynamic library') + parser.add_argument('--no-normalize', dest='normalize', + action='store_const', const=False, default=True, + help='do not normalize HTML') + parser.add_argument('-d', '--dump-tests', dest='dump_tests', + action='store_const', const=True, default=False, + help='dump tests in JSON format') + parser.add_argument('--debug-normalization', dest='debug_normalization', + action='store_const', const=True, + default=False, help='filter stdin through normalizer for testing') + parser.add_argument('-n', '--number', type=int, default=None, + help='only consider the test with the given number') + args = parser.parse_args(sys.argv[1:]) + +def out(str): + sys.stdout.buffer.write(str.encode('utf-8')) + +def print_test_header(headertext, example_number, start_line, end_line): + out("Example %d (lines %d-%d) %s\n" % (example_number,start_line,end_line,headertext)) + +def do_test(test, normalize, result_counts): + [retcode, actual_html, err] = cmark.to_html(test['markdown']) + if retcode == 0: + expected_html = test['html'] + unicode_error = None + if normalize: + try: 
+ passed = normalize_html(actual_html) == normalize_html(expected_html) + except UnicodeDecodeError as e: + unicode_error = e + passed = False + else: + passed = actual_html == expected_html + if passed: + result_counts['pass'] += 1 + else: + print_test_header(test['section'], test['example'], test['start_line'], test['end_line']) + out(test['markdown'] + '\n') + if unicode_error: + out("Unicode error: " + str(unicode_error) + '\n') + out("Expected: " + repr(expected_html) + '\n') + out("Got: " + repr(actual_html) + '\n') + else: + expected_html_lines = expected_html.splitlines(True) + actual_html_lines = actual_html.splitlines(True) + for diffline in unified_diff(expected_html_lines, actual_html_lines, + "expected HTML", "actual HTML"): + out(diffline) + out('\n') + result_counts['fail'] += 1 + else: + print_test_header(test['section'], test['example'], test['start_line'], test['end_line']) + out("program returned error code %d\n" % retcode) + sys.stdout.buffer.write(err) + result_counts['error'] += 1 + +def get_tests(specfile): + line_number = 0 + start_line = 0 + end_line = 0 + example_number = 0 + markdown_lines = [] + html_lines = [] + state = 0 # 0 regular text, 1 markdown example, 2 html output + headertext = '' + tests = [] + + header_re = re.compile('#+ ') + + with open(specfile, 'r', encoding='utf-8', newline='\n') as specf: + for line in specf: + line_number = line_number + 1 + l = line.strip() + #if l == "`" * 32 + " example": + if re.match("`{32} example( [a-z]{1,})?", l): + state = 1 + elif state == 2 and l == "`" * 32: + state = 0 + example_number = example_number + 1 + end_line = line_number + tests.append({ + "markdown":''.join(markdown_lines).replace('→',"\t"), + "html":''.join(html_lines).replace('→',"\t"), + "example": example_number, + "start_line": start_line, + "end_line": end_line, + "section": headertext}) + start_line = 0 + markdown_lines = [] + html_lines = [] + elif l == ".": + state = 2 + elif state == 1: + if start_line == 0: + 
start_line = line_number - 1 + markdown_lines.append(line) + elif state == 2: + html_lines.append(line) + elif state == 0 and re.match(header_re, line): + headertext = header_re.sub('', line).strip() + return tests + +if __name__ == "__main__": + if args.debug_normalization: + out(normalize_html(sys.stdin.read())) + exit(0) + + all_tests = get_tests(args.spec) + if args.pattern: + pattern_re = re.compile(args.pattern, re.IGNORECASE) + else: + pattern_re = re.compile('.') + tests = [ test for test in all_tests if re.search(pattern_re, test['section']) and (not args.number or test['example'] == args.number) ] + if args.dump_tests: + out(json.dumps(tests, ensure_ascii=False, indent=2)) + exit(0) + else: + skipped = len(all_tests) - len(tests) + cmark = CMark(prog=args.program, library_dir=args.library_dir) + result_counts = {'pass': 0, 'fail': 0, 'error': 0, 'skip': skipped} + for test in tests: + do_test(test, args.normalize, result_counts) + out("{pass} passed, {fail} failed, {error} errored, {skip} skipped\n".format(**result_counts)) + exit(result_counts['fail'] + result_counts['error']) diff --git a/test/strikethrough.txt b/test/strikethrough.txt new file mode 100644 index 0000000..884ce59 --- /dev/null +++ b/test/strikethrough.txt @@ -0,0 +1,75 @@ + +# Strike-Through + +With the flag `MD_FLAG_STRIKETHROUGH`, MD4C enables extension for recognition +of strike-through spans. + +Strike-through text is any text wrapped in one or two tildes (`~`). + +```````````````````````````````` example +~Hi~ Hello, world! +. +

      Hi Hello, world!

      +```````````````````````````````` + +If the length of the opener and closer doesn't match, the strike-through is +not recognized. + +```````````````````````````````` example +This ~text~~ is curious. +. +

      This ~text~~ is curious.

      +```````````````````````````````` + +Too long tilde sequence won't be recognized: + +```````````````````````````````` example +foo ~~~bar~~~ +. +

      foo ~~~bar~~~

      +```````````````````````````````` + +Also note the markers cannot open a strike-through span if they are followed +with a whitespace; and similarly, they cannot close the span if they are +preceded with a whitespace: + +```````````````````````````````` example +~foo ~bar +. +

      ~foo ~bar

      +```````````````````````````````` + + +As with regular emphasis delimiters, a new paragraph will cause the cessation +of parsing a strike-through: + +```````````````````````````````` example +This ~~has a + +new paragraph~~. +. +

      This ~~has a

      +

      new paragraph~~.

      +```````````````````````````````` + + +## GitHub Issues + +### [Issue 69](https://github.com/mity/md4c/issues/69) +```````````````````````````````` example +~`foo`~ +. +

      foo

      +```````````````````````````````` + +```````````````````````````````` example +~*foo*~ +. +

      foo

      +```````````````````````````````` + +```````````````````````````````` example +*~foo~* +. +

      foo

      +```````````````````````````````` diff --git a/test/tables.txt b/test/tables.txt new file mode 100644 index 0000000..80147ab --- /dev/null +++ b/test/tables.txt @@ -0,0 +1,363 @@ + +# Tables + +With the flag `MD_FLAG_TABLES`, MD4C enables extension for recognition of +tables. + +Basic table example of a table with two columns and three lines (when not +counting the header) is as follows: + +```````````````````````````````` example +| Column 1 | Column 2 | +|----------|----------| +| foo | bar | +| baz | qux | +| quux | quuz | +. + + + + + + + + + +
      Column 1Column 2
      foobar
      bazqux
      quuxquuz
      +```````````````````````````````` + +The leading and succeeding pipe characters (`|`) on each line are optional: + +```````````````````````````````` example +Column 1 | Column 2 | +---------|--------- | +foo | bar | +baz | qux | +quux | quuz | +. + + + + + + + + + +
      Column 1Column 2
      foobar
      bazqux
      quuxquuz
      +```````````````````````````````` + +```````````````````````````````` example +| Column 1 | Column 2 +|----------|--------- +| foo | bar +| baz | qux +| quux | quuz +. + + + + + + + + + +
      Column 1Column 2
      foobar
      bazqux
      quuxquuz
      +```````````````````````````````` + +```````````````````````````````` example +Column 1 | Column 2 +---------|--------- +foo | bar +baz | qux +quux | quuz +. + + + + + + + + + +
      Column 1Column 2
      foobar
      bazqux
      quuxquuz
      +```````````````````````````````` + +However for one-column table, at least one pipe has to be used in the table +header underline, otherwise it would be parsed as a Setext title followed by +a paragraph. + +```````````````````````````````` example +Column 1 +-------- +foo +baz +quux +. +

      Column 1

      +

      foo +baz +quux

      +```````````````````````````````` + +Leading and trailing whitespace in a table cell is ignored and the columns do +not need to be aligned. + +```````````````````````````````` example +Column 1 |Column 2 +---|--- +foo | bar +baz| qux +quux|quuz +. + + + + + + + + + +
      Column 1Column 2
      foobar
      bazqux
      quuxquuz
      +```````````````````````````````` + +The table cannot interrupt a paragraph. + +```````````````````````````````` example +Lorem ipsum dolor sit amet. +| Column 1 | Column 2 +| ---------|--------- +| foo | bar +| baz | qux +| quux | quuz +. +

      Lorem ipsum dolor sit amet. +| Column 1 | Column 2 +| ---------|--------- +| foo | bar +| baz | qux +| quux | quuz

      +```````````````````````````````` + +Similarly, paragraph cannot interrupt a table: + +```````````````````````````````` example +Column 1 | Column 2 +---------|--------- +foo | bar +baz | qux +quux | quuz +Lorem ipsum dolor sit amet. +. + + + + + + + + + + +
      Column 1Column 2
      foobar
      bazqux
      quuxquuz
      Lorem ipsum dolor sit amet.
      +```````````````````````````````` + +The underline of the table is crucial for recognition of the table, count of +its columns and their alignment: The line has to contain at least one pipe, +and it has to provide at least three dash (`-`) characters for every column in +the table. + +Thus this is not a table because there are too few dashes for Column 2. + +```````````````````````````````` example +| Column 1 | Column 2 +| ---------|-- +| foo | bar +| baz | qux +| quux | quuz +. +

      | Column 1 | Column 2 +| ---------|-- +| foo | bar +| baz | qux +| quux | quuz

      +```````````````````````````````` + +The first, the last or both the first and the last dash in each column +underline can be replaced with a colon (`:`) to request left, right or middle +alignment of the respective column: + +```````````````````````````````` example +| Column 1 | Column 2 | Column 3 | Column 4 | +|----------|:---------|:--------:|---------:| +| default | left | center | right | +. + + + + + + + +
      Column 1Column 2Column 3Column 4
      defaultleftcenterright
      +```````````````````````````````` + +To include a literal pipe character in any cell, it has to be escaped. + +```````````````````````````````` example +Column 1 | Column 2 +---------|--------- +foo | bar +baz | qux \| xyzzy +quux | quuz +. + + + + + + + + + +
      Column 1Column 2
      foobar
      bazqux | xyzzy
      quuxquuz
      +```````````````````````````````` + +Contents of each cell is parsed as an inline text which may contain any +inline Markdown spans like emphasis, strong emphasis, links etc. + +```````````````````````````````` example +Column 1 | Column 2 +---------|--------- +*foo* | bar +**baz** | [qux] +quux | [quuz](/url2) + +[qux]: /url +. + + + + + + + + + +
      Column 1Column 2
      foobar
      bazqux
      quuxquuz
      +```````````````````````````````` + +However pipes which are inside a code span are not recognized as cell +boundaries. + +```````````````````````````````` example +Column 1 | Column 2 +---------|--------- +`foo | bar` +baz | qux +quux | quuz +. + + + + + + + + + +
      Column 1Column 2
      foo | bar
      bazqux
      quuxquuz
      +```````````````````````````````` + + +## GitHub Issues + +### [Issue 41](https://github.com/mity/md4c/issues/41) +```````````````````````````````` example +* x|x +---|--- +. +
        +
      • x|x +---|---
      • +
      +```````````````````````````````` +(Not a table, because the underline has wrong indentation and is not part of the +list item.) + +```````````````````````````````` example +* x|x + ---|--- +x|x +. +
        +
      • + + + + + + + + +
        xx
        +
      • +
      +

      x|x

      +```````````````````````````````` +(Here the underline has the right indentation so the table is detected. +But the last line is not part of it due to its indentation.) + + +### [Issue 42](https://github.com/mity/md4c/issues/42) + +```````````````````````````````` example +] http://x.x *x* + +|x|x| +|---|---| +|x| +. +

      ] http://x.x x

      + + + + + + + + + + + + + +
      xx
      x
      +```````````````````````````````` + + +### [Issue 104](https://github.com/mity/md4c/issues/104) + +```````````````````````````````` example +A | B +--- | --- +[x](url) +. + + + + + + + + + + + + + +
      AB
      x
      +```````````````````````````````` diff --git a/test/tasklists.txt b/test/tasklists.txt new file mode 100644 index 0000000..aae1bf8 --- /dev/null +++ b/test/tasklists.txt @@ -0,0 +1,117 @@ + +# Tasklists + +With the flag `MD_FLAG_TASKLISTS`, MD4C enables extension for recognition of +task lists. + +Basic task list may look as follows: + +```````````````````````````````` example + * [x] foo + * [X] bar + * [ ] baz +. +
        +
      • foo
      • +
      • bar
      • +
      • baz
      • +
      +```````````````````````````````` + +Task lists can also be in ordered lists: + +```````````````````````````````` example + 1. [x] foo + 2. [X] bar + 3. [ ] baz +. +
        +
      1. foo
      2. +
      3. bar
      4. +
      5. baz
      6. +
      +```````````````````````````````` + +Task lists can also be nested in ordinary lists: + +```````````````````````````````` example + * xxx: + * [x] foo + * [x] bar + * [ ] baz + * yyy: + * [ ] qux + * [x] quux + * [ ] quuz +. +
        +
      • xxx: +
          +
        • foo
        • +
        • bar
        • +
        • baz
        • +
      • +
      • yyy: +
          +
        • qux
        • +
        • quux
        • +
        • quuz
        • +
      • +
      +```````````````````````````````` + +Or in a parent task list: + +```````````````````````````````` example + 1. [x] xxx: + * [x] foo + * [x] bar + * [ ] baz + 2. [ ] yyy: + * [ ] qux + * [x] quux + * [ ] quuz +. +
        +
      1. xxx: +
          +
        • foo
        • +
        • bar
        • +
        • baz
        • +
      2. +
      3. yyy: +
          +
        • qux
        • +
        • quux
        • +
        • quuz
        • +
      4. +
      +```````````````````````````````` + +Also, ordinary lists can be nested in the task lists. + +```````````````````````````````` example + * [x] xxx: + * foo + * bar + * baz + * [ ] yyy: + * qux + * quux + * quuz +. +
        +
      • xxx: +
          +
        • foo
        • +
        • bar
        • +
        • baz
        • +
      • +
      • yyy: +
          +
        • qux
        • +
        • quux
        • +
        • quuz
        • +
      • +
      +```````````````````````````````` diff --git a/test/underline.txt b/test/underline.txt new file mode 100644 index 0000000..35e80b6 --- /dev/null +++ b/test/underline.txt @@ -0,0 +1,39 @@ + +# Underline + +With the flag `MD_FLAG_UNDERLINE`, MD4C sees underscore `_` rather as a mark +denoting an underlined span rather than an ordinary emphasis (or a strong +emphasis). + +```````````````````````````````` example +_foo_ +. +

      foo

      +```````````````````````````````` + +In sequences of multiple underscores, each single one translates into an +underline span mark. + +```````````````````````````````` example +___foo___ +. +

      foo

      +```````````````````````````````` + +Intra-word underscores are not recognized as underline marks: + +```````````````````````````````` example +foo_bar_baz +. +

      foo_bar_baz

      +```````````````````````````````` + +Also the parser follows the standard understanding when the underscore can +or cannot open or close a span. Therefore there is no underline in the following +example because no underscore can be seen as a closing mark. + +```````````````````````````````` example +_foo _bar +. +

      _foo _bar

      +```````````````````````````````` diff --git a/test/wiki-links.txt b/test/wiki-links.txt new file mode 100644 index 0000000..c8afe71 --- /dev/null +++ b/test/wiki-links.txt @@ -0,0 +1,232 @@ + +# Wiki Links + +With the flag `MD_FLAG_WIKILINKS`, MD4C recognizes wiki links. + +The simple wiki-link is a wiki-link destination enclosed in `[[` followed with +`]]`. + +```````````````````````````````` example +[[foo]] +. +

      foo

      +```````````````````````````````` + +However wiki-link may contain an explicit label, delimited from the destination +with `|`. + +```````````````````````````````` example +[[foo|bar]] +. +

      bar

      +```````````````````````````````` + +A wiki-link destination cannot be empty. + +```````````````````````````````` example +[[]] +. +

      [[]]

      +```````````````````````````````` + +```````````````````````````````` example +[[|foo]] +. +

      [[|foo]]

      +```````````````````````````````` + + +The wiki-link destination cannot contain a new line. + +```````````````````````````````` example +[[foo +bar]] +. +

      [[foo +bar]]

      +```````````````````````````````` + +```````````````````````````````` example +[[foo +bar|baz]] +. +

      [[foo +bar|baz]]

      +```````````````````````````````` + +The wiki-link destination is rendered verbatim; inline markup in it is not +recognized. + +```````````````````````````````` example +[[*foo*]] +. +

      *foo*

      +```````````````````````````````` + +```````````````````````````````` example +[[foo|![bar](bar.jpg)]] +. +

      bar

      +```````````````````````````````` + +With multiple `|` delimiters, only the first one is recognized and the other +ones are part of the label. + +```````````````````````````````` example +[[foo|bar|baz]] +. +

      bar|baz

      +```````````````````````````````` + +However the delimiter `|` can be escaped with `\`. + +```````````````````````````````` example +[[foo\|bar|baz]] +. +

      baz

      +```````````````````````````````` + +The label can contain inline elements. + +```````````````````````````````` example +[[foo|*bar*]] +. +

      bar

      +```````````````````````````````` + +Empty explicit label is the same as using the implicit label; i.e. the verbatim +destination string is used as the label. + +```````````````````````````````` example +[[foo|]] +. +

      foo

      +```````````````````````````````` + +The label can span multiple lines. + +```````````````````````````````` example +[[foo|foo +bar +baz]] +. +

      foo +bar +baz

      +```````````````````````````````` + +Wiki-links have higher priority than links. + +```````````````````````````````` example +[[foo]](foo.jpg) +. +

      foo(foo.jpg)

      +```````````````````````````````` + +```````````````````````````````` example +[foo]: /url + +[[foo]] +. +

      foo

      +```````````````````````````````` + +Wiki links can be inlined in tables. + +```````````````````````````````` example +| A | B | +|------------------|-----| +| [[foo|*bar*]] | baz | +. + + + + + + + + + + + + + +
      AB
      barbaz
      +```````````````````````````````` + +Wiki-links are not prioritized over images. + +```````````````````````````````` example +![[foo]](foo.jpg) +. +

      [foo]

      +```````````````````````````````` + +Something that may look like a wiki-link at first, but turns out not to be, +is recognized as a normal link. + +```````````````````````````````` example +[[foo] + +[foo]: /url +. +

      [foo

      +```````````````````````````````` + +Escaping the opening `[` escapes only that one character, not the whole `[[` +opener: + +```````````````````````````````` example +\[[foo]] + +[foo]: /url +. +

      [foo]

      +```````````````````````````````` + +Like with other inline links, the innermost wiki-link is preferred. + +```````````````````````````````` example +[[foo[[bar]]]] +. +

      [[foobar]]

      +```````````````````````````````` + +There is a limit of 100 characters for the wiki-link destination. + +```````````````````````````````` example +[[12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901]] +[[12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901|foo]] +. +

      [[12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901]] +[[12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901|foo]]

      +```````````````````````````````` + +100 characters inside a wiki link target works. + +```````````````````````````````` example +[[1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890]] +[[1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890|foo]] +. +

      1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890 +foo

      +```````````````````````````````` + +The limit on link content does not include any characters belonging to a block +quote, if the label spans multiple lines contained in a block quote. + +```````````````````````````````` example +> [[12345678901234567890123456789012345678901234567890|1234567890 +> 1234567890 +> 1234567890 +> 1234567890 +> 123456789]] +. +
      +

      1234567890 +1234567890 +1234567890 +1234567890 +123456789

      +
      +````````````````````````````````