Unicode normalization
Unicode defines equivalence between code point sequences, meaning that the same character (glyph) can be encoded in more than one way.
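Example: "é" can be encoded either as the single code point U+00E9 or as "e" (U+0065) followed by the combining acute accent (U+0301).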
Source: Unicode Annex #15: Unicode Normalization Forms
To solve this issue, normalization is performed. Normalization is the conversion of all characters in a given set to one of the normalization forms. The best source is Annex #15 of the Unicode Standard.
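There are four normalization forms: NFD (canonical decomposition), NFC (canonical decomposition followed by canonical composition), NFKD (compatibility decomposition), and NFKC (compatibility decomposition followed by canonical composition):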
Source: Unicode Annex #15: Unicode Normalization Forms
Normalization gist
Simple Deno code to normalize the file and directory names inside a directory to NFC:
import * as path from "https://deno.land/std@0.139.0/path/mod.ts";
// Script entry point: renames every non-hidden entry under `src` whose name is not
// already in Unicode Normalization Form C (NFC).
const src = "path/to/dir/to/normalize"
const filepathsAbsolute = await getAllFiles(src, { filter: skipHiddenFilesFilter })
// getAllFiles passes every path through Deno.realPathSync, so the base directory must be
// resolved the same way; otherwise a symlink anywhere in `src` yields "../"-laden relative paths.
const srcReal = Deno.realPathSync(src)
const filepathsRelative = filepathsAbsolute.map(fPath => path.relative(srcReal, fPath))
// Reverse lexicographic order puts every descendant before its ancestors (a parent path is
// always a strict prefix of its children), so a directory is renamed only after everything
// inside it has been handled and the children's source paths are still valid.
filepathsRelative.sort().reverse()
for (const fPath of filepathsRelative) {
  const parentDir = path.dirname(fPath)
  const entryName = path.basename(fPath)
  // Normalize only the last path component: the parent directories keep their current
  // on-disk names until their own turn comes later in the loop.
  const normalizedName = entryName.normalize("NFC")
  if (entryName !== normalizedName) {
    const mvSrc = path.join(srcReal, fPath)
    const mvDst = path.join(srcReal, parentDir, normalizedName)
    Deno.renameSync(mvSrc, mvDst)
    console.log(`Normalized: ${path.join(parentDir, normalizedName)}`)
  }
}
/**
 * Lists every entry below `currentPath` that is accepted by `o.filter`.
 *
 * @param currentPath Directory to scan.
 * @param o Options; `o.filter` decides which directory entries are kept.
 * @returns The paths collected by `_getAllFiles`, each passed through `Deno.realPathSync`.
 */
async function getAllFiles(
  currentPath: string,
  o: { filter: (e: Deno.DirEntry) => boolean }
) {
  const collected = await _getAllFiles(currentPath, o)
  const resolved: string[] = []
  for (const entryPath of collected) {
    resolved.push(Deno.realPathSync(entryPath))
  }
  return resolved
}
/**
 * Directory-entry filter that rejects hidden entries, i.e. those whose name starts with ".".
 *
 * Declared as a function (hoisted) rather than a `const` arrow: top-level code earlier in
 * this module references it, and a `const` binding would still be in its temporal dead zone
 * at that point, throwing a ReferenceError.
 *
 * @param e Directory entry to test.
 * @returns `true` when the entry should be kept, `false` when its name starts with ".".
 */
function skipHiddenFilesFilter(e: Deno.DirEntry): boolean {
  return !e.name.startsWith(".")
}
/**
 * Recursively collects the paths of all entries below `currentPath` that pass `o.filter`.
 *
 * Each returned path is `currentPath` joined with the entry names by "/"; no path
 * resolution is done here. Entries rejected by the filter are skipped, and rejected
 * directories are not descended into.
 *
 * @param currentPath Directory whose entries are read with `Deno.readDir`.
 * @param o Options; `o.filter` returns `true` for entries to keep.
 * @returns Paths of the accepted entries, each directory followed by its contents.
 */
async function _getAllFiles(
  currentPath: string,
  o: { filter: (e: Deno.DirEntry) => boolean }
): Promise<string[]> {
  const names: string[] = [];
  for await (const dirEntry of Deno.readDir(currentPath)) {
    if (!o.filter(dirEntry)) continue;
    const entryPath = `${currentPath}/${dirEntry.name}`;
    names.push(entryPath);
    if (dirEntry.isDirectory) {
      // Recurse into this helper rather than the public wrapper, so every returned path
      // has the same unresolved form and real-path resolution is not repeated per level.
      const nested = await _getAllFiles(entryPath, o);
      // Append one by one: spreading a very large array into push() can exceed the
      // engine's argument limit and throw a RangeError.
      for (const nestedPath of nested) {
        names.push(nestedPath);
      }
    }
  }
  return names;
}