Spaces:

soiz1
/

s4s-editor

Running

s4s-editor / detection_ja.sh

Update detection_ja.sh

d3cce16 verified 15 days ago

1.28 kB

	#!/bin/bash
	#
	# Scans the current directory tree for text-like source files and reports
	# every distinct word that contains Japanese characters, grouped by file.
	#
	# Usage:   ./detection_ja.sh        (run from the directory to scan)
	# Output:  a JSON-style object on stdout, one key per file that has matches:
	#            {
	#            "./path/to/file.js": {
	#             "<word>": [],
	#             "<word>": []
	#            }
	#            }
	#          or "{}" when nothing was found.
	# Needs:   find, sort, file, iconv, perl.

	# Target file extensions (matched case-sensitively against the file name).
	readonly -a EXTENSIONS=(js mjs jsx json txt html htm json5 md css)

	#######################################
	# Escapes a string for use inside a double-quoted JSON string.
	# Handles backslash, double quote, newline, tab and carriage return;
	# other control characters are passed through unchanged.
	# Arguments: $1 - raw string
	# Outputs:   escaped string on stdout (no trailing newline)
	#######################################
	json_escape() {
	  local text=$1
	  text=${text//\\/\\\\}   # backslash first, so later escapes are not doubled
	  text=${text//\"/\\\"}
	  text=${text//$'\n'/\\n}
	  text=${text//$'\t'/\\t}
	  text=${text//$'\r'/\\r}
	  printf '%s' "$text"
	}

	#######################################
	# Reads UTF-8 text on stdin and prints each distinct run of
	# Hiragana/Katakana/Han/ASCII-alphanumeric characters that contains at
	# least one Japanese character, one per line, in byte order.
	# Outputs:   sorted unique words on stdout
	#######################################
	extract_japanese_words() {
	  # \x{30FC} is the prolonged sound mark. It is written as an escape because
	  # a literal multi-byte character inside a character class is read as
	  # separate Latin-1 code points when the Perl source is not under "use utf8".
	  # LC_ALL=C keeps sort from merging distinct strings that a locale's
	  # collation rules consider equal.
	  perl -CS -ne '
	    while (/([\p{Hiragana}\p{Katakana}\p{Han}a-zA-Z0-9\x{30FC}]+)/g) {
	      my $word = $1;
	      # Only print runs that actually contain Japanese.
	      print "$word\n" if $word =~ /[\p{Hiragana}\p{Katakana}\p{Han}]/;
	    }
	  ' | LC_ALL=C sort -u
	}

	#######################################
	# Entry point: walks ".", converts each candidate file to UTF-8 and prints
	# the per-file word report.
	# Globals:   EXTENSIONS (read)
	# Outputs:   JSON-style report on stdout, diagnostics on stderr
	# Returns:   0 on success, 1 if a required tool is missing
	#######################################
	main() {
	  local tool
	  for tool in find sort file iconv perl; do
	    if ! command -v "$tool" >/dev/null 2>&1; then
	      printf 'error: required tool not found: %s\n' "$tool" >&2
	      return 1
	    fi
	  done

	  # Build: -name '*.js' -o -name '*.mjs' -o ...
	  local -a name_tests=()
	  local ext
	  for ext in "${EXTENSIONS[@]}"; do
	    if (( ${#name_tests[@]} > 0 )); then
	      name_tests+=(-o)
	    fi
	    name_tests+=(-name "*.${ext}")
	  done

	  local path encoding words word entry
	  local report="" file_sep="" word_sep
	  # NUL-delimited so that any file name survives intact; sorted so the
	  # report order does not depend on directory traversal order.
	  while IFS= read -r -d '' path; do
	    encoding=$(file -b --mime-encoding "$path" 2>/dev/null) || continue
	    [[ -n "$encoding" ]] || continue

	    # pipefail (local to this subshell) makes a failed conversion, e.g. an
	    # unknown or binary charset, skip the file instead of reporting partial
	    # output.
	    words=$(
	      set -o pipefail
	      iconv -f "$encoding" -t UTF-8 "$path" 2>/dev/null | extract_japanese_words
	    ) || continue
	    [[ -n "$words" ]] || continue

	    entry="\"$(json_escape "$path")\": {"$'\n'
	    word_sep=""
	    # Words consist only of letters, digits and Japanese characters, so they
	    # need no JSON escaping.
	    while IFS= read -r word; do
	      entry+="${word_sep} \"${word}\": []"
	      word_sep=$',\n'
	    done <<<"$words"
	    entry+=$'\n}'

	    report+="${file_sep}${entry}"
	    file_sep=$',\n'
	  done < <(find . -type f \( "${name_tests[@]}" \) -print0 | LC_ALL=C sort -z)

	  # JSON output
	  if [[ -n "$report" ]]; then
	    printf '{\n%s\n}\n' "$report"
	  else
	    printf '{}\n'
	  fi
	}

	main "$@"