Spaces:

soiz1
/

s4s-editor

Build error

s4s-editor / detection_ja.sh

Update detection_ja.sh

95dd954 verified about 1 month ago

1.67 kB

	#!/bin/bash
	#
	# detection_ja.sh - list Japanese-containing tokens found in text-like files.
	#
	# Walks the current directory tree, converts every file whose extension is in
	# `extensions` to UTF-8, extracts tokens that contain Hiragana, Katakana or Han
	# characters, and prints one JSON object to stdout:
	#
	#   {
	#   "<path>": {
	#    "<token>": [],
	#    ...
	#   },
	#   ...
	#   }
	#
	# Prints `{}` when nothing was found.
	#
	# External tools used: find (with -regextype), file, iconv, grep -P, perl, sort.
	# NOTE(review): grep -P's \p{...} classes presumably only match multi-byte text
	# when the script runs under a UTF-8 locale - confirm in the target environment.

	# Target file extensions, written as an ERE alternation for `find -regex`.
	readonly extensions='js|mjs|jsx|json|txt|html|htm|json5|md|css'

	#######################################
	# Escape a string for use inside a JSON string literal.
	# Arguments: $1 - raw string
	# Outputs:   escaped string on stdout (no trailing newline)
	#######################################
	json_escape() {
	  local s=$1
	  # Backslashes first, otherwise the escapes added below get doubled.
	  s=${s//\\/\\\\}
	  s=${s//\"/\\\"}
	  s=${s//$'\n'/\\n}
	  s=${s//$'\r'/\\r}
	  s=${s//$'\t'/\\t}
	  printf '%s' "$s"
	}

	#######################################
	# Print the charset that `file -bi` reports for a file.
	# Arguments: $1 - path to inspect
	# Outputs:   charset name on stdout (nothing if no "charset=" is reported)
	#######################################
	detect_encoding() {
	  file -bi "$1" | sed -n 's/.*charset=\(.*\)$/\1/p'
	}

	#######################################
	# Extract the unique Japanese-containing tokens from UTF-8 text.
	# Inputs:  UTF-8 text on stdin
	# Outputs: sorted, de-duplicated tokens on stdout, one per line
	#######################################
	extract_tokens() {
	  # Stage 1 (grep -o): pick candidate runs - one non-blank character, one
	  #   Japanese or ASCII letter, at least one Japanese character, then the
	  #   rest of the non-blank run.
	  # Stage 2 (perl): break each candidate at every run of characters that are
	  #   neither Japanese nor ASCII letters. -l chomps the input record and
	  #   re-appends the newline on output; without it the trailing-whitespace
	  #   trim below would also delete the record's newline and glue the last
	  #   token of one candidate onto the first token of the next.
	  # Stage 3 (grep): keep only tokens that contain a Japanese character.
	  grep -oP '[^\s[:cntrl:]][\p{Hiragana}\p{Katakana}\p{Han}a-zA-Z][\p{Hiragana}\p{Katakana}\p{Han}]+[^\s[:cntrl:]]*' \
	    | perl -CSD -lpe '
	        s/[^\p{Han}\p{Hiragana}\p{Katakana}a-zA-Z]+/\n/g;
	        s/^\s+|\s+$//g;
	      ' \
	    | grep -P '[\p{Hiragana}\p{Katakana}\p{Han}]' \
	    | sort -u
	}

	#######################################
	# Render one `"<path>": { "<token>": [], ... }` JSON member.
	# Arguments: $1 - file path (used as the key)
	#            $2 - newline-separated tokens (must be non-empty)
	# Outputs:   the member on stdout, without a trailing comma or newline
	#######################################
	format_entry() {
	  local path=$1 tokens=$2
	  local token sep=''
	  printf '"%s": {\n' "$(json_escape "$path")"
	  while IFS= read -r token; do
	    # The separator goes in front of every token but the first, so no
	    # trailing comma ever has to be stripped afterwards.
	    printf '%s "%s": []' "$sep" "$(json_escape "$token")"
	    sep=$',\n'
	  done <<< "$tokens"
	  printf '\n}'
	}

	#######################################
	# Scan the current directory tree and print the JSON report.
	# Globals: extensions (read)
	# Outputs: JSON object on stdout
	#######################################
	main() {
	  local -a entries=()
	  local file encoding content tokens
	  local pattern=".*\.(${extensions})\$"

	  # NUL-delimited names survive blanks and newlines in paths; process
	  # substitution keeps the loop in this shell so `entries` is retained.
	  while IFS= read -r -d '' file; do
	    # Detect the charset, then convert once. Files that cannot be converted
	    # to UTF-8 (binary data, unknown charset) are skipped on purpose, hence
	    # the discarded stderr.
	    encoding=$(detect_encoding "$file")
	    content=$(iconv -f "$encoding" -t UTF-8 "$file" 2>/dev/null) || continue

	    tokens=$(printf '%s\n' "$content" | extract_tokens)

	    # Only files that produced at least one token appear in the report.
	    if [[ -n "$tokens" ]]; then
	      entries+=("$(format_entry "$file" "$tokens")")
	    fi
	  done < <(find . -type f -regextype posix-extended -regex "$pattern" -print0)

	  # Emit the collected members as one JSON object.
	  if (( ${#entries[@]} == 0 )); then
	    printf '{}\n'
	    return 0
	  fi

	  local i last=$(( ${#entries[@]} - 1 ))
	  printf '{\n'
	  for i in "${!entries[@]}"; do
	    if (( i < last )); then
	      printf '%s,\n' "${entries[i]}"
	    else
	      printf '%s\n' "${entries[i]}"
	    fi
	  done
	  printf '}\n'
	}

	main "$@"