gorean icon indicating copy to clipboard operation
gorean copied to clipboard

korean analyzer utility tools

daangn GitHub release Github all releases GitHub contributors MIT license

🇰🇷 Gorean

golang native๋กœ ์ž‘์„ฑ๋œ ํ•œ๊ธ€ ๋ถ„์„ ์œ ํ‹ธ๋ฆฌํ‹ฐ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ์ž…๋‹ˆ๋‹ค. ๊ธฐ๋ณธ์ ์œผ๋กœ ruby 'korean-string'๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๋ฅผ golang์œผ๋กœ ํฌํŒ…ํ•œ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ์ด๋ฉฐ, ๊ทธ ์ด์™ธ ํ•œ๊ธ€ ๋ถ„์„ ์œ ํ‹ธ๋ฆฌํ‹ฐ ๋„๊ตฌ๋“ค์„ ์ค€๋น„ํ–ˆ์Šต๋‹ˆ๋‹ค. ํ•ด๋‹น ๋„๊ตฌ๋Š” ํ•œ๊ธ€๊ฒ€์ƒ‰์— ์‚ฌ์šฉ๋˜๋Š” ํ•œ๊ธ€๋ถ„์„ ์œ ํ‹ธ๋ฆฌํ‹ฐ๋ฅผ ๋ชจ์•„๋‘˜ ์˜ˆ์ •์ž…๋‹ˆ๋‹ค.

라이브러리의 코드는 bada-ie 님의 코드를 참고하여 구현하였으며, 자세한 한글 인코딩 관련 정보는 w3c-hangul-i18n에서 열람하실 수 있습니다.

๐Ÿ— Speed Cheat Sheet

package main

import (
	"fmt"
	"strings"

	"github.com/daangn/gorean"
)

func main() {
	s := "  ๋˜ ๋นต๊ฐํ•˜ 5ri๊ตฌ์ด-  "
	sk, err := gorean.Split(s, gorean.SplitOptBasic)
	if err != nil {
		fmt.Println(err)
	} else {
		fmt.Println(sk)
		// second parameter[onlyKorean]: gorean.SplitOptBasic
		// [[ ] [ ] [ใ„ธ ใ…— ใ…] [ใ…ƒ ใ… ใ…‡] [ใ„ฑ ใ… ใ„ฑ] [ใ…Ž ใ…] [ ] [5] [r] [i] [ใ„ฑ ใ…œ] [ใ…‡ ใ…ฃ] [-] [ ] [ ]]
		// second parameter[onlyKorean]: gorean.SplitOptGetOnlyKorean
		// [[ใ„ธ ใ…— ใ…] [ใ…ƒ ใ… ใ…‡] [ใ„ฑ ใ… ใ„ฑ] [ใ…Ž ใ…] [ใ„ฑ ใ…œ] [ใ…‡ ใ…ฃ]]
	}

	var jt1 []string
	var jt2 []string
	for _, tokens := range sk {
		// case: 1
		if character, err := gorean.JoinTokens(tokens); err != nil {
			fmt.Println(err)
			/*
				( ) has been out-ranged tokens for JoinKorean
				( ) has been out-ranged tokens for JoinKorean
				( ) has been out-ranged tokens for JoinKorean
				(5) has been out-ranged tokens for JoinKorean
				(r) has been out-ranged tokens for JoinKorean
				(i) has been out-ranged tokens for JoinKorean
				(-) has been out-ranged tokens for JoinKorean
				( ) has been out-ranged tokens for JoinKorean
				( ) has been out-ranged tokens for JoinKorean
			*/
		} else {
			jt1 = append(jt1, character)
		}

		// case: 2
		if gorean.IsAbleToComposeAlphabetsForSingleCharacter(tokens) {
			character, _ := gorean.JoinTokens(tokens)
			jt2 = append(jt2, character)
		} else {
			noneKoreanToken := gorean.FindNoneKoreanAlphabetsForSingleCharacter(tokens)
			// you should write to something for exception existing none korean tokens
			fmt.Printf("Error! positions [%v] at [%v]\n", tokens, noneKoreanToken)
			/*
				Error! positions [[ ]] at [[0]]
				Error! positions [[ ]] at [[0]]
				Error! positions [[ ]] at [[0]]
				Error! positions [[5]] at [[0]]
				Error! positions [[r]] at [[0]]
				Error! positions [[i]] at [[0]]
				Error! positions [[-]] at [[0]]
				Error! positions [[ ]] at [[0]]
				Error! positions [[ ]] at [[0]]
			*/
		}
	}
	fmt.Printf("jt1 output => %s\n", strings.Join(jt1, "")) // ๋˜ ๋นต๊ฐํ•˜๊ตฌ์ด
	fmt.Printf("jt2 output => %s\n", strings.Join(jt2, "")) // ๋˜ ๋นต๊ฐํ•˜๊ตฌ์ด

	if edgeNGram, err := gorean.GenerateEdgeNGramTokens(s); err != nil {
		fmt.Println(err)
	} else {
		fmt.Println(edgeNGram)
		/*
			// Warning: Including whitespace each items at []string, It didn't Trim
			[       ใ„ท   ใ„ธ   ๋˜   ๋˜    ๋˜ ใ…‚   ๋˜ ใ…ƒ   ๋˜ ๋น    ๋˜ ๋นต   ๋˜ ๋นตใ„ฑ   ๋˜ ๋นต๊ฐ€   ๋˜ ๋นต๊ฐ   ๋˜ ๋นต๊ฐใ…Ž   ๋˜ ๋นต๊ฐํ•˜   ๋˜ ๋นต๊ฐํ•˜    ๋˜ ๋นต๊ฐํ•˜ 5   ๋˜ ๋นต๊ฐํ•˜ 5r   ๋˜ ๋นต๊ฐํ•˜ 5ri   ๋˜ ๋นต๊ฐํ•˜ 5riใ„ฑ   ๋˜ ๋นต๊ฐํ•˜ 5ri๊ตฌ   ๋˜ ๋นต๊ฐํ•˜ 5ri๊ถ   ๋˜ ๋นต๊ฐํ•˜ 5ri๊ตฌใ…‡   ๋˜ ๋นต๊ฐํ•˜ 5ri๊ตฌ์ด   ๋˜ ๋นต๊ฐํ•˜ 5ri๊ตฌ์ด-   ๋˜ ๋นต๊ฐํ•˜ 5ri๊ตฌ์ด-    ๋˜ ๋นต๊ฐํ•˜ 5ri๊ตฌ์ด-  ]
		*/
	}

	messKoreanSort := []string{
		"ํ•˜๊ธฐ์Šค",
		"๊น€์น˜๋ณถ์Œ๋ฐฅ",
		"์‚ฌ์ž์™•์™•",
		"์ž๋ฃจ์†Œ๋ฐ”์˜ค์ด์‹œ",
		"์™•์ดˆ",
		"๋ฐฅ์ƒ๋จธ๋ฆฌ",
		"๊นŒ์น˜๊พธ์น˜",
		"๋งˆ์žฅ๋™",
		"๋™๋ฐฑ",
	}
	gorean.Sort(messKoreanSort, gorean.SortOptAsc)
	fmt.Println(messKoreanSort) // [๊น€์น˜๋ณถ์Œ๋ฐฅ ๊นŒ์น˜๊พธ์น˜ ๋™๋ฐฑ ๋งˆ์žฅ๋™ ๋ฐฅ์ƒ๋จธ๋ฆฌ ์‚ฌ์ž์™•์™• ์™•์ดˆ ์ž๋ฃจ์†Œ๋ฐ”์˜ค์ด์‹œ ํ•˜๊ธฐ์Šค]

	koreanWithEnglish := "์ดˆ์„ฑํ€ด์ฆˆ with English"
	korean := gorean.Korean(koreanWithEnglish, 10)
	fmt.Println(korean) // ์ดˆ์„ฑํ€ด์ฆˆ

	chosung, _ := gorean.Chosung(strings.Join(korean, " "))
	fmt.Println(strings.Join(chosung, "")) // ใ…Šใ……ใ…‹ใ…ˆ
}

๐Ÿฑ API Summary

- func gorean.Split(string, SplitOpt)
    - type SplitOpt gorean.SplitOptBasic
    - type SplitOpt gorean.SplitOptGetOnlyKorean
- func gorean.JoinTokens([]string) // 2 <= len(ary) <= 3
- func gorean.IsAbleToComposeAlphabetsForSingleCharacter([]string) // 2 <= len(ary) <= 3
- func gorean.FindNoneKoreanAlphabetsForSingleCharacter([]string) // 2 <= len(ary) <= 3
- func gorean.GenerateEdgeNGramTokens(string)
- func gorean.Sort([]string, SortOpt)
    - type SortOpt gorean.SortOptAsc
    - type SortOpt gorean.SortOptDesc

gorean.Split

  • ์ž…๋ ฅ๊ฐ’์œผ๋กœ ์žฅ๋ฌธ์˜ ๋ฌธ์ž์—ด์„ ๋ฐ›์„ ์ˆ˜ ์žˆ์œผ๋ฉฐ, ๊ฐ๊ฐ ํ•œ๊ธ€์ž์— ๋”ฐ๋ฅธ 2๊ฐœ~3๊ฐœ์˜ elements๋กœ ๋˜์–ด์žˆ๋Š” ์ž๋ชจ ๋ฐฐ์—ด์ด ๋‚˜์˜ค๊ฒŒ ๋˜๋ฉฐ, ๊ฒฐ๊ณผ๊ฐ’์€ ์ด์ค‘๋ฐฐ์—ด์ด ๋‚˜์˜ค๊ฒŒ ๋œ๋‹ค.

gorean.JoinTokens

  • ์ž…๋ ฅ๊ฐ’์œผ๋กœ ์ฃผ์–ด์งˆ ๋ฐฐ์—ด 2๊ฐœ~3๊ฐœ์˜ string๋“ค์ด ๋˜์–ด์žˆ๋Š” ๋ฐฐ์—ด์„ ์กฐํ•ฉํ•ด์„œ ํ•˜๋‚˜์˜ ํ•œ๊ธ€ ์กฐํ•ฉ๋ฌธ์ž์—ด์„ ๋งŒ๋“ ๋‹ค.

gorean.IsAbleToComposeAlphabetsForSingleCharacter

  • ์ž…๋ ฅ๊ฐ’์œผ๋กœ ์ฃผ์–ด์งˆ ๋ฐฐ์—ด 2๊ฐœ~3๊ฐœ์˜ string๋“ค์ด ๋˜์–ด์žˆ๋Š” ๋ฐฐ์—ด์ด ํ•œ๊ตญ์–ด ์กฐํ•ฉ๊ธ€์ž๊ฐ€ ๋  ์ˆ˜ ์žˆ๋Š”์ง€ ์— ๋Œ€ํ•œ ๊ฒ€์ฆ์ฝ”๋“œ
  • ๋””๋ฒ„๊น… ๋ชฉ์ ์œผ๋กœ ๋งŒ๋“ฌ, JoinTokens๋ฅผ ํ•˜๊ธฐ ์ด์ „์— ์ฒดํฌํ•ด๋ณด๊ณ  ๋„˜์–ด ๊ฐ€๋ผ๊ณ  ๋งŒ๋“ฌ

gorean.FindNoneKoreanAlphabetsForSingleCharacter

  • ์ž…๋ ฅ๊ฐ’์œผ๋กœ ์ฃผ์–ด์งˆ ๋ฐฐ์—ด 2๊ฐœ~3๊ฐœ์˜ string๋“ค์ด ๋˜์–ด์žˆ๋Š” ๋ฐฐ์—ด์— ํ•œ๊ธ€ ์ž๋ชจ๊ฐ€ ์•„๋‹Œ ๊ธ€์ž๊ฐ€ ํฌํ•จ๋˜์–ด์žˆ๋Š”์ง€ ํ™•์ธํ•˜๋Š” ๋””๋ฒ„๊น… ์ฝ”๋“œ
  • ๋””๋ฒ„๊น… ๋ชฉ์ ์œผ๋กœ ๋งŒ๋“ฌ, IsAbleToComposeAlphabetsForSingleCharacter์—์„œ false์ผ ๋•Œ์— ์‚ฌ์šฉํ•˜๋„๋ก ์˜๋„

gorean.GenerateEdgeNGramTokens

  • 한글 EdgeNGram 및 전방일치 토크나이저를 얻기위해 만듬
  • 강남역 => [ㄱ,가,강,강ㄴ,강나,강남,강남ㅇ,강남여,강남역]

gorean.Sort

  • 문자열 정렬을 위해 존재함. 정렬의 다양한 옵션 제공

📝 Release note

  • v0.0.5 [Latest Release]
    1. 기능추가 초성 얻기, Chosung()
    2. 기능추가 한글 찾기, Korean()
  • v0.0.6 [ToDo]
    1. 기능추가 영어자판 to 한글
    2. 기능추가 한글자판 to 영어자판
  • v0.0.7
    1. 기능업그레이드: 한글 찾기, [flash-text](https://medium.com/@jwyeom63/%EB%B2%88%EC%97%AD-%EC%A0%95%EA%B7%9C%ED%91%9C%ED%98%84%EC%8B%9D%EC%9C%BC%EB%A1%9C-5%EC%9D%BC-%EA%B1%B8%EB%A6%AC%EB%8A%94-%EC%9E%91%EC%97%85-15%EB%B6%84%EB%A7%8C%EC%97%90-%EB%81%9D%EB%82%B4%EA%B8%B0-2e615a907048) 알고리즘을 이용해 한글찾기 개선
  • v0.1.1 [ToDo]
    1. benchmark 검증 및 성능 최적화

๐Ÿ‘ Contribute

  • @drakejin 은 부족한게 많습니다. 피드백은 언제나 환영이에요.
  • 뭔가 추가 해줬으면 하는게 있거나, 변경했으면 하는게 있으면 언제든지 이슈 및 PR 보내주세요. 특별한 일 없으면 2일안에 반영해드릴게요.

๋งŒ์•ฝ ์ด ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ daangn/gorean์„ ์ž˜ ์‚ฌ์šฉํ•˜์…จ๋‹ค๋ฉด...

  1. GitHub Star
  2. 이직각을 재고 계시다면 🥕당근마켓🥕