init

2025-12-11 02:09:57 +09:00
parent 6c80670b47
commit 8749de6aef
34 changed files with 2115 additions and 0 deletions
--- a/src/main/java/kr/co/ragone/service/ChunkingService.java
+++ b/src/main/java/kr/co/ragone/service/ChunkingService.java
@@ -0,0 +1,202 @@
+package kr.co.ragone.service;
+
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Service;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+/**
+ * Splits raw text into overlapping chunks built from whole sentences.
+ *
+ * <p>All sizes are measured in characters ({@link String#length()}). The chunk size is a soft
+ * target: sentences are packed greedily, and each new chunk is seeded with the tail of the
+ * previous one, so a chunk may end up somewhat longer than the configured size.
+ */
+@Slf4j
+@Service
+public class ChunkingService {
+
+    /**
+     * Minimum chunk length in characters. A buffer shorter than this is never flushed as its own
+     * chunk; a short trailing remainder is merged into the previous chunk instead.
+     */
+    private static final int MIN_CHUNK_SIZE = 50;
+
+    /**
+     * Sentence boundary: whitespace following one of {@code . ! ? 。}, or the position right after
+     * a line feed. Compiled once instead of on every call.
+     */
+    private static final Pattern SENTENCE_BOUNDARY = Pattern.compile("(?<=[.!?。])\\s+|(?<=\\n)");
+
+    /** Runs of spaces and tabs, collapsed to a single space during normalization. */
+    private static final Pattern HORIZONTAL_WHITESPACE = Pattern.compile("[ \\t]+");
+
+    /** Two or more consecutive line feeds, collapsed to exactly two during normalization. */
+    private static final Pattern REPEATED_NEWLINES = Pattern.compile("\\n{2,}");
+
+    /** Any whitespace run; used to tokenize an over-long sentence into words. */
+    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
+
+    /** Target chunk length in characters (property {@code rag.chunk.size}, default 1000). */
+    @Value("${rag.chunk.size:1000}")
+    private int chunkSize;
+
+    /** Characters of the previous chunk repeated at the start of the next one (default 100). */
+    @Value("${rag.chunk.overlap:100}")
+    private int chunkOverlap;
+
+    /**
+     * Splits the given text into chunks.
+     *
+     * <p>The text is normalized first. If the normalized text is no longer than the chunk size it
+     * is returned as a single chunk with index 0. Otherwise it is split into sentences, which are
+     * joined with single spaces and packed into consecutively indexed chunks.
+     *
+     * @param text the text to split; may be {@code null}
+     * @return the chunks in document order; an empty list when {@code text} is {@code null} or blank
+     */
+    public List<ChunkResult> chunkText(String text) {
+        List<ChunkResult> chunks = new ArrayList<>();
+
+        if (text == null || text.isBlank()) {
+            log.warn("빈 텍스트가 입력되었습니다.");
+            return chunks;
+        }
+
+        text = normalizeText(text);
+
+        log.info("청킹 시작: 원본 {}자, 청크 크기 {}, 오버랩 {}",
+                text.length(), chunkSize, chunkOverlap);
+
+        // Short input: no splitting needed.
+        if (text.length() <= chunkSize) {
+            chunks.add(createChunk(text, 0));
+            log.info("텍스트가 짧아 단일 청크로 생성: {}자", text.length());
+            return chunks;
+        }
+
+        List<String> sentences = splitIntoSentences(text);
+        log.debug("문장 {} 개로 분할됨", sentences.size());
+
+        StringBuilder currentChunk = new StringBuilder();
+        // Length of the prefix of currentChunk that was copied from the previous chunk.
+        int overlapPrefixLength = 0;
+        int chunkIndex = 0;
+
+        for (String sentence : sentences) {
+            // Flush the buffer when adding this sentence would exceed the target size,
+            // but never flush a buffer that is still below the minimum chunk size.
+            if (currentChunk.length() + sentence.length() > chunkSize
+                    && currentChunk.length() >= MIN_CHUNK_SIZE) {
+                String finished = currentChunk.toString();
+                chunks.add(createChunk(finished.trim(), chunkIndex++));
+
+                // Seed the next chunk with the tail of the one just emitted.
+                String overlap = getOverlapText(finished);
+                currentChunk = new StringBuilder(overlap);
+                overlapPrefixLength = overlap.length();
+            }
+
+            if (currentChunk.length() > 0 && currentChunk.charAt(currentChunk.length() - 1) != ' ') {
+                currentChunk.append(' ');
+            }
+            currentChunk.append(sentence.trim());
+        }
+
+        if (currentChunk.length() >= MIN_CHUNK_SIZE) {
+            chunks.add(createChunk(currentChunk.toString().trim(), chunkIndex));
+        } else if (currentChunk.length() > 0 && !chunks.isEmpty()) {
+            // Remainder is too short to stand alone: merge it into the previous chunk.
+            // The buffer starts with text copied from that very chunk, so only the part
+            // after the overlap prefix is appended; otherwise the overlap would be duplicated.
+            String freshTail = currentChunk.substring(overlapPrefixLength).trim();
+            if (!freshTail.isEmpty()) {
+                ChunkResult lastChunk = chunks.get(chunks.size() - 1);
+                String merged = lastChunk.getContent() + " " + freshTail;
+                chunks.set(chunks.size() - 1, createChunk(merged, lastChunk.getIndex()));
+            }
+        } else if (currentChunk.length() > 0) {
+            chunks.add(createChunk(currentChunk.toString().trim(), chunkIndex));
+        }
+
+        log.info("청킹 완료: {} 청크 생성", chunks.size());
+        return chunks;
+    }
+
+    /**
+     * Normalizes whitespace: collapses runs of spaces/tabs to one space, collapses runs of two or
+     * more line feeds to exactly two, and trims the result.
+     */
+    private String normalizeText(String text) {
+        String collapsed = HORIZONTAL_WHITESPACE.matcher(text).replaceAll(" ");
+        return REPEATED_NEWLINES.matcher(collapsed).replaceAll("\n\n").trim();
+    }
+
+    /**
+     * Splits text into trimmed, non-empty sentences.
+     *
+     * <p>A sentence longer than the chunk size is further broken into pieces by
+     * {@link #splitByLength(String, int)}, so that one unpunctuated run of text cannot produce a
+     * chunk many times larger than the target. This applies to every sentence, not only when the
+     * whole text failed to split.
+     */
+    private List<String> splitIntoSentences(String text) {
+        List<String> sentences = new ArrayList<>();
+        // Pieces of about half a chunk leave room for the overlap prefix; never go below the
+        // minimum chunk size so a tiny configured size cannot shred the text.
+        int pieceLimit = Math.max(MIN_CHUNK_SIZE, chunkSize / 2);
+
+        for (String part : SENTENCE_BOUNDARY.split(text)) {
+            String trimmed = part.trim();
+            if (trimmed.isEmpty()) {
+                continue;
+            }
+            // A non-positive chunk size is a misconfiguration; do not split on it.
+            if (chunkSize > 0 && trimmed.length() > chunkSize) {
+                sentences.addAll(splitByLength(trimmed, pieceLimit));
+            } else {
+                sentences.add(trimmed);
+            }
+        }
+
+        return sentences;
+    }
+
+    /**
+     * Breaks text into pieces of at most {@code maxChars} characters, cutting at whitespace.
+     *
+     * <p>Words are re-joined with single spaces. A single word longer than {@code maxChars}
+     * (for example text written without spaces) is cut into fixed-size pieces.
+     *
+     * @param text     the over-long sentence to break up
+     * @param maxChars maximum piece length in characters; must be at least 1
+     */
+    private List<String> splitByLength(String text, int maxChars) {
+        List<String> pieces = new ArrayList<>();
+        StringBuilder current = new StringBuilder();
+
+        for (String word : WHITESPACE.split(text)) {
+            if (word.isEmpty()) {
+                // Leading whitespace yields an empty first token.
+                continue;
+            }
+            // +1 accounts for the joining space.
+            if (current.length() > 0 && current.length() + 1 + word.length() > maxChars) {
+                pieces.add(current.toString());
+                current.setLength(0);
+            }
+            if (word.length() > maxChars) {
+                // The buffer is empty here: any pending content was flushed just above.
+                appendHardSplit(word, maxChars, pieces);
+                continue;
+            }
+            if (current.length() > 0) {
+                current.append(' ');
+            }
+            current.append(word);
+        }
+
+        if (current.length() > 0) {
+            pieces.add(current.toString());
+        }
+
+        return pieces;
+    }
+
+    /**
+     * Cuts one whitespace-free token into pieces of about {@code maxChars} characters and appends
+     * them to {@code out}. A cut is shifted by one position when it would separate a surrogate pair.
+     */
+    private void appendHardSplit(String word, int maxChars, List<String> out) {
+        int start = 0;
+        while (start < word.length()) {
+            int end = Math.min(word.length(), start + maxChars);
+            if (end < word.length() && Character.isHighSurrogate(word.charAt(end - 1))) {
+                // Move the cut before the pair, or past it if that would make no progress.
+                end = (end - 1 > start) ? end - 1 : end + 1;
+            }
+            out.add(word.substring(start, end));
+            start = end;
+        }
+    }
+
+    /** Builds a {@link ChunkResult} for the given content and index with an estimated token count. */
+    private ChunkResult createChunk(String content, int index) {
+        return ChunkResult.builder()
+                .content(content)
+                .index(index)
+                .tokenCount(estimateTokenCount(content))
+                .build();
+    }
+
+    /**
+     * Returns the tail of a finished chunk that is repeated at the start of the next chunk.
+     *
+     * <p>At most {@code chunkOverlap} characters are taken; the text is returned whole when it is
+     * not longer than that. The leading partial word, up to and including the first space, is
+     * dropped so that the overlap starts on a word boundary.
+     */
+    private String getOverlapText(String text) {
+        // A negative configured overlap would make substring() throw; treat it as no overlap.
+        int overlapLength = Math.max(0, chunkOverlap);
+        if (text.length() <= overlapLength) {
+            return text;
+        }
+
+        String overlap = text.substring(text.length() - overlapLength);
+        int spaceIndex = overlap.indexOf(' ');
+        if (spaceIndex >= 0) {
+            overlap = overlap.substring(spaceIndex + 1);
+        }
+        return overlap;
+    }
+
+    /**
+     * Estimates the token count of a text with a rough heuristic: about one token per 1.5 Hangul
+     * characters and one token per 4 other characters. The result is truncated toward zero.
+     */
+    private int estimateTokenCount(String text) {
+        int koreanChars = 0;
+        int otherChars = 0;
+
+        for (char c : text.toCharArray()) {
+            if (Character.UnicodeScript.of(c) == Character.UnicodeScript.HANGUL) {
+                koreanChars++;
+            } else {
+                otherChars++;
+            }
+        }
+
+        // 4.0, not 4: integer division would drop the fractional part before the sum.
+        return (int) (koreanChars / 1.5 + otherChars / 4.0);
+    }
+
+    /** One chunk of text produced by {@link ChunkingService#chunkText(String)}. */
+    @lombok.Data
+    @lombok.Builder
+    public static class ChunkResult {
+        /** The chunk text. */
+        private String content;
+        /** Zero-based position of the chunk in the output list. */
+        private int index;
+        /** Heuristic token estimate for {@link #content}. */
+        private int tokenCount;
+    }
+}