// ragone-backend/src/main/java/kr/co/ragone/service/ChunkingService.java

package kr.co.ragone.service;

import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Splits document text into overlapping, sentence-aligned chunks.
 *
 * <p>All sizes are measured in characters (UTF-16 code units). The configured chunk size is a
 * soft target: after a chunk is flushed, the carried-over overlap plus the next sentence are
 * appended without a second size check, so a chunk can exceed the target by roughly the overlap
 * length.
 */
@Slf4j
@Service
public class ChunkingService {

    /** A chunk is only flushed, or emitted on its own at the end, once it is at least this long. */
    private static final int MIN_CHUNK_SIZE = 50;

    /** Content shorter than this is treated as too short to be useful by itself. */
    private static final int MIN_USEFUL_LENGTH = 30;

    /** More than this many "..." leader runs marks the content as table-of-contents text. */
    private static final int MAX_LEADER_RUNS = 3;

    /** Dot leader as used in table-of-contents lines. */
    private static final String LEADER = "...";

    /** Splits after . ! ? 。 followed by whitespace, and after every line break. */
    private static final Pattern SENTENCE_BOUNDARY = Pattern.compile("(?<=[.!?。])\\s+|(?<=\\n)");

    private static final Pattern HORIZONTAL_WHITESPACE = Pattern.compile("[ \\t]+");

    private static final Pattern BLANK_LINES = Pattern.compile("\\n{2,}");

    @Value("${rag.chunk.size:1000}")
    private int chunkSize;

    @Value("${rag.chunk.overlap:100}")
    private int chunkOverlap;

    /**
     * Splits text into chunks.
     *
     * <p>Text that fits into one chunk is returned as a single chunk. Longer text is split into
     * sentences which are packed into chunks; each emitted chunk donates its tail as overlap to
     * the next one. Chunks recognised as table-of-contents / cover boilerplate are dropped.
     *
     * @param text raw text; may be {@code null} or blank
     * @return the chunks in document order with consecutive indexes starting at 0; empty when
     *         {@code text} is {@code null} or blank
     */
    public List<ChunkResult> chunkText(String text) {
        List<ChunkResult> chunks = new ArrayList<>();

        if (text == null || text.isBlank()) {
            log.warn("빈 텍스트가 입력되었습니다.");
            return chunks;
        }

        text = normalizeText(text);

        log.info("청킹 시작: 원본 {}자, 청크 크기 {}, 오버랩 {}",
                text.length(), chunkSize, chunkOverlap);

        // Short text: a single chunk, no filtering.
        if (text.length() <= chunkSize) {
            chunks.add(createChunk(text, 0));
            log.info("텍스트가 짧아 단일 청크로 생성: {}자", text.length());
            return chunks;
        }

        List<String> sentences = splitIntoSentences(text);
        log.debug("문장 {} 개로 분할됨", sentences.size());

        StringBuilder currentChunk = new StringBuilder();
        // Number of leading characters of currentChunk that were copied from the previous
        // emitted chunk; needed so a final merge does not repeat them.
        int overlapLength = 0;
        int chunkIndex = 0;

        for (String sentence : sentences) {
            boolean wouldOverflow = currentChunk.length() + sentence.length() > chunkSize;
            if (wouldOverflow && currentChunk.length() >= MIN_CHUNK_SIZE) {
                String finished = currentChunk.toString();
                currentChunk.setLength(0);

                if (isUselessChunk(finished)) {
                    // Start the next chunk clean: carrying overlap out of a discarded chunk would
                    // leak its markers / dot leaders and could get the next chunk discarded too.
                    logSkipped(finished);
                } else {
                    chunks.add(createChunk(finished.trim(), chunkIndex++));
                    currentChunk.append(getOverlapText(finished));
                }
                overlapLength = currentChunk.length();
            }

            if (currentChunk.length() > 0 && currentChunk.charAt(currentChunk.length() - 1) != ' ') {
                currentChunk.append(' ');
            }
            currentChunk.append(sentence);
        }

        // Handle whatever is left after the last sentence.
        String tail = currentChunk.toString().trim();
        if (!tail.isEmpty()) {
            if (chunks.isEmpty()) {
                // Never return an empty result for non-blank input.
                chunks.add(createChunk(tail, chunkIndex));
            } else if (isBoilerplate(tail)) {
                logSkipped(tail);
            } else if (tail.length() >= MIN_CHUNK_SIZE) {
                chunks.add(createChunk(tail, chunkIndex));
            } else {
                // Too short to stand alone: append only the part that is not already present in
                // the previous chunk as overlap.
                String fresh = currentChunk
                        .substring(Math.min(overlapLength, currentChunk.length()))
                        .trim();
                if (!fresh.isEmpty()) {
                    ChunkResult lastChunk = chunks.get(chunks.size() - 1);
                    String merged = lastChunk.getContent() + " " + fresh;
                    chunks.set(chunks.size() - 1, createChunk(merged, lastChunk.getIndex()));
                }
            }
        }

        log.info("청킹 완료: {} 청크 생성", chunks.size());
        return chunks;
    }

    /** Logs the first 50 characters of a chunk that is being dropped as boilerplate. */
    private void logSkipped(String content) {
        log.debug("목차/표지 청크 건너뛰기: {}", content.substring(0, Math.min(50, content.length())));
    }

    /**
     * Normalises whitespace: collapses runs of spaces/tabs to one space, collapses runs of two or
     * more line feeds to exactly two, and trims the result.
     */
    private String normalizeText(String text) {
        String collapsed = HORIZONTAL_WHITESPACE.matcher(text).replaceAll(" ");
        return BLANK_LINES.matcher(collapsed).replaceAll("\n\n").trim();
    }

    /**
     * Returns whether content should not be emitted as a chunk: {@code null}, shorter than
     * {@link #MIN_USEFUL_LENGTH}, or table-of-contents / cover boilerplate.
     */
    private boolean isUselessChunk(String content) {
        return content == null || content.length() < MIN_USEFUL_LENGTH || isBoilerplate(content);
    }

    /**
     * Returns whether content looks like a table-of-contents, cover or revision-history page:
     * it contains one of the known page markers or more than {@link #MAX_LEADER_RUNS} dot leaders.
     */
    private boolean isBoilerplate(String content) {
        // The markers are Hangul, which has no letter case, so no case folding is needed.
        if (content.contains("목차 페이지입니다")
                || content.contains("표지 페이지입니다")
                || content.contains("개정이력 페이지입니다")) {
            return true;
        }
        return countOccurrences(content, LEADER) > MAX_LEADER_RUNS;
    }

    /**
     * Counts non-overlapping occurrences of {@code needle}, scanning left to right. Unlike
     * {@code split(...).length - 1}, occurrences at the very end of the text are counted too.
     */
    private static int countOccurrences(String haystack, String needle) {
        int count = 0;
        int at = haystack.indexOf(needle);
        while (at >= 0) {
            count++;
            at = haystack.indexOf(needle, at + needle.length());
        }
        return count;
    }

    /**
     * Splits text into trimmed, non-empty sentences. A sentence longer than the chunk size is
     * broken up further at word boundaries so that no single piece exceeds the chunk size.
     */
    private List<String> splitIntoSentences(String text) {
        List<String> sentences = new ArrayList<>();
        // Leave room for the overlap that may be prepended to a piece.
        int pieceLimit = Math.max(1, chunkSize - Math.max(0, chunkOverlap));

        for (String part : SENTENCE_BOUNDARY.split(text)) {
            String trimmed = part.trim();
            if (trimmed.isEmpty()) {
                continue;
            }
            if (trimmed.length() > chunkSize) {
                sentences.addAll(splitByWords(trimmed, pieceLimit));
            } else {
                sentences.add(trimmed);
            }
        }

        return sentences;
    }

    /**
     * Splits text at whitespace into pieces of at most {@code maxChars} characters. Words are
     * re-joined with single spaces; a word longer than {@code maxChars} (for example text written
     * without spaces) is cut into fixed-size slices.
     *
     * @param maxChars maximum piece length in characters; must be at least 1
     */
    private List<String> splitByWords(String text, int maxChars) {
        List<String> pieces = new ArrayList<>();
        StringBuilder current = new StringBuilder();

        for (String word : text.trim().split("\\s+")) {
            if (word.isEmpty()) {
                continue;
            }
            if (current.length() > 0 && current.length() + 1 + word.length() > maxChars) {
                pieces.add(current.toString());
                current.setLength(0);
            }
            if (word.length() > maxChars) {
                // current is empty here: the flush above always fires for an over-long word.
                appendSlices(pieces, word, maxChars);
                continue;
            }
            if (current.length() > 0) {
                current.append(' ');
            }
            current.append(word);
        }

        if (current.length() > 0) {
            pieces.add(current.toString());
        }
        return pieces;
    }

    /** Cuts {@code word} into slices of about {@code maxChars} without splitting surrogate pairs. */
    private static void appendSlices(List<String> out, String word, int maxChars) {
        int start = 0;
        while (start < word.length()) {
            int end = Math.min(word.length(), start + maxChars);
            if (end < word.length() && Character.isHighSurrogate(word.charAt(end - 1))) {
                // Keep both halves of the pair together; grow by one when shrinking would
                // leave an empty slice.
                end = (end - 1 > start) ? end - 1 : end + 1;
            }
            out.add(word.substring(start, end));
            start = end;
        }
    }

    /** Builds a chunk result with the given content and index and an estimated token count. */
    private ChunkResult createChunk(String content, int index) {
        return ChunkResult.builder()
                .content(content)
                .index(index)
                .tokenCount(estimateTokenCount(content))
                .build();
    }

    /**
     * Returns the tail of {@code text} to repeat at the start of the next chunk: at most the
     * configured overlap length, shortened so that it starts at a word boundary when the window
     * contains a space. Returns the whole text when it is not longer than the overlap, and an
     * empty string when the configured overlap is zero or negative.
     */
    private String getOverlapText(String text) {
        if (chunkOverlap <= 0) {
            return "";
        }
        if (text.length() <= chunkOverlap) {
            return text;
        }

        String window = text.substring(text.length() - chunkOverlap);
        int spaceIndex = window.indexOf(' ');
        if (spaceIndex >= 0) {
            // Drop the (possibly partial) first word.
            return window.substring(spaceIndex + 1);
        }
        // No word boundary in the window; at least avoid starting on half a surrogate pair.
        if (Character.isLowSurrogate(window.charAt(0))) {
            return window.substring(1);
        }
        return window;
    }

    /**
     * Rough token estimate: about 1.5 Hangul characters per token and about 4 characters per
     * token for everything else, truncated to an int.
     */
    private int estimateTokenCount(String text) {
        int koreanChars = 0;
        int otherChars = 0;

        for (int i = 0; i < text.length(); i++) {
            if (Character.UnicodeScript.of(text.charAt(i)) == Character.UnicodeScript.HANGUL) {
                koreanChars++;
            } else {
                otherChars++;
            }
        }

        // 4.0, not 4: integer division would truncate the non-Hangul share before the sum.
        return (int) (koreanChars / 1.5 + otherChars / 4.0);
    }

    /** One chunk of a document: its text, its position in the document and its token estimate. */
    @lombok.Data
    @lombok.Builder
    public static class ChunkResult {
        private String content;
        private int index;
        private int tokenCount;
    }
}