# HWP5 본문 텍스트 추출 — OLE 스트림 → zlib 해제 → PARA_TEXT 레코드 파싱
import sys, zlib, struct
import olefile

# Control codes (values below 32) that can appear in a PARA_TEXT payload,
# grouped by how many 16-bit code units each one occupies in the stream.
# Codes that occupy 8 code units (the code itself plus 7 more units).
EIGHT = {1,2,3,4,5,6,7,8,9,11,12,14,15,16,17,18,19,20,21,22,23}
# Single-unit codes that are rendered as a newline.
ONE_NL = {10,13}
# Single-unit codes that produce no output.
ONE_OTHER = {0,24,25,26,27,28,29,30,31}

def parse_para_text(data):
    """Decode the payload of one PARA_TEXT record into a Python string.

    The payload is read as a sequence of little-endian 16-bit code units;
    a trailing odd byte is ignored.

    * Codes in ``EIGHT`` are skipped together with the 7 units that follow
      them. Code 9 (tab in the HWP 5.0 control-character table) additionally
      emits ``'\\t'`` so tab-separated words are not glued together.
    * Codes in ``ONE_NL`` become ``'\\n'``; codes in ``ONE_OTHER`` are dropped.
    * Everything else is text. A high surrogate followed by a low surrogate
      is combined into a single code point; an unpaired surrogate is replaced
      by U+FFFD so the result can always be encoded (e.g. by ``print``).

    Args:
        data: bytes-like payload of the record.

    Returns:
        The extracted text as ``str``.
    """
    n = len(data) // 2
    # Unpack every code unit in one call instead of once per iteration.
    units = struct.unpack('<%dH' % n, data[:n * 2])
    out = []
    i = 0
    while i < n:
        c = units[i]
        if c in EIGHT:
            if c == 9:
                out.append('\t')
            # A control truncated at the end of the payload simply ends the loop.
            i += 8
            continue
        if c in ONE_NL:
            out.append('\n')
            i += 1
            continue
        if c in ONE_OTHER:
            i += 1
            continue
        if 0xD800 <= c <= 0xDBFF and i + 1 < n and 0xDC00 <= units[i + 1] <= 0xDFFF:
            # Valid UTF-16 surrogate pair -> one supplementary-plane character.
            low = units[i + 1]
            out.append(chr(0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00)))
            i += 2
            continue
        if 0xD800 <= c <= 0xDFFF:
            # Unpaired surrogate: not encodable, substitute the replacement char.
            out.append('\ufffd')
        else:
            out.append(chr(c))
        i += 1
    return ''.join(out)

def parse_section(raw):
    """Walk the record stream of one section and collect its paragraph text.

    Each record starts with a little-endian 32-bit header: bits 0-9 hold the
    tag and bits 20-31 the payload size (bits 10-19 are not used here). A size
    field of 0xFFF means the real size follows as an extra 32-bit integer.
    Only records with tag 67 (PARA_TEXT) contribute text.

    Parsing is best-effort: a stream that ends in the middle of a header
    stops the walk instead of raising ``struct.error``, and a payload that is
    cut short is handed on with whatever bytes are present.

    Args:
        raw: bytes-like, already decompressed section stream.

    Returns:
        The text of all PARA_TEXT records joined with ``'\\n'``.
    """
    text = []
    total = len(raw)
    i = 0
    while i + 4 <= total:
        hdr = struct.unpack_from('<I', raw, i)[0]
        tag = hdr & 0x3FF
        size = (hdr >> 20) & 0xFFF
        i += 4
        if size == 0xFFF:
            # Extended size: make sure the 4 extra bytes are really there.
            if i + 4 > total:
                break
            size = struct.unpack_from('<I', raw, i)[0]
            i += 4
        data = raw[i:i + size]
        i += size
        if tag == 67:  # HWPTAG_PARA_TEXT
            text.append(parse_para_text(data))
    return '\n'.join(text)

def _section_order(entry):
    """Sort key for a ['BodyText', name] entry: numeric 'SectionN' names first, by N."""
    name = entry[1]
    digits = name[len('Section'):]
    if name.startswith('Section') and digits.isdecimal():
        return (0, int(digits), name)
    # Unexpected names keep a stable place after the numbered sections.
    return (1, 0, name)


# Script entry: extract the text of every BodyText stream of the file given
# as the first command-line argument and print it to stdout.
if len(sys.argv) < 2:
    sys.exit('usage: %s FILE.hwp' % sys.argv[0])
f = sys.argv[1]
ole = olefile.OleFileIO(f)
try:
    streams = ole.listdir()
    # Sort by the numeric suffix: a plain string sort would put 'Section10'
    # before 'Section2' and scramble documents with ten or more sections.
    sections = sorted(
        [s for s in streams if len(s) == 2 and s[0] == 'BodyText'],
        key=_section_order,
    )
    alltext = []
    for s in sections:
        raw = ole.openstream(s).read()
        # HWP5 body streams are usually compressed as raw deflate (no zlib
        # header); if inflating fails, treat the stream as stored uncompressed.
        try:
            dec = zlib.decompress(raw, -15)
        except zlib.error:
            dec = raw
        alltext.append(parse_section(dec))
finally:
    # Release the underlying file handle even if reading or parsing fails.
    ole.close()
print('\n'.join(alltext))
