diff --git a/src/main/java/notai/document/application/DocumentService.java b/src/main/java/notai/document/application/DocumentService.java
index 1693b4f..7ebf002 100644
--- a/src/main/java/notai/document/application/DocumentService.java
+++ b/src/main/java/notai/document/application/DocumentService.java
@@ -9,6 +9,9 @@
 import notai.document.presentation.request.DocumentUpdateRequest;
 import notai.folder.domain.Folder;
 import notai.folder.domain.FolderRepository;
+import notai.ocr.application.OCRService;
+import notai.pdf.PdfService;
+import notai.pdf.result.PdfSaveResult;
 import org.springframework.stereotype.Service;
 import org.springframework.web.multipart.MultipartFile;
 
@@ -17,28 +20,26 @@
 public class DocumentService {
 
     private final PdfService pdfService;
+    private final OCRService ocrService;
     private final DocumentRepository documentRepository;
     private final FolderRepository folderRepository;
 
     public DocumentSaveResult saveDocument(
             Long folderId, MultipartFile pdfFile, DocumentSaveRequest documentSaveRequest
     ) {
-        String pdfName = pdfService.savePdf(pdfFile);
-        String pdfUrl = convertPdfUrl(pdfName);
-        Folder folder = folderRepository.getById(folderId);
-        Document document = new Document(folder, documentSaveRequest.name(), pdfUrl);
-        Document savedDocument = documentRepository.save(document);
-        return DocumentSaveResult.of(savedDocument.getId(), savedDocument.getName(), savedDocument.getUrl());
+        PdfSaveResult pdfSaveResult = pdfService.savePdf(pdfFile);
+        Document document = saveAndReturnDocument(folderId, documentSaveRequest, pdfSaveResult.pdfUrl());
+        ocrService.saveOCR(document, pdfSaveResult.pdf());
+        return DocumentSaveResult.of(document.getId(), document.getName(), document.getUrl());
     }
 
     public DocumentSaveResult saveRootDocument(
             MultipartFile pdfFile, DocumentSaveRequest documentSaveRequest
     ) {
-        String pdfName = pdfService.savePdf(pdfFile);
-        String pdfUrl = convertPdfUrl(pdfName);
-        Document document = new Document(documentSaveRequest.name(), pdfUrl);
-        Document savedDocument = documentRepository.save(document);
-        return DocumentSaveResult.of(savedDocument.getId(), savedDocument.getName(), savedDocument.getUrl());
+        PdfSaveResult pdfSaveResult = pdfService.savePdf(pdfFile);
+        Document document = saveAndReturnRootDocument(documentSaveRequest, pdfSaveResult.pdfUrl());
+        ocrService.saveOCR(document, pdfSaveResult.pdf());
+        return DocumentSaveResult.of(document.getId(), document.getName(), document.getUrl());
     }
 
     public DocumentUpdateResult updateDocument(
@@ -65,7 +66,16 @@ public void deleteAllByFolder(
         documentRepository.deleteAllByFolder(folder);
     }
 
-    private String convertPdfUrl(String pdfName) {
-        return String.format("pdf/%s", pdfName);
+    /** Looks up the folder by id, creates a document in it and returns the saved entity. */
+    private Document saveAndReturnDocument(Long folderId, DocumentSaveRequest documentSaveRequest, String pdfUrl) {
+        Folder folder = folderRepository.getById(folderId);
+        Document document = new Document(folder, documentSaveRequest.name(), pdfUrl);
+        return documentRepository.save(document);
+    }
+
+    /** Creates a document without a folder and returns the saved entity. */
+    private Document saveAndReturnRootDocument(DocumentSaveRequest documentSaveRequest, String pdfUrl) {
+        Document document = new Document(documentSaveRequest.name(), pdfUrl);
+        return documentRepository.save(document);
     }
 }
diff --git a/src/main/java/notai/document/presentation/DocumentController.java b/src/main/java/notai/document/presentation/DocumentController.java
index 2356163..5b5610f 100644
--- a/src/main/java/notai/document/presentation/DocumentController.java
+++ b/src/main/java/notai/document/presentation/DocumentController.java
@@ -28,6 +28,9 @@ public class DocumentController {
     private final DocumentService documentService;
     private final DocumentQueryService documentQueryService;
 
+    private static final Long ROOT_FOLDER_ID = -1L;
+    private static final String FOLDER_URL_FORMAT = "/api/folders/%s/documents/%s";
+
     @PostMapping(consumes = {MediaType.MULTIPART_FORM_DATA_VALUE})
     public ResponseEntity<DocumentSaveResponse> saveDocument(
             @PathVariable Long folderId,
@@ -35,13 +38,13 @@ public ResponseEntity<DocumentSaveResponse> saveDocument(
             @RequestPart DocumentSaveRequest documentSaveRequest
     ) {
         DocumentSaveResult documentSaveResult;
-        if (folderId.equals(-1L)) {
+        if (folderId.equals(ROOT_FOLDER_ID)) {
             documentSaveResult = documentService.saveRootDocument(pdfFile, documentSaveRequest);
         } else {
             documentSaveResult = documentService.saveDocument(folderId, pdfFile, documentSaveRequest);
         }
         DocumentSaveResponse response = DocumentSaveResponse.from(documentSaveResult);
-        String url = String.format("/api/folders/%s/documents/%s", folderId, response.id());
+        String url = String.format(FOLDER_URL_FORMAT, folderId, response.id());
         return ResponseEntity.created(URI.create(url)).body(response);
     }
 
diff --git a/src/main/java/notai/ocr/application/OCRService.java b/src/main/java/notai/ocr/application/OCRService.java
new file mode 100644
index 0000000..ee63f5e
--- /dev/null
+++ b/src/main/java/notai/ocr/application/OCRService.java
@@ -0,0 +1,60 @@
+package notai.ocr.application;
+
+import lombok.RequiredArgsConstructor;
+import net.sourceforge.tess4j.Tesseract;
+import notai.common.exception.type.FileProcessException;
+import notai.document.domain.Document;
+import notai.ocr.domain.OCR;
+import notai.ocr.domain.OCRRepository;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.springframework.scheduling.annotation.Async;
+import org.springframework.stereotype.Service;
+
+import java.awt.image.BufferedImage;
+import java.io.File;
+
+/**
+ * Runs Tesseract OCR over every page of a PDF and stores one {@link OCR} row per page.
+ */
+@Service
+@RequiredArgsConstructor
+public class OCRService {
+
+    private final OCRRepository ocrRepository;
+
+    /**
+     * Renders each page of {@code pdfFile}, recognizes its text and saves it with a 1-based page number.
+     *
+     * @param document document the saved OCR rows refer to
+     * @param pdfFile  PDF file to read
+     * @throws FileProcessException if any step fails (a caller of an asynchronously dispatched call will not see it)
+     */
+    @Async
+    public void saveOCR(
+            Document document, File pdfFile
+    ) {
+        try {
+            System.setProperty("jna.library.path", "/usr/local/opt/tesseract/lib/");
+            // Requires a local Tesseract install, e.g. brew install tesseract tesseract-lang (author's Windows/macOS note).
+            Tesseract tesseract = new Tesseract();
+
+            tesseract.setDatapath("/usr/local/share/tessdata");
+            tesseract.setLanguage("kor+eng");
+
+            // try-with-resources closes the PDF even when rendering, OCR or saving a page fails.
+            try (PDDocument pdDocument = Loader.loadPDF(pdfFile)) {
+                PDFRenderer pdfRenderer = new PDFRenderer(pdDocument);
+                for (int i = 0; i < pdDocument.getNumberOfPages(); i++) {
+                    BufferedImage image = pdfRenderer.renderImage(i);
+                    String ocrResult = tesseract.doOCR(image);
+                    OCR ocr = new OCR(document, i + 1, ocrResult);
+                    ocrRepository.save(ocr);
+                }
+            }
+        } catch (Exception e) {
+            throw new FileProcessException("PDF 파일을 통해 OCR 작업을 수행하는데 실패했습니다.");
+        }
+    }
+}
diff --git a/src/main/java/notai/ocr/domain/OCR.java b/src/main/java/notai/ocr/domain/OCR.java
new file mode 100644
index 0000000..cad27e8
--- /dev/null
+++ b/src/main/java/notai/ocr/domain/OCR.java
@@ -0,0 +1,43 @@
+package notai.ocr.domain;
+
+import jakarta.persistence.*;
+import static jakarta.persistence.GenerationType.IDENTITY;
+import jakarta.validation.constraints.NotNull;
+import static lombok.AccessLevel.PROTECTED;
+import lombok.Getter;
+import lombok.NoArgsConstructor;
+import notai.common.domain.RootEntity;
+import notai.document.domain.Document;
+
+/**
+ * OCR text recognized from a single page of a {@link Document}.
+ */
+@Entity
+@Table(name = "ocr")
+@Getter
+@NoArgsConstructor(access = PROTECTED)
+public class OCR extends RootEntity {
+
+    @Id
+    @GeneratedValue(strategy = IDENTITY)
+    private Long id;
+
+    @ManyToOne(fetch = FetchType.LAZY)
+    @JoinColumn(name = "document_id", referencedColumnName = "id")
+    private Document document;
+
+    @NotNull
+    @Column(name = "page_number")
+    private Integer pageNumber;
+
+    // A page of recognized text routinely exceeds 255 characters, so the column is declared TEXT, not VARCHAR(255).
+    @NotNull
+    @Column(name = "content", columnDefinition = "TEXT")
+    private String content;
+
+    public OCR(Document document, Integer pageNumber, String content) {
+        this.document = document;
+        this.pageNumber = pageNumber;
+        this.content = content;
+    }
+}
diff --git a/src/main/java/notai/ocr/domain/OCRRepository.java b/src/main/java/notai/ocr/domain/OCRRepository.java
new file mode 100644
index 0000000..144a12d
--- /dev/null
+++ b/src/main/java/notai/ocr/domain/OCRRepository.java
@@ -0,0 +1,18 @@
+package notai.ocr.domain;
+
+import notai.common.exception.type.NotFoundException;
+import notai.document.domain.Document;
+import org.springframework.data.jpa.repository.JpaRepository;
+
+import java.util.List;
+
+/** Spring Data repository for {@link OCR} rows; {@code getById} throws {@link NotFoundException} when absent. */
+public interface OCRRepository extends JpaRepository<OCR, Long> {
+    default OCR getById(Long id) {
+        return findById(id).orElseThrow(() -> new NotFoundException("OCR 데이터를 찾을 수 없습니다."));
+    }
+
+    List<OCR> findAllByDocumentId(Long documentId);
+
+    void deleteAllByDocument(Document document);
+}
diff --git a/src/main/java/notai/document/presentation/PdfController.java b/src/main/java/notai/pdf/PdfController.java
similarity index 92%
rename from src/main/java/notai/document/presentation/PdfController.java
rename to src/main/java/notai/pdf/PdfController.java
index 0b70edf..9d58c78 100644
--- a/src/main/java/notai/document/presentation/PdfController.java
+++ b/src/main/java/notai/pdf/PdfController.java
@@ -1,7 +1,6 @@
-package notai.document.presentation;
+package notai.pdf;
 
 import lombok.RequiredArgsConstructor;
-import notai.document.application.PdfService;
 import org.springframework.core.io.FileSystemResource;
 import org.springframework.http.HttpHeaders;
 import org.springframework.http.MediaType;
diff --git a/src/main/java/notai/document/application/PdfService.java b/src/main/java/notai/pdf/PdfService.java
similarity index 88%
rename from src/main/java/notai/document/application/PdfService.java
rename to src/main/java/notai/pdf/PdfService.java
index 74f887a..a4a33d8 100644
--- a/src/main/java/notai/document/application/PdfService.java
+++ b/src/main/java/notai/pdf/PdfService.java
@@ -1,8 +1,9 @@
-package notai.document.application;
+package notai.pdf;
 
 import lombok.RequiredArgsConstructor;
 import notai.common.exception.type.FileProcessException;
 import notai.common.exception.type.NotFoundException;
+import notai.pdf.result.PdfSaveResult;
 import org.springframework.stereotype.Service;
 import org.springframework.web.multipart.MultipartFile;
 
@@ -19,7 +20,7 @@ public class PdfService {
 
     private static final String STORAGE_DIR = "src/main/resources/pdf/";
 
-    public String savePdf(MultipartFile file) {
+    public PdfSaveResult savePdf(MultipartFile file) {
         try {
             Path directoryPath = Paths.get(STORAGE_DIR);
             if (!Files.exists(directoryPath)) {
@@ -30,7 +31,7 @@ public String savePdf(MultipartFile file) {
             Path filePath = directoryPath.resolve(fileName);
             file.transferTo(filePath.toFile());
 
-            return fileName;
+            return PdfSaveResult.of(fileName, filePath.toFile());
         } catch (IOException exception) {
             throw new FileProcessException("자료를 저장하는 과정에서 에러가 발생했습니다.");
         }
diff --git a/src/main/java/notai/pdf/result/PdfSaveResult.java b/src/main/java/notai/pdf/result/PdfSaveResult.java
new file mode 100644
index 0000000..340d4af
--- /dev/null
+++ b/src/main/java/notai/pdf/result/PdfSaveResult.java
@@ -0,0 +1,27 @@
+package notai.pdf.result;
+
+import java.io.File;
+
+/**
+ * Result of storing an uploaded PDF.
+ *
+ * @param pdfName name the file was stored under
+ * @param pdfUrl  URL path built from the name as {@code pdf/<name>}
+ * @param pdf     the stored file
+ */
+public record PdfSaveResult(
+        String pdfName,
+        String pdfUrl,
+        File pdf
+) {
+    /** Builds a result whose {@code pdfUrl} is derived from {@code pdfName}. */
+    public static PdfSaveResult of(
+            String pdfName, File pdf
+    ) {
+        return new PdfSaveResult(pdfName, convertPdfUrl(pdfName), pdf);
+    }
+
+    private static String convertPdfUrl(String pdfName) {
+        return String.format("pdf/%s", pdfName);
+    }
+}
diff --git a/src/test/java/notai/document/application/DocumentServiceTest.java b/src/test/java/notai/document/application/DocumentServiceTest.java
deleted file mode 100644
index 6b82bfd..0000000
--- a/src/test/java/notai/document/application/DocumentServiceTest.java
+++ /dev/null
@@ -1,13 +0,0 @@
-package notai.document.application;
-
-import org.junit.jupiter.api.extension.ExtendWith;
-import org.mockito.InjectMocks;
-import org.mockito.junit.jupiter.MockitoExtension;
-
-@ExtendWith(MockitoExtension.class)
-class DocumentServiceTest {
-
-    @InjectMocks
-    PdfService pdfService;
-
-}
diff --git a/src/test/java/notai/document/application/PdfServiceTest.java b/src/test/java/notai/document/application/PdfServiceTest.java
deleted file mode 100644
index 70f2397..0000000
--- a/src/test/java/notai/document/application/PdfServiceTest.java
+++ /dev/null
@@ -1,73 +0,0 @@
-package notai.document.application;
-
-import net.sourceforge.tess4j.Tesseract;
-import org.apache.pdfbox.Loader;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.rendering.PDFRenderer;
-import org.assertj.core.api.Assertions;
-import org.junit.jupiter.api.Test;
-import org.junit.jupiter.api.extension.ExtendWith;
-import org.mockito.InjectMocks;
-import org.mockito.junit.jupiter.MockitoExtension;
-import org.springframework.core.io.ClassPathResource;
-import org.springframework.mock.web.MockMultipartFile;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-
-@ExtendWith(MockitoExtension.class)
-class PdfServiceTest {
-
-    @InjectMocks
-    PdfService pdfService;
-
-    static final String STORAGE_DIR = "src/main/resources/pdf/";
-
-    @Test
-    void savePdf_success_existsTestPdf() throws IOException {
-        //given
-        ClassPathResource existsPdf = new ClassPathResource("pdf/test.pdf");
-        MockMultipartFile mockFile = new MockMultipartFile("file",
-                existsPdf.getFilename(),
-                "application/pdf",
-                Files.readAllBytes(existsPdf.getFile().toPath())
-        );
-        //when
-        String savedFileName = pdfService.savePdf(mockFile);
-        //then
-        Path savedFilePath = Paths.get(STORAGE_DIR, savedFileName);
-        Assertions.assertThat(Files.exists(savedFilePath)).isTrue();
-
-        System.setProperty("jna.library.path", "/usr/local/opt/tesseract/lib/");
-        //window, mac -> brew install tesseract, tesseract-lang
-        Tesseract tesseract = new Tesseract();
-
-        tesseract.setDatapath("/usr/local/share/tessdata");
-        tesseract.setLanguage("kor+eng");
-
-        try {
-            PDDocument pdDocument = Loader.loadPDF(savedFilePath.toFile());
-            PDFRenderer pdfRenderer = new PDFRenderer(pdDocument);
-
-            var image = pdfRenderer.renderImage(9);
-            var start = System.currentTimeMillis();
-            var ocrResult = tesseract.doOCR(image);
-            System.out.println("result : " + ocrResult);
-            var end = System.currentTimeMillis();
-            System.out.println(end - start);
-            pdDocument.close();
-        } catch (Exception e) {
-            e.printStackTrace();
-        }
-
-        deleteFile(savedFilePath);
-    }
-
-    void deleteFile(Path filePath) throws IOException {
-        if (Files.exists(filePath)) {
-            Files.delete(filePath);
-        }
-    }
-}
diff --git a/src/test/java/notai/ocr/application/OCRServiceTest.java b/src/test/java/notai/ocr/application/OCRServiceTest.java
new file mode 100644
index 0000000..50c0b38
--- /dev/null
+++ b/src/test/java/notai/ocr/application/OCRServiceTest.java
@@ -0,0 +1,43 @@
+package notai.ocr.application;
+
+import notai.document.domain.Document;
+import notai.ocr.domain.OCR;
+import notai.ocr.domain.OCRRepository;
+import notai.pdf.result.PdfSaveResult;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.ExtendWith;
+import static org.mockito.ArgumentMatchers.any;
+import org.mockito.InjectMocks;
+import org.mockito.Mock;
+import static org.mockito.Mockito.*;
+import org.mockito.junit.jupiter.MockitoExtension;
+import org.springframework.core.io.ClassPathResource;
+
+import java.io.IOException;
+
+@ExtendWith(MockitoExtension.class)
+class OCRServiceTest {
+
+    // Expected number of saves: one per page of the pdf/test.pdf fixture (the test assumes 43 pages).
+    static final int TEST_PDF_PAGE_COUNT = 43;
+
+    @InjectMocks
+    OCRService ocrService;
+    @Mock
+    OCRRepository ocrRepository;
+
+    @Test
+    void saveOCR_success_existsTestPdf() throws IOException {
+        //given
+        Document document = mock(Document.class);
+        OCR ocr = mock(OCR.class);
+        ClassPathResource existsPdf = new ClassPathResource("pdf/test.pdf");
+        PdfSaveResult saveResult = PdfSaveResult.of("test.pdf", existsPdf.getFile());
+        when(ocrRepository.save(any(OCR.class))).thenReturn(ocr);
+        //when
+        ocrService.saveOCR(document, saveResult.pdf());
+        //then
+        // The PDF is a shared classpath fixture, so it is deliberately not deleted after the run.
+        verify(ocrRepository, times(TEST_PDF_PAGE_COUNT)).save(any(OCR.class));
+    }
+}