Skip to content

Commit

Permalink
Feat: PDF 처리 기능 및 OCR 기능 (#19)
Browse files Browse the repository at this point in the history
* chore: OCR 도메인 생성

도메인 생성

* feat: ID를 사용하여 엔티티 꺼내오는 작업 수행

엔티티 꺼내오는 작업 수행

* rename: PDF 관련 파일 이동

파일 이동

* feat: 문서 저장시 OCR 작업수행하는 기능

OCR 자동 작업 기능 구현

* refactor: PDF 저장 로직 개선

저장 로직 개선

* test: OCR 기능 작동 여부 테스트코드 작성

테스트 코드 작성

* refactor: 상수값 static 으로 따로 관리

상수값 관리하도록 피드백 반영

* feat: AOP 를 활용하여 Get요청시 ok를 바로 보내줄 수 있도록 기능 구현

AOP 활용 래퍼클래스 개발

* remove: 상의 후 도입할지 정해야하기때문에 우선 삭제
  • Loading branch information
yunjunghun0116 authored and hynseoj committed Oct 11, 2024
1 parent 751e5cd commit 311db97
Show file tree
Hide file tree
Showing 11 changed files with 205 additions and 106 deletions.
34 changes: 21 additions & 13 deletions src/main/java/notai/document/application/DocumentService.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
import notai.document.presentation.request.DocumentUpdateRequest;
import notai.folder.domain.Folder;
import notai.folder.domain.FolderRepository;
import notai.ocr.application.OCRService;
import notai.pdf.PdfService;
import notai.pdf.result.PdfSaveResult;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;

Expand All @@ -17,28 +20,26 @@
public class DocumentService {

private final PdfService pdfService;
private final OCRService ocrService;
private final DocumentRepository documentRepository;
private final FolderRepository folderRepository;

public DocumentSaveResult saveDocument(
Long folderId, MultipartFile pdfFile, DocumentSaveRequest documentSaveRequest
) {
String pdfName = pdfService.savePdf(pdfFile);
String pdfUrl = convertPdfUrl(pdfName);
Folder folder = folderRepository.getById(folderId);
Document document = new Document(folder, documentSaveRequest.name(), pdfUrl);
Document savedDocument = documentRepository.save(document);
return DocumentSaveResult.of(savedDocument.getId(), savedDocument.getName(), savedDocument.getUrl());
PdfSaveResult pdfSaveResult = pdfService.savePdf(pdfFile);
Document document = saveAndReturnDocument(folderId, documentSaveRequest, pdfSaveResult.pdfUrl());
ocrService.saveOCR(document, pdfSaveResult.pdf());
return DocumentSaveResult.of(document.getId(), document.getName(), document.getUrl());
}

public DocumentSaveResult saveRootDocument(
MultipartFile pdfFile, DocumentSaveRequest documentSaveRequest
) {
String pdfName = pdfService.savePdf(pdfFile);
String pdfUrl = convertPdfUrl(pdfName);
Document document = new Document(documentSaveRequest.name(), pdfUrl);
Document savedDocument = documentRepository.save(document);
return DocumentSaveResult.of(savedDocument.getId(), savedDocument.getName(), savedDocument.getUrl());
PdfSaveResult pdfSaveResult = pdfService.savePdf(pdfFile);
Document document = saveAndReturnRootDocument(documentSaveRequest, pdfSaveResult.pdfUrl());
ocrService.saveOCR(document, pdfSaveResult.pdf());
return DocumentSaveResult.of(document.getId(), document.getName(), document.getUrl());
}

public DocumentUpdateResult updateDocument(
Expand All @@ -65,7 +66,14 @@ public void deleteAllByFolder(
documentRepository.deleteAllByFolder(folder);
}

private String convertPdfUrl(String pdfName) {
return String.format("pdf/%s", pdfName);
/**
 * Persists a new document that belongs to the folder identified by {@code folderId}.
 *
 * @param folderId            id handed to {@code folderRepository.getById}
 * @param documentSaveRequest request whose {@code name()} becomes the document name
 * @param pdfUrl              URL under which the stored PDF is exposed
 * @return the value returned by {@code documentRepository.save}
 */
private Document saveAndReturnDocument(Long folderId, DocumentSaveRequest documentSaveRequest, String pdfUrl) {
    // Resolve the folder first so the lookup happens before the document is built.
    Folder parentFolder = folderRepository.getById(folderId);
    return documentRepository.save(new Document(parentFolder, documentSaveRequest.name(), pdfUrl));
}

/**
 * Persists a new document that is not attached to any folder.
 *
 * @param documentSaveRequest request whose {@code name()} becomes the document name
 * @param pdfUrl              URL under which the stored PDF is exposed
 * @return the value returned by {@code documentRepository.save}
 */
private Document saveAndReturnRootDocument(DocumentSaveRequest documentSaveRequest, String pdfUrl) {
    return documentRepository.save(new Document(documentSaveRequest.name(), pdfUrl));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,20 +28,23 @@ public class DocumentController {
private final DocumentService documentService;
private final DocumentQueryService documentQueryService;

private static final Long ROOT_FOLDER_ID = -1L;
private static final String FOLDER_URL_FORMAT = "/api/folders/%s/documents/%s";

@PostMapping(consumes = {MediaType.MULTIPART_FORM_DATA_VALUE})
public ResponseEntity<DocumentSaveResponse> saveDocument(
@PathVariable Long folderId,
@RequestPart MultipartFile pdfFile,
@RequestPart DocumentSaveRequest documentSaveRequest
) {
DocumentSaveResult documentSaveResult;
if (folderId.equals(-1L)) {
if (folderId.equals(ROOT_FOLDER_ID)) {
documentSaveResult = documentService.saveRootDocument(pdfFile, documentSaveRequest);
} else {
documentSaveResult = documentService.saveDocument(folderId, pdfFile, documentSaveRequest);
}
DocumentSaveResponse response = DocumentSaveResponse.from(documentSaveResult);
String url = String.format("/api/folders/%s/documents/%s", folderId, response.id());
String url = String.format(FOLDER_URL_FORMAT, folderId, response.id());
return ResponseEntity.created(URI.create(url)).body(response);
}

Expand Down
50 changes: 50 additions & 0 deletions src/main/java/notai/ocr/application/OCRService.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package notai.ocr.application;

import lombok.RequiredArgsConstructor;
import net.sourceforge.tess4j.Tesseract;
import notai.common.exception.type.FileProcessException;
import notai.document.domain.Document;
import notai.ocr.domain.OCR;
import notai.ocr.domain.OCRRepository;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;

import java.awt.image.BufferedImage;
import java.io.File;

/**
 * Runs Tesseract OCR over every page of a stored PDF and persists one {@link OCR} row per page.
 */
@Service
@RequiredArgsConstructor
public class OCRService {

    // NOTE(review): these locations look like a Homebrew install
    // (brew install tesseract tesseract-lang); consider moving them to configuration.
    private static final String JNA_LIBRARY_PATH = "/usr/local/opt/tesseract/lib/";
    private static final String TESSDATA_PATH = "/usr/local/share/tessdata";
    private static final String OCR_LANGUAGES = "kor+eng";

    private final OCRRepository ocrRepository;

    /**
     * Renders each page of {@code pdfFile}, recognizes its text and saves it with a 1-based page number.
     *
     * <p>The method is annotated with {@link Async}.
     * NOTE(review): presumably a failure here never reaches the caller when async execution is
     * enabled - confirm how errors from this method are surfaced.
     *
     * @param document document the OCR rows are attached to
     * @param pdfFile  PDF file on disk to run OCR on
     * @throws FileProcessException if loading, rendering, recognizing or saving fails
     */
    @Async
    public void saveOCR(
            Document document, File pdfFile
    ) {
        try {
            System.setProperty("jna.library.path", JNA_LIBRARY_PATH);
            Tesseract tesseract = new Tesseract();
            tesseract.setDatapath(TESSDATA_PATH);
            tesseract.setLanguage(OCR_LANGUAGES);

            // try-with-resources closes the PDF even when rendering, OCR or save fails midway;
            // previously close() was skipped on any exception, leaking the open document.
            try (PDDocument pdDocument = Loader.loadPDF(pdfFile)) {
                PDFRenderer pdfRenderer = new PDFRenderer(pdDocument);
                int pageCount = pdDocument.getNumberOfPages();
                for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) {
                    BufferedImage image = pdfRenderer.renderImage(pageIndex);
                    String ocrResult = tesseract.doOCR(image);
                    // Renderer indices are 0-based; stored page numbers are 1-based.
                    ocrRepository.save(new OCR(document, pageIndex + 1, ocrResult));
                }
            }
        } catch (Exception e) {
            // NOTE(review): the cause is dropped here; attach `e` if FileProcessException
            // offers a constructor that accepts a cause.
            throw new FileProcessException("PDF 파일을 통해 OCR 작업을 수행하는데 실패했습니다.");
        }
    }
}
39 changes: 39 additions & 0 deletions src/main/java/notai/ocr/domain/OCR.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package notai.ocr.domain;

import jakarta.persistence.*;
import static jakarta.persistence.GenerationType.IDENTITY;
import jakarta.validation.constraints.NotNull;
import static lombok.AccessLevel.PROTECTED;
import lombok.Getter;
import lombok.NoArgsConstructor;
import notai.common.domain.RootEntity;
import notai.document.domain.Document;

/**
 * JPA entity holding the OCR text recognized for a single page of a {@link Document}.
 */
@Entity
@Table(name = "ocr")
@Getter
@NoArgsConstructor(access = PROTECTED)
public class OCR extends RootEntity<Long> {

    @Id
    @GeneratedValue(strategy = IDENTITY)
    private Long id;

    @ManyToOne(fetch = FetchType.LAZY)
    @JoinColumn(name = "document_id", referencedColumnName = "id")
    private Document document;

    @NotNull
    @Column(name = "page_number")
    private Integer pageNumber;

    // The recognized text of a whole page is stored here, which routinely exceeds the
    // previous 255-character limit and made inserts fail or truncate. Map it to a TEXT column.
    // NOTE(review): an existing schema needs a matching migration of the `content` column.
    @NotNull
    @Column(name = "content", columnDefinition = "TEXT")
    private String content;

    /**
     * Creates the OCR result for one page.
     *
     * @param document   document the page belongs to
     * @param pageNumber page number within the document
     * @param content    text recognized on the page
     */
    public OCR(Document document, Integer pageNumber, String content) {
        this.document = document;
        this.pageNumber = pageNumber;
        this.content = content;
    }
}
17 changes: 17 additions & 0 deletions src/main/java/notai/ocr/domain/OCRRepository.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package notai.ocr.domain;

import notai.common.exception.type.NotFoundException;
import notai.document.domain.Document;
import org.springframework.data.jpa.repository.JpaRepository;

import java.util.List;

/**
 * Spring Data repository for {@link OCR} rows.
 */
public interface OCRRepository extends JpaRepository<OCR, Long> {

    List<OCR> findAllByDocumentId(Long documentId);

    void deleteAllByDocument(Document document);

    /**
     * Looks up an OCR row by id.
     *
     * @param id primary key passed to {@code findById}
     * @return the matching row
     * @throws NotFoundException if {@code findById} yields no row
     */
    default OCR getById(Long id) {
        var lookup = findById(id);
        if (lookup.isEmpty()) {
            throw new NotFoundException("OCR 데이터를 찾을 수 없습니다.");
        }
        return lookup.get();
    }
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package notai.document.presentation;
package notai.pdf;

import lombok.RequiredArgsConstructor;
import notai.document.application.PdfService;
import org.springframework.core.io.FileSystemResource;
import org.springframework.http.HttpHeaders;
import org.springframework.http.MediaType;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
package notai.document.application;
package notai.pdf;

import lombok.RequiredArgsConstructor;
import notai.common.exception.type.FileProcessException;
import notai.common.exception.type.NotFoundException;
import notai.pdf.result.PdfSaveResult;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;

Expand All @@ -19,7 +20,7 @@ public class PdfService {

private static final String STORAGE_DIR = "src/main/resources/pdf/";

public String savePdf(MultipartFile file) {
public PdfSaveResult savePdf(MultipartFile file) {
try {
Path directoryPath = Paths.get(STORAGE_DIR);
if (!Files.exists(directoryPath)) {
Expand All @@ -30,7 +31,7 @@ public String savePdf(MultipartFile file) {
Path filePath = directoryPath.resolve(fileName);
file.transferTo(filePath.toFile());

return fileName;
return PdfSaveResult.of(fileName, filePath.toFile());
} catch (IOException exception) {
throw new FileProcessException("자료를 저장하는 과정에서 에러가 발생했습니다.");
}
Expand Down
19 changes: 19 additions & 0 deletions src/main/java/notai/pdf/result/PdfSaveResult.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package notai.pdf.result;

import java.io.File;

/**
 * Outcome of storing a PDF: its file name, the URL derived from that name, and the file itself.
 *
 * @param pdfName name of the stored file
 * @param pdfUrl  URL of the form {@code pdf/<pdfName>}
 * @param pdf     the stored file
 */
public record PdfSaveResult(String pdfName, String pdfUrl, File pdf) {

    private static final String PDF_URL_FORMAT = "pdf/%s";

    /**
     * Builds a result from a file name and file, deriving the URL from the name.
     *
     * @param pdfName name of the stored file
     * @param pdf     the stored file
     * @return a result whose {@code pdfUrl} is {@code "pdf/" + pdfName}
     */
    public static PdfSaveResult of(String pdfName, File pdf) {
        String derivedUrl = toPdfUrl(pdfName);
        return new PdfSaveResult(pdfName, derivedUrl, pdf);
    }

    /** Prefixes the file name with the {@code pdf/} path segment. */
    private static String toPdfUrl(String fileName) {
        return PDF_URL_FORMAT.formatted(fileName);
    }
}
13 changes: 0 additions & 13 deletions src/test/java/notai/document/application/DocumentServiceTest.java

This file was deleted.

73 changes: 0 additions & 73 deletions src/test/java/notai/document/application/PdfServiceTest.java

This file was deleted.

49 changes: 49 additions & 0 deletions src/test/java/notai/ocr/application/OCRServiceTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package notai.ocr.application;

import notai.document.domain.Document;
import notai.ocr.domain.OCR;
import notai.ocr.domain.OCRRepository;
import notai.pdf.result.PdfSaveResult;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import static org.mockito.ArgumentMatchers.any;
import org.mockito.InjectMocks;
import org.mockito.Mock;
import static org.mockito.Mockito.*;
import org.mockito.junit.jupiter.MockitoExtension;
import org.springframework.core.io.ClassPathResource;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

/**
 * Unit test for {@link OCRService#saveOCR} using a mocked {@link OCRRepository}.
 *
 * <p>NOTE(review): the service under test uses hard-coded local Tesseract paths, so this
 * test presumably needs Tesseract installed on the machine running it - confirm for CI.
 */
@ExtendWith(MockitoExtension.class)
class OCRServiceTest {

    // One OCR row is saved per page, so this is the expected page count of pdf/test.pdf.
    private static final int TEST_PDF_PAGE_COUNT = 43;

    @InjectMocks
    OCRService ocrService;
    @Mock
    OCRRepository ocrRepository;

    @Test
    void savePdf_success_existsTestPdf() throws IOException {
        //given
        Document document = mock(Document.class);
        OCR ocr = mock(OCR.class);
        ClassPathResource existsPdf = new ClassPathResource("pdf/test.pdf");
        PdfSaveResult saveResult = PdfSaveResult.of("test.pdf", existsPdf.getFile());
        when(ocrRepository.save(any(OCR.class))).thenReturn(ocr);
        //when
        ocrService.saveOCR(document, saveResult.pdf());
        //then
        verify(ocrRepository, times(TEST_PDF_PAGE_COUNT)).save(any(OCR.class));
        // The PDF is a shared classpath fixture that this test did not create, so it is
        // deliberately not deleted; removing it broke subsequent runs of the test.
    }
}

0 comments on commit 311db97

Please sign in to comment.