Skip to content

Commit

Permalink
add more utilities
Browse files Browse the repository at this point in the history
  • Loading branch information
mhd-hi committed Mar 13, 2024
1 parent 24ff507 commit 7e8c3f9
Show file tree
Hide file tree
Showing 12 changed files with 109 additions and 92 deletions.
2 changes: 1 addition & 1 deletion src/app.module.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import { PdfController } from './pdf/pdf.controller';
import { ConfigModule } from '@nestjs/config';
import config from './config/configuration';
import { HttpModule } from '@nestjs/axios';
import { FileUtil } from './utils/pdf/fileUtils';
import { FileUtil } from './utils/pdf/fileUtil';

@Module({
imports: [
Expand Down
1 change: 0 additions & 1 deletion src/config/configuration.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import * as path from 'path';

/**
 * Configuration factory exported as the module's default export.
 *
 * @returns An object with:
 *  - `port`: the decimal value of the `PORT` environment variable, or 3000
 *    when the variable is missing, non-numeric or 0;
 *  - `pdfOutputPath`: absolute path of `../../test/pdf/output` resolved
 *    relative to this file's directory.
 */
const configuration = () => ({
  // Explicit radix 10 so values such as "0x1F90" are not read as hexadecimal.
  // `??` keeps the call type-safe when PORT is unset; `||` (not `??`) is
  // deliberate because the failure value of parseInt is NaN, not undefined.
  port: Number.parseInt(process.env.PORT ?? '', 10) || 3000,
  pdfOutputPath: path.resolve(__dirname, '../../test/pdf/output'),
});

export default configuration;
8 changes: 1 addition & 7 deletions src/pdf/pdf-parser/horaire/HoraireCours.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,7 @@
import { CourseCodeValidationPipe } from '../../pipes/course-code-validation-pipe';
import { Group } from './Group';
import { Period } from './Period';

/**
 * Contract implemented by `HoraireCours`: a course code, title and
 * prerequisites string, plus its groups indexed by group number.
 */
interface IHoraireCours {
  code: string;
  title: string;
  prerequisites: string;
  /** Groups of the course, keyed by group number. */
  groups: Record<string, Group>;
}
import { IHoraireCours } from './horaire-cours.types';

export class HoraireCours implements IHoraireCours {
private static readonly TITLE_FONT_SIZE = 10.998999999999999;
Expand Down
61 changes: 25 additions & 36 deletions src/pdf/pdf-parser/horaire/horaire-cours.service.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,26 @@
import PDFParser, { Output, Page, Text } from 'pdf2json';
import { Output, Page, Text } from 'pdf2json';
import { Injectable } from '@nestjs/common';
import { HttpService } from '@nestjs/axios';
import { FileUtil } from '../../../utils/pdf/fileUtils';
import { FileUtil } from '../../../utils/pdf/fileUtil';
import { firstValueFrom } from 'rxjs';
import { HoraireCours } from './HoraireCours';
import { Period } from './Period';
import { Group } from './Group';
import { TextExtractor } from '../../../utils/pdf/parser/textExtractorUtil';
import { PdfParserUtil } from '../../../utils/pdf/parser/pdfParserUtil';

/**
 * private processPdfData(
 *   err: null | Error,
 *   pdfData: any,
 * ): PlanificationCourse[] {
 *
 * I think this function could be moved to a utility file that the service would call.
 * That would split the code into two parts:
 *
 * 1. parsing of the input data (in the utility file),
 * 2. management and maintenance of the data (in the service)
 */

@Injectable()
export class HoraireCoursService {
Expand All @@ -14,10 +29,7 @@ export class HoraireCoursService {
private readonly START_PAGE_CONTENT_Y_AXIS = 14.019;
private readonly END_PAGE_CONTENT_Y_AXIS = 59;

constructor(
private httpService: HttpService,
private fileUtil: FileUtil,
) {}
constructor(private httpService: HttpService) {}

public async parsePdfFromUrl(pdfUrl: string) {
try {
Expand All @@ -34,24 +46,10 @@ export class HoraireCoursService {
/**
 * Parses a course-schedule PDF held in memory and resolves with the courses
 * extracted by `processPdfData`.
 *
 * NOTE(review): as written, this block contains two consecutive `return`
 * statements. The first (inline `PDFParser` promise) always executes, so the
 * trailing `PdfParserUtil.parsePdfBuffer` call is unreachable. This looks like
 * the old and new bodies of a diff rendered together; confirm which one is
 * intended.
 *
 * @param pdfBuffer - Raw bytes of the PDF to parse.
 * @returns A promise resolving with the parsed `HoraireCours` list.
 */
private async parseHoraireCoursPdf(
pdfBuffer: Buffer,
): Promise<HoraireCours[]> {
const parser = new PDFParser();
return new Promise((resolve, reject) => {
// Parser errors are only logged here; the promise is not rejected on this path.
parser.on('pdfParser_dataError', (errData: string) =>
console.error(errData),
);
parser.on('pdfParser_dataReady', async (pdfData) => {
try {
console.info('Parsing PDF...');
// Dump the raw parser output, then the extracted courses, through fileUtil.
await this.fileUtil.writeDataToFile(pdfData, 'inputHoraire.json');
const courses = this.processPdfData(pdfData);
await this.fileUtil.writeDataToFile(courses, 'coursesHoraire.json');
resolve(courses);
} catch (error) {
reject(error);
}
});
// Starts parsing; the handlers registered above receive the outcome.
parser.parseBuffer(pdfBuffer);
});
// Unreachable as written (see NOTE above): delegates parsing to PdfParserUtil,
// passing processPdfData bound to this instance as the processing callback.
return PdfParserUtil.parsePdfBuffer(
pdfBuffer,
this.processPdfData.bind(this),
);
}

// Processes the raw PDF data to extract course information
Expand All @@ -69,12 +67,12 @@ export class HoraireCoursService {
const {
textContent: text,
fontSize,
bold,
xPos,
yPos,
} = this.extractTextDetails(textItem);
bold,
} = TextExtractor.extractTextDetails(textItem);

if (!text || yPos > this.END_PAGE_CONTENT_Y_AXIS || bold) return; //Les cells du header (COURS, ...) sont en gras
if (!text || yPos > this.END_PAGE_CONTENT_Y_AXIS || bold) return;

if (HoraireCours.isCourseCode(text, xPos)) {
// Finalize the last group of the current course if necessary
Expand Down Expand Up @@ -143,13 +141,4 @@ export class HoraireCoursService {
throw new Error('Error processing PDF data');
}
}

/**
 * Collects the fields of a pdf2json text item that the parser works with.
 * Only the first run (`R[0]`) of the item is inspected.
 *
 * @param textItem - Text item taken from a parsed page.
 * @returns The URI-decoded, trimmed text, the values at indices 1 and 2 of
 *   the run's `TS` array (returned as `fontSize` and `bold`), and the item's
 *   x/y position.
 */
private extractTextDetails(textItem: Text) {
  const firstRun = textItem.R[0];
  const style = firstRun.TS;

  return {
    textContent: decodeURIComponent(firstRun.T).trim(),
    fontSize: style[1],
    bold: style[2],
    xPos: textItem.x,
    yPos: textItem.y,
  };
}
}
8 changes: 8 additions & 0 deletions src/pdf/pdf-parser/horaire/horaire-cours.types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import { Group } from './Group';

/**
 * Public shape of a parsed course schedule entry: a course code, title and
 * prerequisites string, plus its groups indexed by group number.
 */
export interface IHoraireCours {
  code: string;
  title: string;
  prerequisites: string;
  /** Groups of the course, keyed by group number. */
  groups: Record<string, Group>;
}
2 changes: 1 addition & 1 deletion src/pdf/pdf-parser/planification/Row.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ export class Row {
this.endX = this.truncateToFiveDecimals(endX);
}

private truncateToFiveDecimals(num) {
/**
 * Cuts a number down to at most five decimal places.
 *
 * Uses `Math.floor`, so the result is rounded toward negative infinity
 * (for negative inputs this is not a truncation toward zero).
 *
 * @param num - Value to shorten.
 * @returns `num` floored at the fifth decimal place.
 */
private truncateToFiveDecimals(num: number) {
  const scale = 1e5;
  const scaled = num * scale;
  return Math.floor(scaled) / scale;
}

Expand Down
36 changes: 9 additions & 27 deletions src/pdf/pdf-parser/planification/planification-cours.service.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import PDFParser, { Output, Fill, Page, Text } from 'pdf2json';
import { HttpService } from '@nestjs/axios';
import { FileUtil } from '../../../utils/pdf/fileUtils';
import { FileUtil } from '../../../utils/pdf/fileUtil';
import { Injectable } from '@nestjs/common';
import { firstValueFrom } from 'rxjs';
import { CourseCodeValidationPipe } from '../../pipes/course-code-validation-pipe';
import { Row } from './Row';
import { PlanificationCours } from './planification-cours.types';
import { TextExtractor } from '../../../utils/pdf/parser/textExtractorUtil';
import { PdfParserUtil } from '../../../utils/pdf/parser/pdfParserUtil';

//TODO Add title to the course (graduate-level programs have several courses with the same code)
//https://horaire.etsmtl.ca/Horairepublication/Planification-CyclesSuperieurs.pdf
Expand Down Expand Up @@ -36,31 +38,10 @@ export class PlanificationCoursService {
/**
 * Parses a course-planning PDF held in memory and resolves with the courses
 * extracted by `processPdfData`.
 *
 * NOTE(review): as written, this block contains two consecutive `return`
 * statements. The first (inline `PDFParser` promise) always executes, so the
 * trailing `PdfParserUtil.parsePdfBuffer` call is unreachable. This looks like
 * the old and new bodies of a diff rendered together; confirm which one is
 * intended.
 *
 * @param pdfBuffer - Raw bytes of the PDF to parse.
 * @returns A promise resolving with the parsed `PlanificationCours` list.
 */
private parsePlanificationCoursPdf(
pdfBuffer: Buffer,
): Promise<PlanificationCours[]> {
const parser = new PDFParser(this, 1);

return new Promise((resolve, reject) => {
// Parser errors are only logged here; the promise is not rejected on this path.
parser.on('pdfParser_dataError', (errData: string) =>
console.error(errData),
);
parser.on('pdfParser_dataReady', async (pdfData: Output) => {
try {
// Dump the raw parser output through fileUtil before processing it.
await this.fileUtil.writeDataToFile(
pdfData,
'inputPlanification.json',
);
const courses = this.processPdfData(pdfData);
// Dump the extracted courses as well.
await this.fileUtil.writeDataToFile(
courses,
'coursesPlanification.json',
);
resolve(courses);
} catch (error) {
console.error('Error parsing pdf data: ' + error);
reject(error);
}
});
// Starts parsing; the handlers registered above receive the outcome.
parser.parseBuffer(pdfBuffer);
});
// Unreachable as written (see NOTE above): delegates parsing to PdfParserUtil,
// passing processPdfData bound to this instance as the processing callback.
return PdfParserUtil.parsePdfBuffer(
pdfBuffer,
this.processPdfData.bind(this),
);
}

private processPdfData(pdfData: Output): PlanificationCours[] {
Expand All @@ -73,7 +54,8 @@ export class PlanificationCoursService {
pdfData.Pages.forEach((page: Page) => {
page.Texts.forEach((textItem: Text) => {
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const { textContent, xPos, yPos } = this.extractTextDetails(textItem); //TODO Check yPos later
const { textContent, xPos, yPos } =
TextExtractor.extractTextDetails(textItem); //TODO Check yPos later

const currentColumn = Row.getColumnHeaderName(headerCells, xPos);
// Process course code
Expand Down
4 changes: 2 additions & 2 deletions src/pdf/pdf.controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import { HoraireCoursService } from './pdf-parser/horaire/horaire-cours.service'
import { PlanificationCoursService } from './pdf-parser/planification/planification-cours.service';
import { isValidUrl } from '../utils/url/urlUtils';
import { PlanificationCours } from './pdf-parser/planification/planification-cours.types';
import { HoraireCours } from './pdf-parser/horaire/horaire-cours.types';
import { IHoraireCours } from './pdf-parser/horaire/horaire-cours.types';

@Controller('pdf')
export class PdfController {
Expand All @@ -22,7 +22,7 @@ export class PdfController {
@Get('parseHoraireCoursPdf')
async parseHoraireCoursPdf(
@Query('pdfUrl') pdfUrl: string,
): Promise<HoraireCours[]> {
): Promise<IHoraireCours[]> {
try {
console.log('Controller file', pdfUrl);
if (!pdfUrl || !isValidUrl(pdfUrl)) {
Expand Down
8 changes: 5 additions & 3 deletions src/utils/pdf/fileUtils.ts → src/utils/pdf/fileUtil.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import { ConfigService } from '@nestjs/config';
export class FileUtil {
constructor(private configService: ConfigService) {}

writeDataToFile(data: any, fileName: string): Promise<string | null> {
public writeDataToFile(data: any, fileName: string): Promise<string | null> {
const pdfOutputPath =
this.configService.get<string>('pdfOutputPath') ||
path.join(__dirname, fileName);
Expand All @@ -34,8 +34,10 @@ export class FileUtil {
console.error('Error encountered while writing file: ', err);
reject(err);
} else {
console.log(`File "${fileName}" successfully written`);
resolve(data);
console.log(
`File "${fileName}" successfully written to "${filePath}"`,
);
resolve(filePath);
}
},
);
Expand Down
24 changes: 24 additions & 0 deletions src/utils/pdf/parser/pdfParserUtil.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import PDFParser, { Output } from 'pdf2json';
import { TextExtractor } from './textExtractorUtil';

/**
 * Promise-based wrapper around pdf2json's event-driven `PDFParser`.
 */
export class PdfParserUtil {
  /**
   * Parses a PDF held in memory and hands the parsed document to `processData`.
   *
   * The returned promise rejects when the parser emits `pdfParser_dataError`,
   * when `processData` throws synchronously, or when the promise it returns
   * rejects. Without the rejection on `pdfParser_dataError` a parse failure
   * would leave the promise pending forever.
   *
   * @param pdfBuffer - Raw bytes of the PDF to parse.
   * @param processData - Converts the parser output into the caller's result;
   *   may return the result directly or a promise of it.
   * @returns A promise resolving with whatever `processData` produces.
   */
  public static parsePdfBuffer<T>(
    pdfBuffer: Buffer,
    processData: (pdfData: Output) => T | Promise<T>,
  ): Promise<T> {
    return new Promise<T>((resolve, reject) => {
      // Created inside the executor so a constructor or parseBuffer throw
      // becomes a rejection rather than an uncaught exception.
      const parser = new PDFParser();

      parser.on('pdfParser_dataError', (errData: unknown) => {
        console.error(errData);
        reject(PdfParserUtil.toError(errData));
      });

      parser.on('pdfParser_dataReady', (pdfData) => {
        try {
          // resolve() adopts a returned promise, so asynchronous failures of
          // processData propagate as rejections too.
          resolve(processData(pdfData));
        } catch (error) {
          reject(error);
        }
      });

      parser.parseBuffer(pdfBuffer);
    });
  }

  /**
   * Normalizes a parser error payload into an `Error`.
   * Accepts an `Error`, an object carrying a `parserError` field, or any
   * other value (converted to its string form).
   */
  private static toError(errData: unknown): Error {
    if (errData instanceof Error) {
      return errData;
    }
    if (
      typeof errData === 'object' &&
      errData !== null &&
      'parserError' in errData
    ) {
      const inner = (errData as { parserError: unknown }).parserError;
      return inner instanceof Error ? inner : new Error(String(inner));
    }
    return new Error(String(errData));
  }
}
19 changes: 19 additions & 0 deletions src/utils/pdf/parser/textExtractorUtil.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import { Text } from 'pdf2json';

/**
 * Helpers for reading pdf2json text items.
 */
export class TextExtractor {
  /**
   * Flattens a text item into the values the parsers work with.
   * Only the first run (`R[0]`) of the item is inspected.
   *
   * @param textItem - Text item taken from a parsed page.
   * @returns The URI-decoded, trimmed text; the value at index 1 of the
   *   run's `TS` array as `fontSize`; the item's x/y position; and `bold`,
   *   which is true exactly when index 2 of `TS` equals 1.
   */
  public static extractTextDetails(textItem: Text): {
    fontSize: number;
    textContent: string;
    xPos: number;
    yPos: number;
    bold: boolean;
  } {
    const firstRun = textItem.R[0];
    const decoded = decodeURIComponent(firstRun.T);
    const style = firstRun.TS;

    return {
      textContent: decoded.trim(),
      fontSize: style[1],
      xPos: textItem.x,
      yPos: textItem.y,
      bold: style[2] === 1,
    };
  }
}
Loading

0 comments on commit 7e8c3f9

Please sign in to comment.