Skip to content

Commit a757b36

Browse files
committed
v 0.0.15
Signed-off-by: Avishrant Sharma <avishrants@gmail.com>
1 parent 1665e07 commit a757b36

File tree

6 files changed

+110
-26
lines changed

6 files changed

+110
-26
lines changed

CHANGELOG

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
Change Log
22
==========
3-
0.0.13 (13/07/2021)
3+
0.0.15 (18/07/2021)
44
------------------
5-
- Errors are now part of scan_result and not raised (except Threshold Value Error)
6-
- Removed mulithreading and go routines
7-
- Add unittests
5+
- Add experimental parameter 'use_buffer' for large files.
6+
- Generate license mapping compatible with Scancode.io
7+
- Update BabelStoneIDS license category
8+
- Add max_size parameter to stop scan

README.md

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,71 @@
1-
# LicensePackage
2-
A Python module to find valid copyright and license expressions in a file.
3-
This module is based on Google LicenseClassifier.
1+
# Golicense-Classifier
2+
A Python-based module to find valid copyright and license expressions in a file.
3+
4+
_Note: This module is based on Google LicenseClassifier._
5+
6+
## Installation
7+
Currently, this package supports only Linux. Support for Windows and macOS is in progress.
8+
9+
To install from PyPI, use:
10+
```sh
11+
pip install golicense-classifier
12+
```
13+
14+
## Usage
15+
To get started, import the `LicenseClassifier` class from the module:
16+
17+
```python
18+
from LicenseClassifier.classifier import LicenseClassifier
19+
```
20+
21+
_Note: Copyright statement detection is still a work in progress. Expect some issues, mostly with binary files._
22+
23+
The class provides several methods for scanning.
24+
25+
1. `scan_directory`
26+
27+
This method is used to recursively walk through a directory and find license expressions and copyright statements. It returns a dictionary object with keys `header` and `files`.
28+
29+
### Usage
30+
___
31+
```python
32+
classifier = LicenseClassifier()
33+
res = classifier.scan_directory('PATH_TO_DIR')
34+
```
35+
### Optional Parameters
36+
___
37+
- `max_size`
38+
39+
Maximum file size in MB. The default is 10 MB. Set `max_size < 0` to ignore size constraints.
40+
41+
- `use_buffer`
42+
43+
`(Experimental)` Set to `True` to use buffered file scanning. `max_size` will be used as the buffer size.
44+
45+
46+
2. `scan_file`
47+
48+
This method is used to find license expressions and copyright statements on a single file.
49+
50+
### Usage
51+
___
52+
```python
53+
classifier = LicenseClassifier()
54+
res = classifier.scan_file('PATH_TO_FILE')
55+
```
56+
### Optional Parameters
57+
___
58+
- `max_size`
59+
60+
Maximum file size in MB. The default is 10 MB. Set `max_size < 0` to ignore size constraints.
61+
62+
- `use_buffer`
63+
64+
`(Experimental)` Set to `True` to use buffered file scanning. `max_size` will be used as the buffer size.
65+
66+
## Setting Custom Scanning Threshold
67+
68+
You can set a custom scanning threshold that best suits your needs by passing the `threshold` parameter when creating the object:
69+
```python
70+
classifier = LicenseClassifier(threshold = 0.9)
71+
```

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55

66
setuptools.setup(
77
name="golicense_classifier",
8-
version="0.0.14",
8+
version="0.0.15",
99
author="AvishrantSh (Avishrant Sharma)",
1010
author_email="<avishrants@gmail.com>",
11-
description="A Python based License Classifier based on Google License Classifier",
11+
description="A Python based License Classification and Copyright Statement Detection tool based on Google License Classifier",
1212
long_description=long_description,
1313
long_description_content_type="text/markdown",
1414
url="https://github.com/AvishrantsSh/LicensePackage",

src/LicenseClassifier/classifier.py

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,16 @@ class LicenseClassifier:
99
Base Class
1010
"""
1111

12-
_ROOT = os.path.dirname(__file__)
12+
__ROOT = os.path.dirname(__file__)
1313

1414
# Shared Library
15-
_so = ctypes.cdll.LoadLibrary(os.path.join(_ROOT, "compiled/libmatch.so"))
16-
_init = _so.CreateClassifier
17-
_init.argtypes = [ctypes.c_char_p, ctypes.c_double]
15+
__so = ctypes.cdll.LoadLibrary(os.path.join(__ROOT, "compiled/libmatch.so"))
16+
__init = __so.CreateClassifier
17+
__init.argtypes = [ctypes.c_char_p, ctypes.c_double]
1818

19-
_scanfile = _so.ScanFile
20-
_scanfile.argtypes = [ctypes.c_char_p, ctypes.c_int]
21-
_scanfile.restype = ctypes.c_char_p
19+
__scanfile = __so.ScanFile
20+
__scanfile.argtypes = [ctypes.c_char_p, ctypes.c_int, ctypes.c_bool]
21+
__scanfile.restype = ctypes.c_char_p
2222

2323
def __init__(self, threshold: float = 0.8) -> None:
2424
"""
@@ -32,24 +32,33 @@ def __init__(self, threshold: float = 0.8) -> None:
3232
if not 0 < threshold <= 1:
3333
raise ValueError("Threshold out of bounds (0 < threshold <= 1)")
3434

35-
self._init(
36-
os.fsencode(os.path.join(LicenseClassifier._ROOT, "licenses/")), threshold
35+
self.__init(
36+
os.fsencode(os.path.join(LicenseClassifier.__ROOT, "licenses/")), threshold
3737
)
3838

39-
def scan_directory(self, location: str):
39+
def scan_directory(self, location: str, max_size=10, use_buffer=False):
4040
"""
4141
Function to find valid license and copyright expressions for files in `location`.
4242
4343
Parameters
4444
----------
4545
location : str
4646
Path to location of directory to scan.
47+
max_size : int
48+
Maximum size of file in MB. Default is set to 10MB. Set `max_size < 0` to ignore size constraints
49+
use_buffer : bool
50+
`(Experimental)` Set `True` to use buffered file scanning. `max_size` will be used as buffer size.
4751
"""
4852
result = []
4953
start_time = datetime.now(timezone.utc)
5054
for (dirpath, _, filenames) in os.walk(location):
5155
result += [
52-
self.scan_file(os.path.join(dirpath, file)) for file in filenames
56+
self.scan_file(
57+
os.path.join(dirpath, file),
58+
max_size=max_size,
59+
use_buffer=use_buffer,
60+
)
61+
for file in filenames
5362
]
5463

5564
result = sorted(result, key=lambda k: k["path"])
@@ -64,15 +73,17 @@ def scan_directory(self, location: str):
6473
"duration": (end_time - start_time).total_seconds(),
6574
"files_count": len(result),
6675
# ToDo: Add Error Expressions
67-
"errors": [],
76+
"errors": ['"' + location + '" does not exist']
77+
if not result
78+
else [],
6879
}
6980
],
7081
"files": result,
7182
}
7283

7384
return scan_result
7485

75-
def scan_file(self, location: str, max_size=50):
86+
def scan_file(self, location: str, max_size=10, use_buffer=False):
7687
"""
7788
Function to find valid license and copyright expressions in `location`.
7889
@@ -81,11 +92,15 @@ def scan_file(self, location: str, max_size=50):
8192
location : str
8293
Path to file.
8394
max_size : int
84-
Maximum size of file in MB. Default is set to 50MB. Set `max_size < 0` to ignore size constraints
95+
Maximum size of file in MB. Default is set to 10MB. Set `max_size < 0` to ignore size constraints
96+
use_buffer : bool
97+
`(Experimental)` Set `True` to use buffered file scanning. `max_size` will be used as buffer size. Recommended for large files.
8598
"""
8699
# ToDo: DS Marshalling
87100

88-
json_string = os.fsdecode(self._scanfile(os.fsencode(location), max_size))
101+
json_string = os.fsdecode(
102+
self.__scanfile(os.fsencode(location), max_size, use_buffer)
103+
)
89104

90105
scan_result = json.loads(json_string)
91106

@@ -141,7 +156,7 @@ def scan_file(self, location: str, max_size=50):
141156
"BSD-4-Clause": ["bsd-original", "Permissive"],
142157
"BSD-Protection": ["bsd-protection", "Copyleft"],
143158
"BSL-1.0": ["boost-1.0", "Permissive"],
144-
"BabelstoneIDS": ["BabelstoneIDS", ""],
159+
"BabelstoneIDS": ["BabelstoneIDS", "Public Domain"],
145160
"Beerware": ["beerware", "Permissive"],
146161
"BitTorrent-1.1": ["bittorrent-1.1", "Copyleft Limited"],
147162
"Business-Source-License-1.1": ["bsl-1.1", "Source-available"],

src/LicenseClassifier/compiled/libmatch.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ extern "C" {
6969
#endif
7070

7171
extern void CreateClassifier(char* license, GoFloat64 defaultThreshold);
72-
extern char* ScanFile(char* fpaths, GoInt maxSize);
72+
extern char* ScanFile(char* fpaths, GoInt maxSize, GoUint8 useBuffer);
7373

7474
#ifdef __cplusplus
7575
}
10.1 KB
Binary file not shown.

0 commit comments

Comments
 (0)